@@ -62,14 +62,17 @@ namespace KokkosResilience
62
62
/// Reports a failed VELOC operation.
///
/// Despite its name, this function does not throw yet (see the TODO below):
/// it formats a diagnostic and writes it to std::cerr.
///
/// @param e     error code returned by the failed operation; printed verbatim
/// @param name  textual name of the failed call; may be null
/// @param file  source file of the call site; location is omitted when null
/// @param line  source line of the call site, printed only together with `file`
void veloc_internal_error_throw( int e, const char *name, const char *file, int line = 0 )
{
  std::ostringstream out;

  // Streaming a null `const char *` is undefined behaviour, so guard `name`
  // the same way `file` is guarded below.
  out << ( name ? name : "(unknown)" ) << " error: VELOC operation failed ( " << e << " ) ";
  if ( file )
  {
    out << " " << file << " :" << line;
  }

  // TODO: implement exception class
  // Kokkos::Impl::throw_runtime_exception( out.str() );
  // In the meantime, we'll print at least
  out << " \n ";
  std::cerr << out.str();
}
74
77
75
78
inline void veloc_internal_safe_call ( int e, const char *name, const char *file, int line = 0 )
@@ -82,7 +85,7 @@ namespace KokkosResilience
82
85
// Builds the in-memory VeloC backend: remembers the owning context and the
// communicator, then obtains a VeloC client for that communicator using the
// value stored under the context's backends / veloc / config entry.
VeloCMemoryBackend::VeloCMemoryBackend( ContextBase &ctx, MPI_Comm mpi_comm )
    : m_context( &ctx ),
      m_mpi_comm( mpi_comm )
{
  const std::string client_config =
      m_context->config()[" backends"][" veloc"][" config"].as< std::string >();

  veloc_client = veloc::get_client( m_mpi_comm, client_config );
}
87
90
88
91
VeloCMemoryBackend::~VeloCMemoryBackend ()
@@ -205,104 +208,99 @@ namespace KokkosResilience
205
208
// No-op, don't do anything
206
209
}
207
210
208
- VeloCFileBackend::VeloCFileBackend (MPIContext<VeloCFileBackend> &,
209
- MPI_Comm mpi_comm,
210
- const std::string &veloc_config) {
211
- VELOC_SAFE_CALL ( VELOC_Init ( mpi_comm, veloc_config. c_str ()) );
211
+ VeloCFileBackend::VeloCFileBackend (ContextBase& context, MPI_Comm mpi_comm)
212
+ : m_context(context), m_mpi_comm( mpi_comm) {
213
+ const auto &vconf = m_context-> config ()[ " backends " ][ " veloc " ][ " config " ]. as < std::string >();
214
+ veloc_client = veloc::get_client (m_mpi_comm, vconf );
212
215
}
213
216
214
217
// Teardown: asks the VeloC client to wait (checkpoint_wait) before the backend
// is destroyed -- presumably so a checkpoint still in flight can complete.
// The call's result is not inspected.
VeloCFileBackend::~VeloCFileBackend() {
  veloc_client->checkpoint_wait();
}
218
221
219
222
void
220
223
VeloCFileBackend::checkpoint ( const std::string &label, int version,
221
224
std::unordered_set<Registration> &members )
222
225
{
223
226
// Wait for previous checkpoint to finish
224
- VELOC_SAFE_CALL ( VELOC_Checkpoint_wait () );
227
+ VELOC_SAFE_CALL ( veloc_client-> checkpoint_wait () );
225
228
226
229
// Start new checkpoint
227
- VELOC_SAFE_CALL ( VELOC_Checkpoint_begin ( label.c_str (), version ));
228
-
229
- char veloc_file_name[VELOC_MAX_NAME];
230
-
231
- bool status = true ;
232
- try
233
- {
234
- VELOC_SAFE_CALL ( VELOC_Route_file ( veloc_file_name, veloc_file_name ) );
230
+ VELOC_SAFE_CALL ( veloc_client->checkpoint_begin ( label, version ) );
235
231
236
- std::string fname ( veloc_file_name );
232
+ bool success = true ;
233
+ try {
234
+ std::string fname = veloc_client->route_file (label);
237
235
std::ofstream vfile ( fname, std::ios::binary );
238
236
239
237
#ifdef KR_ENABLE_TRACING
240
- auto write_trace = Util::begin_trace< Util::TimingTrace< std::string > >( *m_context, " write" );
238
+ auto write_trace = Util::begin_trace< Util::TimingTrace< std::string > >(
239
+ *m_context, " write"
240
+ );
241
241
#endif
242
- for ( auto &&member : members )
243
- {
244
- status = member->serialize (vfile);
245
- if (!status) break ;
242
+ for ( auto & member : members ) {
243
+ success = member.serialize (vfile);
244
+ if (!success) break ;
246
245
}
247
246
#ifdef KR_ENABLE_TRACING
248
247
write_trace.end ();
249
248
#endif
250
- }
251
- catch ( ... )
252
- {
253
- status = false ;
249
+ } catch ( const std::exception& e){
250
+ success = false ;
251
+ std::cerr << " VelocFileBackend::checkpoint error: " + e.what ();
252
+ } catch ( ... ) {
253
+ success = false ;
254
+ std::cerr << " VelocFileBackend::checkpoint error: (unknown exception type)"
254
255
}
255
256
256
- VELOC_SAFE_CALL ( VELOC_Checkpoint_end ( status ) );
257
+ VELOC_SAFE_CALL ( veloc_client-> checkpoint_end (success) );
257
258
}
258
259
259
260
bool
260
261
VeloCFileBackend::restart_available ( const std::string &label, int version )
261
262
{
262
- int latest = VELOC_Restart_test ( label.c_str (), 0 );
263
-
264
263
// res is < 0 if no versions available, else it is the latest version
265
- return version <= latest ;
264
+ return version <= latest_version (label) ;
266
265
}
267
266
268
267
int VeloCFileBackend::latest_version ( const std::string &label ) const noexcept
269
268
{
270
- return VELOC_Restart_test ( label. c_str () , 0 );
269
+ return veloc_client-> restart_test ( label, 0 );
271
270
}
272
271
273
272
void VeloCFileBackend::restart ( const std::string &label, int version,
274
273
std::unordered_set<Registration> &members )
275
274
{
276
- VELOC_SAFE_CALL ( VELOC_Restart_begin ( label. c_str () , version ));
275
+ VELOC_SAFE_CALL ( veloc_client-> restart_begin ( label, version ));
277
276
278
277
char veloc_file_name[VELOC_MAX_NAME];
279
278
280
- bool status = true ;
281
- try
282
- {
283
- VELOC_SAFE_CALL ( VELOC_Route_file ( veloc_file_name, veloc_file_name ) );
284
- printf ( " restore file name: %s\n " , veloc_file_name );
285
-
286
- std::string fname ( veloc_file_name );
279
+ bool success = true ;
280
+ try {
281
+ std::string fname = veloc_client->route_file (label);
287
282
std::ifstream vfile ( fname, std::ios::binary );
288
283
289
284
#ifdef KR_ENABLE_TRACING
290
- auto read_trace = Util::begin_trace< Util::TimingTrace< std::string > >( *m_context, " read" );
285
+ auto read_trace = Util::begin_trace< Util::TimingTrace< std::string > >(
286
+ *m_context, " read"
287
+ );
291
288
#endif
292
- for ( auto &&member : members )
293
- {
294
- status = member->deserialize (vfile);
295
- if (!status) break ;
289
+ for ( auto & member : members ) {
290
+ success = member.deserialize (vfile);
291
+ if (!success) break ;
296
292
}
297
293
#ifdef KR_ENABLE_TRACING
298
294
read_trace.end ();
299
295
#endif
300
- }
301
- catch ( ... )
302
- {
303
- status = false ;
296
+ } catch ( const std::exception& e ){
297
+ success = false ;
298
+ std::cerr << " VelocFileBackend::restart error: " + e.what ();
299
+ } catch ( ... ) {
300
+ success = false ;
301
+ std::cerr << " VelocFileBackend::checkpoint error: (unknown exception type)"
304
302
}
305
303
306
- VELOC_SAFE_CALL ( VELOC_Restart_end ( status ) );
304
+ VELOC_SAFE_CALL ( veloc_client-> restart_end (success) );
307
305
}
308
306
}
0 commit comments