Skip to content

Commit 4b13683

Browse files
Style and small bugfixes
1 parent ce6b5da commit 4b13683

File tree

10 files changed

+252
-240
lines changed

10 files changed

+252
-240
lines changed

src/resilience/AutomaticCheckpoint.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,6 @@ namespace KokkosResilience
179179
/**
 * Checkpoint overload taking a user-supplied filter.
 *
 * Participates in overload resolution only when is_filter_v<FilterFunc> holds,
 * then hands every argument on to Detail::checkpoint_impl.
 *
 * @param ctx              context forwarded to the implementation
 * @param label            label forwarded to the implementation
 * @param iteration        iteration number forwarded to the implementation
 * @param fun              callable, perfectly forwarded
 * @param filter           filter object, perfectly forwarded
 * @param explicit_members registration infos passed through unchanged
 */
template< typename Context,
          typename F,
          typename FilterFunc,
          typename... T,
          std::enable_if_t<is_filter_v<FilterFunc>>* = nullptr>
void checkpoint( Context &ctx,
                 const std::string &label,
                 int iteration,
                 F &&fun,
                 FilterFunc &&filter,
                 Detail::RegInfo<T>... explicit_members)
{
  Detail::checkpoint_impl(
    ctx,
    label,
    iteration,
    std::forward< F >( fun ),
    std::forward< FilterFunc >( filter ),
    explicit_members...
  );
}
185184

src/resilience/backend/VelocBackend.cpp

Lines changed: 47 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -62,14 +62,17 @@ namespace KokkosResilience
6262
/**
 * Report a failed VeloC operation.
 *
 * Builds "<name> error: VELOC operation failed (<e>)", optionally followed by
 * " <file>:<line>", and writes it as one line to std::cerr. Despite its name
 * this function does not throw yet (see the TODO below).
 *
 * @param e    status value returned by the failing call
 * @param name text identifying the failing call; a null pointer is reported
 *             as "(unknown)" instead of being streamed (which is undefined)
 * @param file source file of the call site, or null to omit the location
 * @param line source line of the call site; only printed when file is non-null
 */
void veloc_internal_error_throw( int e, const char *name, const char *file, int line = 0 )
{
  std::ostringstream out;
  out << ( name ? name : "(unknown)" ) << " error: VELOC operation failed (" << e << ")";
  if ( file )
  {
    out << " " << file << ":" << line;
  }

  // TODO: implement exception class
  //Kokkos::Impl::throw_runtime_exception( out.str() );
  // In the meantime, we'll print at least
  out << "\n";
  std::cerr << out.str();
}
7477

7578
inline void veloc_internal_safe_call( int e, const char *name, const char *file, int line = 0 )
@@ -82,7 +85,7 @@ namespace KokkosResilience
8285
/**
 * Construct the memory backend.
 *
 * Stores the address of the context and the communicator, reads the
 * "backends" / "veloc" / "config" entry of the context configuration as a
 * string, and obtains the VeloC client for this communicator from it.
 */
VeloCMemoryBackend::VeloCMemoryBackend(ContextBase &ctx, MPI_Comm mpi_comm)
  : m_context(&ctx),
    m_mpi_comm(mpi_comm)
{
  const auto &veloc_config_path =
    m_context->config()["backends"]["veloc"]["config"].as< std::string >();

  veloc_client = veloc::get_client(m_mpi_comm, veloc_config_path);
}
8790

8891
VeloCMemoryBackend::~VeloCMemoryBackend()
@@ -205,104 +208,99 @@ namespace KokkosResilience
205208
// No-op, don't do anything
206209
}
207210

208-
/**
 * Construct the file backend.
 *
 * Stores the address of the context and the communicator, reads the
 * "backends" / "veloc" / "config" entry of the context configuration as a
 * string, and obtains the VeloC client for this communicator from it.
 *
 * NOTE(review): the header still declares this constructor with an
 * MPIContext< VeloCFileBackend > & first parameter; the declaration and this
 * definition must be brought into agreement.
 */
VeloCFileBackend::VeloCFileBackend(ContextBase& context, MPI_Comm mpi_comm)
  : m_context(&context), m_mpi_comm(mpi_comm) {  // m_context is a pointer: take the address
  const auto &vconf = m_context->config()["backends"]["veloc"]["config"].as< std::string >();
  veloc_client = veloc::get_client(m_mpi_comm, vconf);
}
213216

214217
/**
 * Destroy the file backend after waiting on the VeloC client for the
 * previously started checkpoint.
 */
VeloCFileBackend::~VeloCFileBackend()
{
  // Same wait that checkpoint() performs before starting a new checkpoint
  veloc_client->checkpoint_wait();
}
218221

219222
void
220223
VeloCFileBackend::checkpoint( const std::string &label, int version,
221224
std::unordered_set<Registration> &members )
222225
{
223226
// Wait for previous checkpoint to finish
224-
VELOC_SAFE_CALL( VELOC_Checkpoint_wait());
227+
VELOC_SAFE_CALL( veloc_client->checkpoint_wait() );
225228

226229
// Start new checkpoint
227-
VELOC_SAFE_CALL( VELOC_Checkpoint_begin( label.c_str(), version ));
228-
229-
char veloc_file_name[VELOC_MAX_NAME];
230-
231-
bool status = true;
232-
try
233-
{
234-
VELOC_SAFE_CALL( VELOC_Route_file( veloc_file_name, veloc_file_name ) );
230+
VELOC_SAFE_CALL( veloc_client->checkpoint_begin( label, version ) );
235231

236-
std::string fname( veloc_file_name );
232+
bool success = true;
233+
try {
234+
std::string fname = veloc_client->route_file(label);
237235
std::ofstream vfile( fname, std::ios::binary );
238236

239237
#ifdef KR_ENABLE_TRACING
240-
auto write_trace = Util::begin_trace< Util::TimingTrace< std::string > >( *m_context, "write" );
238+
auto write_trace = Util::begin_trace< Util::TimingTrace< std::string > >(
239+
*m_context, "write"
240+
);
241241
#endif
242-
for ( auto &&member : members )
243-
{
244-
status = member->serialize(vfile);
245-
if(!status) break;
242+
for ( auto& member : members ) {
243+
success = member.serialize(vfile);
244+
if(!success) break;
246245
}
247246
#ifdef KR_ENABLE_TRACING
248247
write_trace.end();
249248
#endif
250-
}
251-
catch ( ... )
252-
{
253-
status = false;
249+
} catch ( const std::exception& e){
250+
success = false;
251+
std::cerr << "VelocFileBackend::checkpoint error: " + e.what();
252+
} catch ( ... ) {
253+
success = false;
254+
std::cerr << "VelocFileBackend::checkpoint error: (unknown exception type)"
254255
}
255256

256-
VELOC_SAFE_CALL( VELOC_Checkpoint_end( status ));
257+
VELOC_SAFE_CALL( veloc_client->checkpoint_end(success) );
257258
}
258259

259260
bool
260261
VeloCFileBackend::restart_available( const std::string &label, int version )
261262
{
262-
int latest = VELOC_Restart_test( label.c_str(), 0 );
263-
264263
// res is < 0 if no versions available, else it is the latest version
265-
return version <= latest;
264+
return version <= latest_version(label);
266265
}
267266

268267
int VeloCFileBackend::latest_version( const std::string &label ) const noexcept
269268
{
270-
return VELOC_Restart_test( label.c_str(), 0 );
269+
return veloc_client->restart_test(label, 0);
271270
}
272271

273272
void VeloCFileBackend::restart( const std::string &label, int version,
274273
std::unordered_set<Registration> &members )
275274
{
276-
VELOC_SAFE_CALL( VELOC_Restart_begin( label.c_str(), version ));
275+
VELOC_SAFE_CALL( veloc_client->restart_begin( label, version ));
277276

278277
char veloc_file_name[VELOC_MAX_NAME];
279278

280-
bool status = true;
281-
try
282-
{
283-
VELOC_SAFE_CALL( VELOC_Route_file( veloc_file_name, veloc_file_name ) );
284-
printf( "restore file name: %s\n", veloc_file_name );
285-
286-
std::string fname( veloc_file_name );
279+
bool success = true;
280+
try {
281+
std::string fname = veloc_client->route_file(label);
287282
std::ifstream vfile( fname, std::ios::binary );
288283

289284
#ifdef KR_ENABLE_TRACING
290-
auto read_trace = Util::begin_trace< Util::TimingTrace< std::string > >( *m_context, "read" );
285+
auto read_trace = Util::begin_trace< Util::TimingTrace< std::string > >(
286+
*m_context, "read"
287+
);
291288
#endif
292-
for ( auto &&member : members )
293-
{
294-
status = member->deserialize(vfile);
295-
if(!status) break;
289+
for ( auto& member : members ) {
290+
success = member.deserialize(vfile);
291+
if(!success) break;
296292
}
297293
#ifdef KR_ENABLE_TRACING
298294
read_trace.end();
299295
#endif
300-
}
301-
catch ( ... )
302-
{
303-
status = false;
296+
} catch ( const std::exception& e ){
297+
success = false;
298+
std::cerr << "VelocFileBackend::restart error: " + e.what();
299+
} catch ( ... ) {
300+
success = false;
301+
std::cerr << "VelocFileBackend::checkpoint error: (unknown exception type)"
304302
}
305303

306-
VELOC_SAFE_CALL( VELOC_Restart_end( status ));
304+
VELOC_SAFE_CALL( veloc_client->restart_end(success) );
307305
}
308306
}

src/resilience/backend/VelocBackend.hpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ namespace KokkosResilience
128128
{
129129
public:
130130

131-
VeloCFileBackend( MPIContext< VeloCFileBackend > &ctx, MPI_Comm mpi_comm, const std::string &veloc_config);
131+
VeloCFileBackend( MPIContext< VeloCFileBackend > &ctx, MPI_Comm mpi_comm);
132132
~VeloCFileBackend();
133133

134134
void checkpoint( const std::string &label, int version,
@@ -141,6 +141,11 @@ namespace KokkosResilience
141141
std::unordered_set<Registration> &members );
142142

143143
void register_hashes( std::unordered_set<Registration> &members ) {} // Do nothing
144+
145+
veloc::client_t *veloc_client;
146+
147+
ContextBase *m_context;
148+
MPI_Comm m_mpi_comm;
144149
};
145150
}
146151

src/resilience/registration/Custom.hpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,14 @@
3939
* Questions? Contact Christian R. Trott ([email protected])
4040
*/
4141

42-
#ifndef _INC_RESILIENCE_REGISTRATION_CUSTOM_HPP
43-
#define _INC_RESILIENCE_REGISTRATION_CUSTOM_HPP
42+
#ifndef INC_RESILIENCE_REGISTRATION_CUSTOM_HPP
43+
#define INC_RESILIENCE_REGISTRATION_CUSTOM_HPP
4444

4545
#include "Registration.hpp"
4646

4747
namespace KokkosResilience::Detail {
48-
struct CustomRegistration : public RegistrationBase {
48+
class CustomRegistration : public RegistrationBase {
49+
public:
4950
CustomRegistration() = delete;
5051
CustomRegistration(serializer_t&& serializer, deserializer_t&& deserializer, const std::string name) :
5152
RegistrationBase(name),
@@ -79,4 +80,4 @@ namespace KokkosResilience::Detail {
7980
};
8081
}
8182

82-
#endif
83+
#endif //INC_RESILIENCE_REGISTRATION_CUSTOM_HPP

src/resilience/registration/Registration.cpp

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,38 @@
11
#include "Registration.hpp"
22
#include <string>
3-
#include <locale> //isalnum
3+
#include <locale>
44

55
namespace KokkosResilience {
66
namespace Detail {
77
/**
 * Return a copy of label in which every character that is not alphanumeric
 * has been replaced by an underscore.
 *
 * @param label text to sanitize (taken by value and modified in place)
 * @return the sanitized text; same length as the input
 */
std::string sanitized_label(std::string label){
  // If character is not alphanumeric, can only be an underscore.
  for(char& c : label){
    // isalnum requires a value representable as unsigned char; a plain char
    // may be negative for bytes above 0x7F, which is undefined behaviour
    if(!std::isalnum(static_cast<unsigned char>(c))) c = '_';
  }
  return label;
}

1515
/**
 * Hash a label into the range [0, INT_MAX).
 *
 * The hash is the sum over all positions i of name[i] * 67^i, reduced modulo
 * INT_MAX. C++ hashes are size_t, but these values are meant to be usable as
 * IDs for checkpointing libraries, which usually take integers, hence the
 * reduction.
 *
 * The position weight is a power of a prime larger than the 63 characters
 * allowed in a sanitized label ([a-zA-Z0-9_]), to try to make sure that e.g.
 * "ab" and "ba" hash differently.
 *
 * Everything is computed in exact integer arithmetic with a reduction after
 * every step, so long labels neither lose precision nor overflow.
 *
 * @param name label to hash
 * @return hash value, always smaller than INT_MAX
 */
size_t label_hash(const std::string& name) {
  // 67 is first prime number larger than 63
  const unsigned long long base = 67;
  const unsigned long long modulus = INT_MAX;

  unsigned long long hash = 0;
  unsigned long long position_val = 1;  // base^i mod modulus for the current i
  for(const char c : name){
    // Go through unsigned char so bytes above 0x7F do not become huge values
    const unsigned long long character_val = static_cast<unsigned char>(c);
    hash = (hash + (character_val * position_val) % modulus) % modulus;
    position_val = (position_val * base) % modulus;
  }
  return static_cast<size_t>(hash);
}
2537

2638
RegistrationBase::RegistrationBase(const std::string member_name) :

0 commit comments

Comments
 (0)