Skip to content

Commit dae0e5e

Browse files
committed
"Bugfix error time double-counting, better coordinate algorithm"
1 parent bf667e9 commit dae0e5e

File tree

3 files changed

+34
-25
lines changed

3 files changed

+34
-25
lines changed

cmake/resilienceConfig.cmake.in

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ include("${CMAKE_CURRENT_LIST_DIR}/resilienceTargets.cmake")
88
# All options defined with kr_option are exposed to linking targets' CMakeLists.
# The two @...@ placeholders are substituted when this template is configured;
# they are consumed below as parallel lists (entry i of the first is the option
# name, entry i of the second is its value).
set(KR_EXPOSED_OPTIONS @KR_EXPOSED_OPTIONS@)
set(KR_EXPOSED_OPTION_VALUES @KR_EXPOSED_OPTION_VALUES@)

# Walk both lists in lockstep and re-create each option as a variable in the
# consuming project.
# NOTE: the keyword must be upper-case `IN`. CMake keywords are case-sensitive,
# and a lower-case `in` makes foreach() fall back to its plain item-list form,
# where OPT iterates over the literal words and VAL is never set.
# `IN ZIP_LISTS` with one loop variable per list requires CMake >= 3.17.
foreach (OPT VAL IN ZIP_LISTS KR_EXPOSED_OPTIONS KR_EXPOSED_OPTION_VALUES)
  set(${OPT} ${VAL})
endforeach()
1414

1515
# VeloC needs to add a cmake config...

src/resilience/exec_space/openMP/Resilient_OpenMP_Error_Injector.hpp

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -59,29 +59,27 @@ struct Error{
5959
inline std::optional<Error> global_error_settings;
6060

6161
// Process-wide bookkeeping shared by the error-injection routines.
// Every member is an inline static, so there is exactly one instance per
// program. The struct provides no synchronization of its own.
struct ErrorInjectionTracking{
  // Number of values overwritten so far; incremented once per injected error
  // and reported by print_total_error_time().
  inline static std::size_t error_counter = 0;

  // Deterministically seeded (seed 0) generator used both for the injected
  // noise values and for drawing the gap to the next injection.
  inline static std::mt19937 random_gen{0};

  // Linear index at which the next injection pass starts.
  inline static std::size_t global_next_inject = 0;

  // Duration of the most recent injection pass. Despite the name this is
  // stored in nanoseconds. std::chrono::nanoseconds is used instead of
  // duration<long int, std::nano> because `long` is only 32 bits wide on
  // LLP64 platforms, where a nanosecond count overflows after ~2.1 seconds.
  inline static std::chrono::nanoseconds elapsed_seconds{};

  // Running sum of all injection-pass durations, in nanoseconds.
  inline static std::chrono::nanoseconds total_error_time{};
};
6968

7069
// Calculates coordinate formulas from linear iterator
7170
template< typename View>
7271
auto get_inject_indices_array( const View &view, std::size_t next_inject ){
7372

7473
std::array<std::size_t, 8> indices {};
75-
size_t dim_product = 1;
74+
size_t next_inject_copy = next_inject;
7675

7776
// View.extent() returns 1 for uninitialized dimensions
78-
// this array returns accurate coordinates up to the existing view rank
79-
// coordinates past rank are inaccurate, but are truncated by view.access() in the main injector
80-
indices[0] = next_inject % view.extent(0);
81-
82-
for(int i=1;i<8;i++){
83-
indices[i] = ((next_inject - (indices[i-1] * dim_product)) / (dim_product * view.extent(i-1) )) % view.extent(i);
84-
dim_product = dim_product * view.extent(i-1);
77+
// this array returns accurate coordinates up to the existing view rank
78+
// and zero for the rest, which are truncated by view.access() in the main injector
79+
// assumes column-major (Fortran) ordering
80+
for(int i=0;i<8;i++){
81+
indices[i] = next_inject_copy % view.extent(i);
82+
next_inject_copy /= view.extent(i);
8583
}
8684

8785
return indices;
@@ -104,6 +102,7 @@ void error_injection(View& original, View& copy_0, View& copy_1)
104102

105103
size_t next_inject = ErrorInjectionTracking::global_next_inject;
106104
std::array<size_t, 8> indices {};
105+
//auto access = std::mem_fn(&View::access);
107106

108107
for (int j = 0; j<=2; j++){
109108
while (next_inject < total_extent)
@@ -112,19 +111,28 @@ void error_injection(View& original, View& copy_0, View& copy_1)
112111
if (j==0){//Inject in the original if j is 0
113112
//replace value with noise
114113
original.access(indices[0],indices[1],indices[2],indices[3],indices[4],indices[5],indices[6],indices[7])
115-
= static_cast<typename View::value_type>(ErrorInjectionTracking::random_gen());
114+
//Incorrect because access expects individual indices, not a tuple.
115+
//Hence the need to use apply with a tuple
116+
//access(original, indices)
117+
//auto tuple = std::make_tuple(original, indices);
118+
//std::apply(access, tuple)
119+
= static_cast<typename View::value_type>(ErrorInjectionTracking::random_gen());
116120
ErrorInjectionTracking::error_counter++;
117121
}
122+
//#if 0
118123
else if(j==1){//Else inject in one of the other two copies, copy[0]
119124
copy_0.access(indices[0],indices[1],indices[2],indices[3],indices[5],indices[5],indices[6],indices[7])
120-
= static_cast<typename View::value_type>(ErrorInjectionTracking::random_gen());
125+
//access(copy_0, indices)
126+
= static_cast<typename View::value_type>(ErrorInjectionTracking::random_gen());
121127
ErrorInjectionTracking::error_counter++;
122128
}
123129
else{//or copy[1]
124130
copy_1.access(indices[0],indices[1],indices[2],indices[3],indices[5],indices[5],indices[6],indices[7])
125-
= static_cast<typename View::value_type>(ErrorInjectionTracking::random_gen());
131+
//access(copy_1, indices)
132+
= static_cast<typename View::value_type>(ErrorInjectionTracking::random_gen());
126133
ErrorInjectionTracking::error_counter++;
127134
}
135+
//#endif
128136
next_inject = global_error_settings->geometric(ErrorInjectionTracking::random_gen)+next_inject+1;
129137
}
130138
if(total_extent != 1){
@@ -137,10 +145,11 @@ void error_injection(View& original, View& copy_0, View& copy_1)
137145
// Prints the accumulated error-injection time (in nanoseconds) and the total
// number of injected errors to std::cout.
//
// The function-local mutex serializes concurrent callers so the two report
// lines are not interleaved. It only guards this function's body; it does not
// protect the ErrorInjectionTracking counters against concurrent writers.
KOKKOS_INLINE_FUNCTION
void print_total_error_time() {

  static std::mutex global_time_mutex;
  // RAII guard: the mutex is released on every exit path, including when a
  // stream operation throws, which a manual lock()/unlock() pair would not do.
  const std::lock_guard<std::mutex> guard(global_time_mutex);
  std::cout << "The value of ErrorInjectionTracking::total_error_time.count() is " << ErrorInjectionTracking::total_error_time.count() << " nanoseconds." << std::endl;
  std::cout << "The total number of errors inserted is " << ErrorInjectionTracking::error_counter << " errors." << std::endl;

}
146155

src/resilience/exec_space/openMP/Resilient_OpenMP_Parallel_For.hpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -212,8 +212,8 @@ class ParallelFor< FunctorType
212212
const auto start{std::chrono::steady_clock::now()};
213213
KokkosResilience::inject_error_duplicates();
214214
const auto stop{std::chrono::steady_clock::now()};
215-
KokkosResilience::ErrorInjectionTracking::elapsed_seconds = KokkosResilience::ErrorInjectionTracking::elapsed_seconds + (std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start));
216-
KokkosResilience::ErrorInjectionTracking::total_error_time = KokkosResilience::ErrorInjectionTracking::total_error_time + KokkosResilience::ErrorInjectionTracking::elapsed_seconds;
215+
KokkosResilience::ErrorInjectionTracking::elapsed_seconds = (std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start));
216+
KokkosResilience::ErrorInjectionTracking::total_error_time += KokkosResilience::ErrorInjectionTracking::elapsed_seconds;
217217

218218
// Combine the duplicate views and majority vote on correctness
219219
success = KokkosResilience::combine_resilient_duplicates();
@@ -241,8 +241,8 @@ class ParallelFor< FunctorType
241241
const auto start{std::chrono::steady_clock::now()};
242242
KokkosResilience::inject_error_duplicates();
243243
const auto stop{std::chrono::steady_clock::now()};
244-
KokkosResilience::ErrorInjectionTracking::elapsed_seconds = KokkosResilience::ErrorInjectionTracking::elapsed_seconds + (std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start));
245-
KokkosResilience::ErrorInjectionTracking::total_error_time = KokkosResilience::ErrorInjectionTracking::total_error_time + KokkosResilience::ErrorInjectionTracking::elapsed_seconds;
244+
KokkosResilience::ErrorInjectionTracking::elapsed_seconds = (std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start));
245+
KokkosResilience::ErrorInjectionTracking::total_error_time += KokkosResilience::ErrorInjectionTracking::elapsed_seconds;
246246

247247
// Combine the duplicate views, majority vote not triggered due to CMAKE macro
248248
success = KokkosResilience::combine_resilient_duplicates();
@@ -259,8 +259,8 @@ class ParallelFor< FunctorType
259259
start=std::chrono::steady_clock::now();
260260
KokkosResilience::inject_error_duplicates();
261261
stop=std::chrono::steady_clock::now();
262-
KokkosResilience::ErrorInjectionTracking::elapsed_seconds = KokkosResilience::ErrorInjectionTracking::elapsed_seconds + (std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start));
263-
KokkosResilience::ErrorInjectionTracking::total_error_time = KokkosResilience::ErrorInjectionTracking::total_error_time + KokkosResilience::ErrorInjectionTracking::elapsed_seconds;
262+
KokkosResilience::ErrorInjectionTracking::elapsed_seconds = (std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start));
263+
KokkosResilience::ErrorInjectionTracking::total_error_time += KokkosResilience::ErrorInjectionTracking::elapsed_seconds;
264264

265265
success = KokkosResilience::combine_resilient_duplicates();
266266
KokkosResilience::clear_duplicates_map();
@@ -280,8 +280,8 @@ class ParallelFor< FunctorType
280280
const auto start{std::chrono::steady_clock::now()};
281281
KokkosResilience::inject_error_duplicates();
282282
const auto stop{std::chrono::steady_clock::now()};
283-
KokkosResilience::ErrorInjectionTracking::elapsed_seconds = KokkosResilience::ErrorInjectionTracking::elapsed_seconds + (std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start));
284-
KokkosResilience::ErrorInjectionTracking::total_error_time = KokkosResilience::ErrorInjectionTracking::total_error_time + KokkosResilience::ErrorInjectionTracking::elapsed_seconds;
283+
KokkosResilience::ErrorInjectionTracking::elapsed_seconds = (std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start));
284+
KokkosResilience::ErrorInjectionTracking::total_error_time += KokkosResilience::ErrorInjectionTracking::elapsed_seconds;
285285

286286
// Combine the duplicate views and majority vote on correctness
287287
success = KokkosResilience::combine_resilient_duplicates();

0 commit comments

Comments
 (0)