Skip to content

Commit ff24fe1

Browse files
authored
Merge pull request #97 from kokkos/elisabethgiem/change-tmr-cmake-option
Change TMR/DMR cmake option
2 parents 2ebbf7f + 645891b commit ff24fe1

File tree

5 files changed

+25
-23
lines changed

5 files changed

+25
-23
lines changed

CMakeLists.txt

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,16 @@ kr_option(KR_ENABLE_HDF5_PARALLEL "use parallel version of HDF5" OFF KR_ENABLE_H
8888
#Exec space options
8989
kr_option(KR_ENABLE_EXEC_SPACES "enable resilient execution spaces" OFF)
9090
kr_option(KR_ENABLE_OPENMP_EXEC_SPACE "enable the resilient OpenMP execution space" ON "KR_ENABLE_EXEC_SPACES;KR_OPENMP_DEVICE_ENABLED")
91-
kr_option(KR_ENABLE_DMR "enable double modular redundancy" OFF "KR_ENABLE_OPENMP_EXEC_SPACE")
92-
kr_option(KR_ENABLE_TMR "enable triple modular redundancy" ON "KR_ENABLE_OPENMP_EXEC_SPACE;NOT KR_ENABLE_DMR")
93-
kr_option(KR_ENABLE_WRAPPER "enable kernel fusing" OFF "KR_ENABLE_OPENMP_EXEC_SPACE")
91+
if(KR_ENABLE_OPENMP_EXEC_SPACE)
92+
set(KR_MODULAR_REDUNDANCY TRIPLE CACHE STRING "choose level of modular redundancy")
93+
set_property(CACHE KR_MODULAR_REDUNDANCY PROPERTY STRINGS OFF DOUBLE TRIPLE)
94+
if (KR_MODULAR_REDUNDANCY STREQUAL TRIPLE)
95+
target_compile_definitions(resilience PUBLIC KR_TRIPLE_MODULAR_REDUNDANCY)
96+
elseif (KR_MODULAR_REDUNDANCY STREQUAL DOUBLE)
97+
target_compile_definitions(resilience PUBLIC KR_DOUBLE_MODULAR_REDUNDANCY)
98+
endif()
99+
endif()
100+
kr_option(KR_KERNEL_FUSING "enable kernel fusing" OFF "KR_ENABLE_OPENMP_EXEC_SPACE;KR_MODULAR_REDUNDANCY")
94101

95102
# VeloC backend
96103
if (KR_ENABLE_VELOC_BACKEND)

src/resilience/exec_space/openMP/Resilient_OpenMP_Parallel_For.hpp

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ class ParallelFor< FunctorType
120120

121121
using surrogate_policy = Kokkos::RangePolicy < Kokkos::OpenMP, WorkTag, LaunchBounds>;
122122

123-
#ifdef KR_ENABLE_WRAPPER
123+
#ifdef KR_KERNEL_FUSING
124124
auto make_wrapper (int64_t work_size, int64_t offset, const FunctorType &functor_copy_0, const FunctorType &functor_copy_1) const{
125125
if constexpr (std::is_void_v<WorkTag>){
126126
auto wrapper_functor = [&, work_size, offset](int64_t i){
@@ -173,7 +173,8 @@ class ParallelFor< FunctorType
173173
bool success = 0; //! This bool indicates that all views successfully reached a consensus.
174174

175175
surrogate_policy wrapper_policy;
176-
#ifdef KR_ENABLE_TMR
176+
177+
#if KR_TRIPLE_MODULAR_REDUNDANCY
177178
wrapper_policy = surrogate_policy(m_policy.begin(), m_policy.end());
178179
// Trigger Subscriber constructors
179180
KokkosResilience::ResilientDuplicatesSubscriber::in_resilient_parallel_loop = true;
@@ -182,7 +183,7 @@ class ParallelFor< FunctorType
182183
KokkosResilience::ResilientDuplicatesSubscriber::in_resilient_parallel_loop = false;
183184
#endif
184185

185-
#ifdef KR_ENABLE_WRAPPER
186+
#ifdef KR_KERNEL_FUSING
186187
auto work_size = m_policy.end() - m_policy.begin();
187188
auto offset = m_policy.begin();
188189
wrapper_policy = surrogate_policy(0, 3 * work_size );
@@ -197,10 +198,8 @@ class ParallelFor< FunctorType
197198

198199
#endif
199200

200-
#ifdef KR_ENABLE_TMR
201-
201+
#ifdef KR_TRIPLE_MODULAR_REDUNDANCY
202202
// TMR execution with no wrapper scheduling
203-
204203
Impl::ParallelFor< decltype(m_functor) , surrogate_policy, Kokkos::OpenMP > closure0( m_functor , wrapper_policy );
205204
Impl::ParallelFor< decltype(m_functor) , surrogate_policy, Kokkos::OpenMP > closure1( functor_copy_0 , wrapper_policy );
206205
Impl::ParallelFor< decltype(m_functor) , surrogate_policy, Kokkos::OpenMP > closure2( functor_copy_1 , wrapper_policy );
@@ -223,8 +222,7 @@ class ParallelFor< FunctorType
223222

224223
#endif
225224

226-
#ifdef KR_ENABLE_DMR
227-
225+
#ifdef KR_DOUBLE_MODULAR_REDUNDANCY
228226
//DMR with failover to TMR on error
229227
wrapper_policy = surrogate_policy(m_policy.begin(), m_policy.end());
230228

@@ -270,7 +268,8 @@ class ParallelFor< FunctorType
270268
}
271269
KokkosResilience::clear_duplicates_map();
272270
#endif
273-
#ifdef KR_ENABLE_WRAPPER
271+
272+
// Guard name must match the KR_KERNEL_FUSING option declared in
// CMakeLists.txt and the other KR_KERNEL_FUSING guards in this file;
// a misspelled macro would silently compile this branch out.
#ifdef KR_KERNEL_FUSING
274273

275274
// TMR with kernel fusion
276275
// Functor is fused, with iterations bound to duplicated functors in 3x range

src/resilience/exec_space/openMP/Resilient_OpenMP_Subscriber.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
namespace KokkosResilience {
5050

5151
bool ResilientDuplicatesSubscriber::in_resilient_parallel_loop = false;
52-
#ifdef KR_ENABLE_DMR
52+
#ifdef KR_DOUBLE_MODULAR_REDUNDANCY
5353
bool ResilientDuplicatesSubscriber::dmr_failover_to_tmr = false;
5454
#endif
5555
}

src/resilience/exec_space/openMP/Resilient_OpenMP_Subscriber.hpp

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ struct CombineDuplicates: public CombineDuplicatesBase
208208
bool execute() override
209209
{
210210
success() = 1;
211-
#ifdef KR_ENABLE_DMR
211+
#ifdef KR_DOUBLE_MODULAR_REDUNDANCY
212212
if (duplicate_count < 1){
213213
Kokkos::abort("Aborted in CombineDuplicates, no duplicate created");
214214
}
@@ -238,7 +238,7 @@ struct CombineDuplicates: public CombineDuplicatesBase
238238
KOKKOS_INLINE_FUNCTION
239239
void operator ()(Args&&... its) const{ //function parameter pack
240240

241-
#ifdef KR_ENABLE_DMR
241+
#ifdef KR_DOUBLE_MODULAR_REDUNDANCY
242242
//Indicates dmr_failover_to_tmr tripped
243243
if(duplicate_count == 2 ){
244244
//Main combiner begin, dmr failover has tripped into TMR
@@ -280,13 +280,12 @@ struct CombineDuplicates: public CombineDuplicatesBase
280280

281281
void inject_error() override
282282
{
283-
#ifdef KR_ENABLE_TMR
283+
#ifdef KR_TRIPLE_MODULAR_REDUNDANCY
284284
//Any-dimensional TMR error injector
285285
size_t total_extent = 1;
286286
for(int i=0; i<= (int)rank; i++){
287287
total_extent = total_extent * original.extent(i);
288288
}
289-
290289
//requires error in range, unless view size too small
291290
if (total_extent !=1 && (ErrorInject::global_next_inject > total_extent))
292291
{
@@ -337,7 +336,7 @@ struct ResilientDuplicatesSubscriber {
337336
// Gating for using subscriber only inside resilient parallel loops
338337
static bool in_resilient_parallel_loop;
339338

340-
#ifdef KR_ENABLE_DMR
339+
#ifdef KR_DOUBLE_MODULAR_REDUNDANCY
341340
static bool dmr_failover_to_tmr;
342341

343342
#endif
@@ -382,7 +381,7 @@ struct ResilientDuplicatesSubscriber {
382381
if (inserted || extents_resized) {
383382
res.original = original;
384383

385-
#ifdef KR_ENABLE_DMR
384+
#ifdef KR_DOUBLE_MODULAR_REDUNDANCY
386385
if (dmr_failover_to_tmr){
387386
// Create second copy
388387
set_duplicate_view(res.copy[1], original, 1);

tests/TestOpenMPResilientExecution.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -102,11 +102,9 @@ TEST(TestResOpenMP, TestResilientForDouble)
102102
{
103103

104104
KokkosResilience::global_error_settings = KokkosResilience::Error(0.001);
105-
106105
// Allocate y, x vectors.
107106
ResilientView<double*> y( "y", N );
108107
ResilientView<double*> x( "x", N );
109-
110108
// Length-1 integer vector used to count data accesses (scalar views were previously buggy)
111109
ResilientView<int*> counter( "DataAccesses", 1);
112110

@@ -117,11 +115,10 @@ TEST(TestResOpenMP, TestResilientForDouble)
117115
y ( i ) = i;
118116
Kokkos::atomic_inc(&counter(0));
119117
});
120-
121118
//reset global error settings
119+
KokkosResilience::print_total_error_time();
122120
KokkosResilience::ErrorInject::error_counter=0;
123121
KokkosResilience::global_error_settings.reset();
124-
KokkosResilience::print_total_error_time();
125122
KokkosResilience::clear_duplicates_cache();
126123

127124
Kokkos::deep_copy(x, y);

0 commit comments

Comments
 (0)