15
15
16
16
import io
17
17
import logging
18
+ import random
18
19
import threading
19
-
20
+ import time
20
21
from collections import defaultdict
21
22
22
23
from horovod .runner .common .util import safe_shell_exec
23
24
from horovod .runner .elastic .worker import HostUpdateResult
24
25
26
# Floor, in seconds, for a configurable cooldown period: a caller-supplied
# cooldown range is rejected when its lower limit is below this value.
DEFAULT_COOLDOWN_LOWER_LIMIT_SECONDS = 1
# Ceiling, in seconds (one hour), for a configurable cooldown period: a
# caller-supplied cooldown range is rejected when its upper limit exceeds it.
DEFAULT_COOLDOWN_UPPER_LIMIT_SECONDS = 60 * 60
25
32
26
33
class HostState (object ):
27
- def __init__ (self ):
34
+
35
+ def __init__ (self , cooldown_range = None ):
28
36
self ._event = threading .Event ()
29
37
30
- # TODO(travis): blacklisted hosts should have a timeout period that increases with each failure
31
38
self ._blacklisted = False
39
+ self ._blacklist_count = 0
40
+ if cooldown_range :
41
+ HostState ._validate_cooldown_range (cooldown_range )
42
+ self ._cooldown_lower_limit , self ._cooldown_upper_limit = cooldown_range
43
+ else :
44
+ self ._cooldown_lower_limit = - 1
45
+ self ._cooldown_upper_limit = - 1
46
+ self ._cooldown_period_end_ts = 0
47
+
48
@staticmethod
def _validate_cooldown_range(cooldown_range):
    """Check a cooldown range against the module-wide default bounds.

    Args:
        cooldown_range: two-item iterable ``(lower, upper)`` of cooldown
            limits in seconds.

    Raises:
        ValueError: if the lower limit is below
            ``DEFAULT_COOLDOWN_LOWER_LIMIT_SECONDS`` or the upper limit is
            above ``DEFAULT_COOLDOWN_UPPER_LIMIT_SECONDS``.

    Note:
        The relative order of the two limits (lower <= upper) is not
        checked here.
    """
    cooldown_lower_limit, cooldown_upper_limit = cooldown_range

    # The messages are assembled from adjacent literals rather than a
    # backslash continuation inside one literal, so source indentation
    # can never leak into the text of the exception.
    if cooldown_lower_limit < DEFAULT_COOLDOWN_LOWER_LIMIT_SECONDS:
        raise ValueError(
            f"Provided cooldown lower limit: {cooldown_lower_limit} "
            f"cannot be lower than default cooldown lower limit: "
            f"{DEFAULT_COOLDOWN_LOWER_LIMIT_SECONDS}")

    if cooldown_upper_limit > DEFAULT_COOLDOWN_UPPER_LIMIT_SECONDS:
        raise ValueError(
            f"Provided cooldown upper limit: {cooldown_upper_limit} "
            f"cannot be higher than default cooldown upper limit: "
            f"{DEFAULT_COOLDOWN_UPPER_LIMIT_SECONDS}")
32
60
33
61
def get_event (self ):
34
62
if self ._event .is_set ():
@@ -39,13 +67,48 @@ def get_event(self):
39
67
def set_event (self ):
40
68
self ._event .set ()
41
69
70
+ def _in_cooldown_period (self , current_time ):
71
+ return self ._cooldown_period_end_ts > current_time
72
+
73
+
74
+ def _set_cooldown_period (self , current_time ):
75
+ if self ._cooldown_lower_limit == - 1 or self ._cooldown_upper_limit == - 1 :
76
+ return
77
+ self ._blacklist_count += 1
78
+
79
+ cooldown_delay = self ._cooldown_lower_limit * (1 << self ._blacklist_count ) + (random .uniform (0 ,1 ) * self ._cooldown_lower_limit )
80
+ logging .debug (f"{ self ._blacklist_count } :{ self ._cooldown_period_end_ts } cooldown_delay: { cooldown_delay } " )
81
+ # We need to ensure that the cooldown upper limit is the upper bound of the delay
82
+ cooldown_delta_seconds = max (self ._cooldown_lower_limit , min (self ._cooldown_upper_limit , cooldown_delay ))
83
+
84
+ self ._cooldown_period_end_ts = current_time + cooldown_delta_seconds
85
+ logging .debug (f"cooldown delta seconds: { cooldown_delta_seconds } " )
86
+
42
87
def blacklist (self ):
88
+ """Moves this host to a blacklist, and starts the cooldown period."""
43
89
self ._blacklisted = True
90
+ now = time .time ()
91
+ if self ._in_cooldown_period (now ):
92
+ return
93
+ self ._set_cooldown_period (now )
44
94
self .set_event ()
45
95
96
+ def whitelist (self ):
97
+ """Ends the cooldown period and moves this host out of blacklist."""
98
+ self ._cooldown_period_end_ts = 0
99
+ self ._blacklisted = False
100
+
46
101
def is_blacklisted (self ):
102
+ """Checks if the host is in the blacklist."""
47
103
return self ._blacklisted
48
104
105
+ def is_resurrected (self ):
106
+ """Checks if host is in an expired cooldown period."""
107
+ if self ._cooldown_period_end_ts > 0 :
108
+ return not self ._in_cooldown_period (time .time ())
109
+ return False
110
+
111
+
49
112
50
113
class DiscoveredHosts (object ):
51
114
def __init__ (self , host_slots , host_assignment_order ):
@@ -76,15 +139,17 @@ def update(self, hosts_state):
76
139
if not hosts_state [host ].is_blacklisted ()]
77
140
return self
78
141
142
+ def __str__ (self ):
143
+ return f"slots: { self ._host_slots } order: { self ._host_assignment_order } "
144
+
79
145
80
146
class HostManager (object ):
81
- def __init__ (self , discovery ):
147
def __init__(self, discovery, cooldown_range=None):
    """Create a host manager on top of a discovery object.

    Args:
        discovery: object used later to find the available hosts and slots.
        cooldown_range: optional cooldown limits handed to every
            ``HostState`` that gets created for a host.
    """
    def new_host_state():
        # Factory for the defaultdict: each host seen for the first time
        # gets its own state built with the shared cooldown range.
        return HostState(cooldown_range)

    self._current_hosts = DiscoveredHosts(host_slots={}, host_assignment_order=[])
    self._hosts_state = defaultdict(new_host_state)
    self._discovery = discovery
85
151
86
152
def update_available_hosts (self ):
87
- # TODO(travis): also check for hosts removed from the blacklist in the future
88
153
def check_update (cur_host_slots , prev_host_slots ):
89
154
res = HostUpdateResult .no_update
90
155
@@ -103,17 +168,32 @@ def check_update(cur_host_slots, prev_host_slots):
103
168
elif cur_host_slots [h ] < prev_host_slots [h ]:
104
169
# h has removed some slots
105
170
res |= HostUpdateResult .removed
171
+ elif self ._hosts_state [h ].is_resurrected ():
172
+ res |= HostUpdateResult .added
106
173
return res
107
174
108
175
prev_host_slots = self ._current_hosts .host_slots
109
176
prev_host_assignment_order = self ._current_hosts .host_assignment_order
110
177
host_slots = self ._discovery .find_available_hosts_and_slots ()
111
- if prev_host_slots != host_slots :
112
- available_hosts = set ([host for host in host_slots .keys () if not self ._hosts_state [host ].is_blacklisted ()])
178
+
179
+ def whitelist_all_hosts ():
180
+ for host in host_slots .keys ():
181
+ if self ._hosts_state [host ].is_resurrected ():
182
+ self ._hosts_state [host ].whitelist ()
183
+
184
+ def has_resurrected_hosts ():
185
+ resurrected_hosts = [host for host in host_slots .keys () if self ._hosts_state [host ].is_resurrected ()]
186
+ return len (resurrected_hosts ) > 0
187
+
188
+ if prev_host_slots != host_slots or has_resurrected_hosts ():
189
+ available_hosts = set ([host for host in host_slots .keys () \
190
+ if not (self ._hosts_state [host ].is_blacklisted () and not self ._hosts_state [host ].is_resurrected ())])
113
191
host_assignment_order = HostManager .order_available_hosts (available_hosts , prev_host_assignment_order )
114
192
self ._current_hosts = DiscoveredHosts (host_slots = host_slots ,
115
193
host_assignment_order = host_assignment_order )
116
- return check_update (self ._current_hosts .host_slots , prev_host_slots )
194
+ host_update_state = check_update (self ._current_hosts .host_slots , prev_host_slots )
195
+ whitelist_all_hosts ()
196
+ return host_update_state
117
197
else :
118
198
return HostUpdateResult .no_update
119
199
@@ -123,7 +203,7 @@ def current_hosts(self):
123
203
124
204
def blacklist (self , host ):
125
205
if not self ._hosts_state [host ].is_blacklisted ():
126
- logging .warning ('blacklist failing host: {}' .format (host ))
206
+ logging .info ('blacklist failing host: {}' .format (host ))
127
207
self ._hosts_state [host ].blacklist ()
128
208
129
209
def is_blacklisted (self , host ):
0 commit comments