Skip to content

Commit bddcb9e

Browse files
tgaddair and EnricoMi authored
Initial bare-metal implementation of elastic mode for fault tolerance and auto-scaling (horovod#1849)
Signed-off-by: Travis Addair <[email protected]>
Co-authored-by: Enrico Minack <[email protected]>
1 parent f8fb21e commit bddcb9e

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

77 files changed

+6286
-2025
lines changed

.buildkite/gen-pipeline.sh

Lines changed: 39 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -8,22 +8,19 @@ repository=823773083436.dkr.ecr.us-east-1.amazonaws.com/buildkite
88

99
# list of all the tests
1010
tests=( \
11-
test-cpu-openmpi-py2_7-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_1-pyspark2_3_2 \
12-
test-cpu-openmpi-py3_6-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_1-pyspark2_3_2 \
1311
test-cpu-openmpi-py2_7-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_1-pyspark2_3_2 \
1412
test-cpu-openmpi-py3_6-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_1-pyspark2_3_2 \
1513
test-cpu-gloo-py2_7-tf1_15_0-keras2_3_1-torch1_4_0-mxnet1_5_0-pyspark2_4_0 \
1614
test-cpu-gloo-py3_6-tf1_15_0-keras2_3_1-torch1_4_0-mxnet1_5_0-pyspark2_4_0 \
17-
test-cpu-gloo-py3_7-tf1_15_0-keras2_3_1-torch1_4_0-mxnet1_5_0-pyspark2_4_0 \
15+
test-cpu-gloo-py3_7-tf2_2_0-keras2_3_1-torch1_5_0-mxnet1_5_0-pyspark2_4_0 \
1816
test-cpu-gloo-py3_8-tf2_2_0-keras2_3_1-torch1_5_0-mxnet1_5_0-pyspark2_4_0 \
1917
test-cpu-openmpi-py3_6-tf1_14_0-keras2_2_4-torch1_2_0-mxnet1_4_1-pyspark2_4_0 \
20-
test-cpu-openmpi-gloo-py3_6-tf1_14_0-keras2_3_1-torch1_3_0-mxnet1_4_1-pyspark2_4_0 \
2118
test-cpu-openmpi-py2_7-tf2_0_0-keras2_3_1-torch1_3_0-mxnet1_5_0-pyspark2_4_0 \
2219
test-cpu-openmpi-py3_6-tf2_0_0-keras2_3_1-torch1_3_0-mxnet1_5_0-pyspark2_4_0 \
2320
test-cpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0 \
24-
test-cpu-mpich-py3_6-tf1_14_0-keras2_3_1-torch1_3_0-mxnet1_5_0-pyspark2_4_0 \
25-
test-cpu-oneccl-py3_6-tf1_14_0-keras2_3_1-torch1_3_0-mxnet1_5_0-pyspark2_4_0 \
26-
test-cpu-oneccl-ofi-py3_6-tf1_14_0-keras2_3_1-torch1_3_0-mxnet1_5_0-pyspark2_4_0 \
21+
test-cpu-mpich-py3_6-tf1_15_0-keras2_3_1-torch1_3_0-mxnet1_5_0-pyspark2_4_0 \
22+
test-cpu-oneccl-py3_6-tf1_15_0-keras2_3_1-torch1_3_0-mxnet1_5_0-pyspark2_4_0 \
23+
test-cpu-oneccl-ofi-py3_6-tf1_15_0-keras2_3_1-torch1_3_0-mxnet1_5_0-pyspark2_4_0 \
2724
test-gpu-openmpi-py3_6-tf1_15_0-keras2_3_1-torch1_3_0-mxnet1_4_1-pyspark2_4_0 \
2825
test-gpu-gloo-py3_6-tf1_15_0-keras2_3_1-torch1_3_0-mxnet1_4_1-pyspark2_4_0 \
2926
test-gpu-openmpi-gloo-py3_6-tf1_15_0-keras2_3_1-torch1_3_0-mxnet1_4_1-pyspark2_4_0 \
@@ -98,15 +95,20 @@ run_mpi_pytest() {
9895
local oneccl_env=${3:-}
9996
oneccl_env=$(echo ${oneccl_env//:/ })
10097

101-
local exclude_keras_if_needed=""
98+
local exclude_keras=""
10299
if [[ ${test} == *"tf2_"* ]] || [[ ${test} == *"tfhead"* ]]; then
103100
# TODO: support for Keras + TF 2.0 and TF-Keras 2.0
104-
exclude_keras_if_needed="| sed 's/test_keras.py//g' | sed 's/test_tensorflow_keras.py//g'"
101+
exclude_keras="| sed 's/test_keras.py//g' | sed 's/test_tensorflow_keras.py//g'"
105102
else
106-
exclude_keras_if_needed="| sed 's/[a-z_]*tensorflow2[a-z_.]*//g'"
103+
exclude_keras="| sed 's/[a-z_]*tensorflow2[a-z_.]*//g'"
107104
fi
108105

109-
local exclude_interactiverun="| sed 's/test_interactiverun.py//g' | sed 's/test_spark_keras.py//g' | sed 's/test_spark_torch.py//g'"
106+
local exclude_elastic=""
107+
if [[ ${test} == *"py2_"* ]]; then
108+
exclude_elastic="| sed 's/test_elastic[a-z_.]*//g'"
109+
fi
110+
111+
local excluded_tests="| sed 's/test_interactiverun.py//g' | sed 's/test_spark_keras.py//g' | sed 's/test_spark_torch.py//g'"
110112

111113
# Spark and Run test does not need to be executed with horovodrun, but we still run it below.
112114
local exclude_standalone_test="| sed 's/test_spark.py//g' | sed 's/test_run.py//g'"
@@ -121,7 +123,7 @@ run_mpi_pytest() {
121123
# pytests have 4x GPU use cases and require a separate queue
122124
run_test "${test}" "${queue}" \
123125
":pytest: Run PyTests (${test})" \
124-
"bash -c \"${oneccl_env} cd /horovod/test && (echo test_*.py ${exclude_keras_if_needed} ${exclude_interactiverun} ${exclude_standalone_test} | xargs -n 1 \\\$(cat /mpirun_command) pytest -v --capture=no) && pytest --forked -v --capture=no ${standalone_tests}\""
126+
"bash -c \"${oneccl_env} cd /horovod/test && (echo test_*.py ${exclude_keras} ${exclude_elastic} ${excluded_tests} ${exclude_standalone_test} | xargs -n 1 \\\$(cat /mpirun_command) pytest -v --capture=no) && pytest --forked -v --capture=no ${standalone_tests}\""
125127
}
126128

127129
run_mpi_integration() {
@@ -156,7 +158,7 @@ run_mpi_integration() {
156158
fi
157159

158160
run_test "${test}" "${queue}" \
159-
":python: Test PyTorch MNIST (${test})" \
161+
":fire: Test PyTorch MNIST (${test})" \
160162
"bash -c \"${oneccl_env} \\\$(cat /mpirun_command) python /horovod/examples/pytorch_mnist.py\""
161163

162164
run_test "${test}" "${queue}" \
@@ -165,7 +167,7 @@ run_mpi_integration() {
165167

166168
# tests that should be executed only with the latest release since they don't test
167169
# a framework-specific functionality
168-
if [[ ${test} == *"tf1_14_0"* ]]; then
170+
if [[ ${test} == *"tf1_15_0"* ]]; then
169171
run_test "${test}" "${queue}" \
170172
":muscle: Test Stall (${test})" \
171173
"bash -c \"${oneccl_env} \\\$(cat /mpirun_command) python /horovod/test/test_stall.py\""
@@ -206,12 +208,17 @@ run_gloo_pytest() {
206208
local test=$1
207209
local queue=$2
208210

209-
local exclude_keras_if_needed=""
211+
local exclude_keras=""
210212
if [[ ${test} == *"tf2_"* ]] || [[ ${test} == *"tfhead"* ]]; then
211213
# TODO: support for Keras + TF 2.0 and TF-Keras 2.0
212-
exclude_keras_if_needed="| sed 's/test_keras.py//g' | sed 's/test_tensorflow_keras.py//g'"
214+
exclude_keras="| sed 's/test_keras.py//g' | sed 's/test_tensorflow_keras.py//g'"
213215
else
214-
exclude_keras_if_needed="| sed 's/[a-z_]*tensorflow2[a-z_.]*//g'"
216+
exclude_keras="| sed 's/[a-z_]*tensorflow2[a-z_.]*//g'"
217+
fi
218+
219+
local exclude_elastic=""
220+
if [[ ${test} == *"py2_"* ]]; then
221+
exclude_elastic="| sed 's/test_elastic[a-z_.]*//g'"
215222
fi
216223

217224
# These are tested as integration style tests.
@@ -229,7 +236,7 @@ run_gloo_pytest() {
229236

230237
run_test "${test}" "${queue}" \
231238
":pytest: Run PyTests (${test})" \
232-
"bash -c \"cd /horovod/test && (echo test_*.py ${exclude_keras_if_needed} ${excluded_tests} ${exclude_standalone_test} | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo pytest -v --capture=no) && pytest --forked -v --capture=no ${standalone_tests}\""
239+
"bash -c \"cd /horovod/test && (echo test_*.py ${exclude_keras} ${exclude_elastic} ${excluded_tests} ${exclude_standalone_test} | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo pytest -v --capture=no) && pytest --forked -v --capture=no ${standalone_tests}\""
233240
}
234241

235242
run_gloo_integration() {
@@ -256,12 +263,24 @@ run_gloo_integration() {
256263
fi
257264

258265
run_test "${test}" "${queue}" \
259-
":python: Test PyTorch MNIST (${test})" \
266+
":fire: Test PyTorch MNIST (${test})" \
260267
"horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch_mnist.py"
261268

262269
run_test "${test}" "${queue}" \
263270
":muscle: Test MXNet MNIST (${test})" \
264271
"horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet_mnist.py"
272+
273+
# Elastic
274+
if [[ ${test} == *"py3_"* ]]; then
275+
local elastic_tensorflow="test_elastic_tensorflow.py"
276+
if [[ ${test} == *"tf2_"* ]] || [[ ${test} == *"tfhead"* ]]; then
277+
elastic_tensorflow="test_elastic_tensorflow2.py"
278+
fi
279+
280+
run_test "${test}" "${queue}" \
281+
":factory: Elastic Tests (${test})" \
282+
"bash -c \"cd /horovod/test/integration && pytest -v --log-cli-level 10 --capture=no test_elastic_torch.py ${elastic_tensorflow}\""
283+
fi
265284
}
266285

267286
run_gloo() {
@@ -322,7 +341,7 @@ run_single_integration() {
322341
fi
323342

324343
run_test "${test}" "${queue}" \
325-
":python: Single PyTorch MNIST (${test})" \
344+
":fire: Single PyTorch MNIST (${test})" \
326345
"bash -c \"${oneccl_env} python /horovod/examples/pytorch_mnist.py --epochs 3\""
327346

328347
run_test "${test}" "${queue}" \

docker-compose.test.yml

Lines changed: 10 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -6,31 +6,6 @@ services:
66
dockerfile: Dockerfile.test.cpu
77
privileged: true
88
shm_size: 8gb
9-
test-cpu-openmpi-py2_7-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_1-pyspark2_3_2:
10-
extends: test-cpu-base
11-
build:
12-
args:
13-
MPI_KIND: OpenMPI
14-
PYTHON_VERSION: 2.7
15-
TENSORFLOW_PACKAGE: tensorflow==1.1.0
16-
KERAS_PACKAGE: keras==2.0.0
17-
PYTORCH_PACKAGE: torch==0.4.0
18-
TORCHVISION_PACKAGE: torchvision==0.2.2.post3
19-
MXNET_PACKAGE: mxnet==1.4.1
20-
PYSPARK_PACKAGE: pyspark==2.3.2
21-
test-cpu-openmpi-py3_6-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_1-pyspark2_3_2:
22-
extends: test-cpu-base
23-
build:
24-
args:
25-
UBUNTU_VERSION: 18.04
26-
MPI_KIND: OpenMPI
27-
PYTHON_VERSION: 3.6
28-
TENSORFLOW_PACKAGE: tensorflow==1.1.0
29-
KERAS_PACKAGE: keras==2.0.0
30-
PYTORCH_PACKAGE: torch==0.4.0
31-
TORCHVISION_PACKAGE: torchvision==0.2.2.post3
32-
MXNET_PACKAGE: mxnet==1.4.1
33-
PYSPARK_PACKAGE: pyspark==2.3.2
349
test-cpu-openmpi-py2_7-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_1-pyspark2_3_2:
3510
extends: test-cpu-base
3611
build:
@@ -81,16 +56,16 @@ services:
8156
TORCHVISION_PACKAGE: torchvision==0.5.0+cpu
8257
MXNET_PACKAGE: mxnet==1.5.0
8358
PYSPARK_PACKAGE: pyspark==2.4.0
84-
test-cpu-gloo-py3_7-tf1_15_0-keras2_3_1-torch1_4_0-mxnet1_5_0-pyspark2_4_0:
59+
test-cpu-gloo-py3_7-tf2_2_0-keras2_3_1-torch1_5_0-mxnet1_5_0-pyspark2_4_0:
8560
extends: test-cpu-base
8661
build:
8762
args:
8863
UBUNTU_VERSION: 18.04
8964
MPI_KIND: None
9065
PYTHON_VERSION: 3.7
91-
TENSORFLOW_PACKAGE: tensorflow-cpu==1.15.0
66+
TENSORFLOW_PACKAGE: tensorflow==2.2.0
9267
KERAS_PACKAGE: keras==2.3.1
93-
PYTORCH_PACKAGE: torch==1.4.0+cpu
68+
PYTORCH_PACKAGE: torch==1.5.0+cpu
9469
TORCHVISION_PACKAGE: torchvision==0.5.0+cpu
9570
MXNET_PACKAGE: mxnet==1.5.0
9671
PYSPARK_PACKAGE: pyspark==2.4.0
@@ -101,7 +76,7 @@ services:
10176
UBUNTU_VERSION: 18.04
10277
MPI_KIND: None
10378
PYTHON_VERSION: 3.8
104-
TENSORFLOW_PACKAGE: tensorflow==2.2.0rc3
79+
TENSORFLOW_PACKAGE: tensorflow==2.2.0
10580
KERAS_PACKAGE: keras==2.3.1
10681
PYTORCH_PACKAGE: torch==1.5.0+cpu
10782
TORCHVISION_PACKAGE: torchvision==0.5.0+cpu
@@ -120,19 +95,6 @@ services:
12095
TORCHVISION_PACKAGE: torchvision==0.4.1+cpu
12196
MXNET_PACKAGE: mxnet==1.4.1
12297
PYSPARK_PACKAGE: pyspark==2.4.0
123-
test-cpu-openmpi-gloo-py3_6-tf1_14_0-keras2_3_1-torch1_3_0-mxnet1_4_1-pyspark2_4_0:
124-
extends: test-cpu-base
125-
build:
126-
args:
127-
UBUNTU_VERSION: 18.04
128-
MPI_KIND: OpenMPI
129-
PYTHON_VERSION: 3.6
130-
TENSORFLOW_PACKAGE: tensorflow==1.14.0
131-
KERAS_PACKAGE: keras==2.3.1
132-
PYTORCH_PACKAGE: torch==1.3.0+cpu
133-
TORCHVISION_PACKAGE: torchvision==0.4.1+cpu
134-
MXNET_PACKAGE: mxnet==1.4.1
135-
PYSPARK_PACKAGE: pyspark==2.4.0
13698
test-cpu-openmpi-py2_7-tf2_0_0-keras2_3_1-torch1_3_0-mxnet1_5_0-pyspark2_4_0:
13799
extends: test-cpu-base
138100
build:
@@ -171,40 +133,40 @@ services:
171133
TORCHVISION_PACKAGE: torchvision==0.6.0.dev20200413
172134
MXNET_PACKAGE: mxnet-nightly
173135
PYSPARK_PACKAGE: pyspark==2.4.0
174-
test-cpu-mpich-py3_6-tf1_14_0-keras2_3_1-torch1_3_0-mxnet1_5_0-pyspark2_4_0:
136+
test-cpu-mpich-py3_6-tf1_15_0-keras2_3_1-torch1_3_0-mxnet1_5_0-pyspark2_4_0:
175137
extends: test-cpu-base
176138
build:
177139
args:
178140
UBUNTU_VERSION: 18.04
179141
MPI_KIND: MPICH
180142
PYTHON_VERSION: 3.6
181-
TENSORFLOW_PACKAGE: tensorflow==1.14.0
143+
TENSORFLOW_PACKAGE: tensorflow-cpu==1.15.0
182144
KERAS_PACKAGE: keras==2.3.1
183145
PYTORCH_PACKAGE: torch==1.3.0+cpu
184146
TORCHVISION_PACKAGE: torchvision==0.4.1+cpu
185147
MXNET_PACKAGE: mxnet==1.5.0
186148
PYSPARK_PACKAGE: pyspark==2.4.0
187-
test-cpu-oneccl-py3_6-tf1_14_0-keras2_3_1-torch1_3_0-mxnet1_5_0-pyspark2_4_0:
149+
test-cpu-oneccl-py3_6-tf1_15_0-keras2_3_1-torch1_3_0-mxnet1_5_0-pyspark2_4_0:
188150
extends: test-cpu-base
189151
build:
190152
args:
191153
UBUNTU_VERSION: 18.04
192154
MPI_KIND: ONECCL
193155
PYTHON_VERSION: 3.6
194-
TENSORFLOW_PACKAGE: tensorflow==1.14.0
156+
TENSORFLOW_PACKAGE: tensorflow-cpu==1.15.0
195157
KERAS_PACKAGE: keras==2.3.1
196158
PYTORCH_PACKAGE: torch==1.3.0+cpu
197159
TORCHVISION_PACKAGE: torchvision==0.4.1+cpu
198160
MXNET_PACKAGE: mxnet==1.5.0
199161
PYSPARK_PACKAGE: pyspark==2.4.0
200-
test-cpu-oneccl-ofi-py3_6-tf1_14_0-keras2_3_1-torch1_3_0-mxnet1_5_0-pyspark2_4_0:
162+
test-cpu-oneccl-ofi-py3_6-tf1_15_0-keras2_3_1-torch1_3_0-mxnet1_5_0-pyspark2_4_0:
201163
extends: test-cpu-base
202164
build:
203165
args:
204166
UBUNTU_VERSION: 18.04
205167
MPI_KIND: ONECCL
206168
PYTHON_VERSION: 3.6
207-
TENSORFLOW_PACKAGE: tensorflow==1.14.0
169+
TENSORFLOW_PACKAGE: tensorflow-cpu==1.15.0
208170
KERAS_PACKAGE: keras==2.3.1
209171
PYTORCH_PACKAGE: torch==1.3.0+cpu
210172
TORCHVISION_PACKAGE: torchvision==0.4.1+cpu

0 commit comments

Comments
 (0)