
Commit eeca2c0

[ray integration] Initial Ray Integration with RayExecutor API (horovod#2218)
Signed-off-by: Richard Liaw <[email protected]>
Co-authored-by: Travis Addair <[email protected]>
1 parent 9464e20 commit eeca2c0

15 files changed: +974, -26 lines

.buildkite/gen-pipeline.sh

Lines changed: 3 additions & 3 deletions
@@ -104,7 +104,7 @@ run_mpi_pytest() {
     local excluded_tests="| sed 's/test_interactiverun.py//g' | sed 's/test_spark_keras.py//g' | sed 's/test_spark_torch.py//g'"
 
     # Spark and Run test does not need to be executed with horovodrun, but we still run it below.
-    local exclude_standalone_test="| sed 's/test_spark.py//g' | sed 's/test_run.py//g'"
+    local exclude_standalone_test="| sed 's/test_spark.py//g' | sed 's/test_run.py//g' | sed 's/test_ray.py//g'"
     local standalone_tests="test_spark.py test_run.py"
 
     # pytests have 4x GPU use cases and require a separate queue
@@ -209,8 +209,8 @@ run_gloo_pytest() {
     local excluded_tests="| sed 's/test_interactiverun.py//g' | sed 's/test_spark_keras.py//g' | sed 's/test_spark_torch.py//g'"
 
     # Spark and Run test does not need to be executed with horovodrun, but we still run it below.
-    local exclude_standalone_test="| sed 's/test_spark.py//g' | sed 's/test_run.py//g'"
-    local standalone_tests="test_spark.py test_run.py"
+    local exclude_standalone_test="| sed 's/test_spark.py//g' | sed 's/test_run.py//g' | sed 's/test_ray.py//g'"
+    local standalone_tests="test_spark.py test_run.py test_ray.py"
 
     run_test "${test}" "${queue}" \
         ":pytest: Run PyTests (${test})" \

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -18,6 +18,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 - Added `hvd.is_initialized()` method. ([#2020](https://github.com/horovod/horovod/pull/2020))
 
+- Added Ray integration. ([#2218](https://github.com/horovod/horovod/pull/2218))
+
 ### Changed
 
 - Moved `horovod.run.runner.run` to `horovod.run`. ([#2099](https://github.com/horovod/horovod/pull/2099))

Dockerfile.test.cpu

Lines changed: 2 additions & 2 deletions
@@ -169,9 +169,9 @@ RUN if [[ ${MPI_KIND} == "ONECCL" ]]; then \
         fi; \
         . /usr/local/oneccl/env/setvars.sh; \
         echo "pip install horovod, mpicxx is $(which mpicxx)"; \
-        pip install -v $(ls /horovod/dist/horovod-*.tar.gz)[spark]; \
+        pip install -v $(ls /horovod/dist/horovod-*.tar.gz)[spark,ray]; \
     else \
-        pip install -v $(ls /horovod/dist/horovod-*.tar.gz)[spark]; \
+        pip install -v $(ls /horovod/dist/horovod-*.tar.gz)[spark,ray]; \
     fi
 
 # Prefetch Spark MNIST dataset.

Dockerfile.test.gpu

Lines changed: 1 addition & 1 deletion
@@ -123,7 +123,7 @@ RUN if [[ ${MXNET_PACKAGE} == "mxnet-nightly" ]]; then \
 # Install Horovod.
 RUN cd /horovod && python setup.py sdist
 RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs && \
-    bash -c "${HOROVOD_BUILD_FLAGS} HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 pip install -v $(ls /horovod/dist/horovod-*.tar.gz)[spark]" && \
+    bash -c "${HOROVOD_BUILD_FLAGS} HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 pip install -v $(ls /horovod/dist/horovod-*.tar.gz)[spark,ray]" && \
     ldconfig
 
 # Hack for compatibility of MNIST example with TensorFlow 1.1.0.

docs/api.rst

Lines changed: 6 additions & 0 deletions
@@ -52,6 +52,12 @@ horovod.spark.common
 .. automodule:: horovod.spark.common.store
     :show-inheritance:
 
+.. _horovod_ray_api:
+
+horovod.ray
+-----------
+.. automodule:: horovod.ray
+
 horovod.run
 -------------
 .. automodule:: horovod.run

docs/index.rst

Lines changed: 4 additions & 2 deletions
@@ -118,15 +118,17 @@ Guides
    gpus_include
 
    conda_include
-
+
    docker_include
 
    spark_include
 
+   ray_include
+
    lsf_include
 
    tensor-fusion_include
-
+
    adasum_user_guide_include
 
    timeline_include

docs/mocks.py

Lines changed: 2 additions & 0 deletions
@@ -54,6 +54,8 @@ def _dummy():
     'pyspark.sql.functions',
     'pyspark.sql.types',
 
+    'ray',
+
     'tensorflow',
     'tensorflow.python',
     'tensorflow.python.framework',

docs/ray.rst

Lines changed: 155 additions & 0 deletions
@@ -0,0 +1,155 @@
.. inclusion-marker-start-do-not-remove

Horovod on Ray
==============

``horovod.ray`` allows users to leverage Horovod on `a Ray cluster <https://docs.ray.io/en/latest/cluster/index.html>`_.

Currently, the Ray + Horovod integration provides a :ref:`RayExecutor API <horovod_ray_api>`.

.. note:: The Ray + Horovod integration currently only supports a Gloo backend.

Installation
------------

Use the extra ``[ray]`` option to install Ray along with Horovod.

.. code-block:: bash

    $ HOROVOD_WITH_GLOO=1 ... pip install 'horovod[ray]'

See the Ray documentation for `advanced installation instructions <https://docs.ray.io/en/latest/installation.html>`_.
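A quick way to confirm the extra installed correctly is to import both packages (an illustrative check, not part of this commit):

.. code-block:: python

    # Both imports should succeed once `horovod[ray]` is installed;
    # `import horovod.ray` raises ImportError if Ray support is missing.
    import ray
    import horovod.ray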
Horovod Ray Job
---------------

The Horovod Ray integration offers a ``RayExecutor`` abstraction (:ref:`docs <horovod_ray_api>`),
which is a wrapper over a group of `Ray actors (stateful processes) <https://docs.ray.io/en/latest/walkthrough.html#remote-classes-actors>`_.

.. code-block:: python

    import ray
    from horovod.ray import RayExecutor

    # Start the Ray cluster or attach to an existing Ray cluster.
    ray.init()

    # Start num_hosts * num_slots actors on the cluster
    # (num_hosts and num_slots are chosen to match your cluster).
    executor = RayExecutor(
        settings, num_hosts=num_hosts, num_slots=num_slots, use_gpu=True)

    # Launch the Ray actors on each machine.
    # This will launch `num_slots` actors on each machine, each with
    # 1 GPU allocated (set via CUDA_VISIBLE_DEVICES).
    executor.start()


All actors will be part of the Horovod ring, so ``RayExecutor`` invocations will be able to support arbitrary Horovod collective operations.

Note that there is an implicit assumption that the cluster is homogeneous in shape (i.e., all machines have the same number of slots available). This is simply
an implementation detail and is not a fundamental limitation.

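The ``settings`` object above holds the executor configuration; a minimal way to construct it, as done in ``examples/tensorflow2_mnist_ray.py`` added in this commit, is:

.. code-block:: python

    from horovod.ray import RayExecutor

    # 30-second setup timeout, matching the example script in this commit.
    settings = RayExecutor.create_settings(timeout_s=30)
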
To actually execute a function, you can run the following:

.. code-block:: python

    # In its simplest form, a function must take in a dummy variable.
    # `hvd` is the Horovod module for your framework (e.g., horovod.torch).
    def simple_fn(_):
        hvd.init()
        print("hvd rank", hvd.rank())
        return hvd.rank()

    # Execute the function on all workers at once.
    result = executor.execute(simple_fn)
    # Check that the rank of each worker is unique.
    assert len(set(result)) == num_hosts * num_slots

    executor.shutdown()

Execution
~~~~~~~~~

A unique feature of Ray is its support for `stateful Actors <https://docs.ray.io/en/latest/walkthrough.html#remote-classes-actors>`_. This means that you can start arbitrary Python classes on each worker, easily supporting operations and calls where data is cached in memory.

.. code-block:: python

    import ray
    import torch
    import horovod.torch as hvd
    from horovod.ray import RayExecutor

    class MyModel:
        def __init__(self, learning_rate):
            self.model = NeuralNet()  # NeuralNet is a user-defined torch.nn.Module
            optimizer = torch.optim.SGD(
                self.model.parameters(),
                lr=learning_rate,
            )
            self.optimizer = hvd.DistributedOptimizer(optimizer)

        def get_weights(self):
            return dict(self.model.named_parameters())

        def train(self):
            return train(self.model, self.optimizer)  # user-defined training step


    ray.init()
    executor = RayExecutor(...)
    executor.start(executable_cls=MyModel)
    for i in range(5):
        executor.execute(lambda worker: worker.train())

    result = executor.execute(lambda worker: worker.get_weights())

    # result will be N copies of the model weights
    assert all(isinstance(res, dict) for res in result)

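``MyModel`` above takes a ``learning_rate`` constructor argument that the snippet never passes. Current versions of ``horovod.ray`` let constructor arguments be forwarded when starting the actors; whether this initial commit already supports that is an assumption here, so treat the following as an illustrative sketch rather than part of this change:

.. code-block:: python

    # Assumption: `executable_kwargs` is forwarded to MyModel.__init__ on each worker.
    executor.start(executable_cls=MyModel,
                   executable_kwargs={"learning_rate": 0.1})
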
AWS: Cluster Launcher
---------------------

You can also easily leverage the `Ray cluster launcher <https://docs.ray.io/en/latest/cluster/launcher.html>`_ to spin up cloud instances.

.. code-block:: yaml

    # Save as `ray_cluster.yaml`

    cluster_name: horovod-cluster
    provider: {type: aws, region: us-west-2}
    auth: {ssh_user: ubuntu}
    min_workers: 3
    max_workers: 3

    # Deep Learning AMI (Ubuntu) Version 21.0
    head_node: {InstanceType: p3.2xlarge, ImageId: ami-0b294f219d14e6a82}
    worker_nodes: {InstanceType: p3.2xlarge, ImageId: ami-0b294f219d14e6a82}
    setup_commands: # Set up each node.
        - HOROVOD_WITH_GLOO=1 HOROVOD_GPU_OPERATIONS=NCCL pip install horovod[ray]

You can start the specified Ray cluster and monitor its status with:

.. code-block:: bash

    $ ray up ray_cluster.yaml       # starts the head node
    $ ray monitor ray_cluster.yaml  # wait for worker nodes

Then, in your python script, make sure you add ``ray.init(address="auto")`` to connect
to the distributed Ray cluster.

.. code-block:: diff

    -ray.init()
    +ray.init(address="auto")

Then you can execute Ray scripts on the cluster:

.. code-block:: bash

    $ ray submit ray_cluster.yaml <your_script.py>

    # the above is equivalent to
    $ ray attach ray_cluster.yaml  # ssh
    ubuntu@ip-172-31-24-53:~$ python <your_script.py>

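For illustration, a minimal ``<your_script.py>`` might look like the following (a sketch assembled from the snippets above; the host and slot counts are placeholders that should match your cluster, e.g. four p3.2xlarge nodes with one GPU each, and it assumes a framework such as PyTorch is available on the nodes):

.. code-block:: python

    import ray
    import horovod.torch as hvd
    from horovod.ray import RayExecutor

    def check_rank(_):
        hvd.init()
        return hvd.rank()

    # Connect to the cluster started with `ray up` (see above).
    ray.init(address="auto")

    settings = RayExecutor.create_settings(timeout_s=30)
    executor = RayExecutor(settings, num_hosts=4, num_slots=1, use_gpu=True)
    executor.start()
    print(executor.execute(check_rank))  # one rank per worker
    executor.shutdown()
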
.. inclusion-marker-end-do-not-remove

docs/ray_include.rst

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
.. include:: ./ray.rst
   :start-after: inclusion-marker-start-do-not-remove
   :end-before: inclusion-marker-end-do-not-remove

examples/tensorflow2_mnist_ray.py

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
import tensorflow as tf
import horovod.tensorflow.keras as hvd

import ray
from horovod.ray import RayExecutor


def train(num_epochs):
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(
            gpus[hvd.local_rank()], 'GPU')

    (mnist_images, mnist_labels), _ = \
        tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % hvd.rank())

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
         tf.cast(mnist_labels, tf.int64))
    )
    dataset = dataset.repeat().shuffle(10000).batch(128)

    mnist_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(10, activation='softmax')
    ])

    # Horovod: adjust learning rate based on number of GPUs.
    scaled_lr = 0.001 * hvd.size()
    opt = tf.optimizers.Adam(scaled_lr)

    # Horovod: add Horovod DistributedOptimizer.
    opt = hvd.DistributedOptimizer(opt)

    # Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
    # uses hvd.DistributedOptimizer() to compute gradients.
    mnist_model.compile(loss=tf.losses.SparseCategoricalCrossentropy(),
                        optimizer=opt,
                        metrics=['accuracy'],
                        experimental_run_tf_function=False)

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=3, initial_lr=scaled_lr, verbose=1),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(
            './checkpoint-{epoch}.h5'))

    # Horovod: write logs on worker 0.
    verbose = 1 if hvd.rank() == 0 else 0

    # Train the model.
    # Horovod: adjust number of steps based on number of GPUs.
    mnist_model.fit(dataset, steps_per_epoch=500 // hvd.size(),
                    callbacks=callbacks, epochs=num_epochs, verbose=verbose)


ray.init()
settings = RayExecutor.create_settings(timeout_s=30)
executor = RayExecutor(settings, num_hosts=1, num_slots=2, use_gpu=False)
executor.start()
executor.run(train, kwargs=dict(num_epochs=1))
executor.shutdown()
