Commit 8a12dc9

Add Reducescatter op (NCCL, MPI, Gloo) (horovod#3299)
Signed-off-by: Max H. Gerlach <[email protected]>
Co-authored-by: Jesse Benson (AI) <[email protected]>
Co-authored-by: Jesse Benson <[email protected]>

1 parent e02bdca · commit 8a12dc9


49 files changed: +2845 −111 lines

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
@@ -8,10 +8,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
+- Added `hvd.reducescatter()` operation with implementations in NCCL, MPI, and Gloo. ([#3299](https://github.com/horovod/horovod/pull/3299))
+
 ### Changed
 
+- MXNet: Updated allreduce functions to newer `op` API. ([#3299](https://github.com/horovod/horovod/pull/3299))
+
 ### Deprecated
 
+- MXNet: Deprecated `average` argument of allreduce functions. ([#3299](https://github.com/horovod/horovod/pull/3299))
+
 ### Removed
 
 ### Fixed
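
The headline change is the new `hvd.reducescatter()` call. Below is a minimal usage sketch with the TensorFlow frontend; the `op=` keyword is taken from the Keras helper added later in this commit, while the even split of the reduced tensor's first dimension across ranks is an assumption rather than documented behavior of this commit.

```python
# Hedged usage sketch of the new reducescatter op with the TensorFlow frontend.
# Run under e.g. `horovodrun -np 4 python this_script.py`.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Every rank contributes a tensor of identical shape, as required by the
# controller-side shape check added in this commit.
local = tf.ones([8, 10]) * hvd.rank()

# Sum across all ranks, then receive this rank's slice of the result.
# Assumption: with 4 ranks, each rank gets a [2, 10] piece of the [8, 10] sum.
piece = hvd.reducescatter(local, op=hvd.Sum)
print(f"rank {hvd.rank()} got shape {piece.shape}")
```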

CMakeLists.txt

Lines changed: 5 additions & 4 deletions
@@ -110,8 +110,9 @@ set_gpu_op(HOROVOD_GPU_ALLREDUCE "MPI;NCCL;DDL")
 set_gpu_op(HOROVOD_GPU_ALLGATHER "MPI;NCCL")
 set_gpu_op(HOROVOD_GPU_BROADCAST "MPI;NCCL")
 set_gpu_op(HOROVOD_GPU_ALLTOALL "MPI;NCCL")
+set_gpu_op(HOROVOD_GPU_REDUCESCATTER "MPI;NCCL")
 
-foreach(VAR in ITEMS HOROVOD_GPU_ALLREDUCE HOROVOD_GPU_ALLGATHER HOROVOD_GPU_BROADCAST HOROVOD_GPU_ALLTOALL)
+foreach(VAR in ITEMS HOROVOD_GPU_ALLREDUCE HOROVOD_GPU_ALLGATHER HOROVOD_GPU_BROADCAST HOROVOD_GPU_ALLTOALL HOROVOD_GPU_REDUCESCATTER)
   if(DEFINED ${VAR})
     string(SUBSTRING ${${VAR}} 0 1 ${VAR})
     convert_to_ascii_dec(ASCII_DEC ${${VAR}})
@@ -197,7 +198,7 @@ macro(ADD_CUDA)
   endif()
 endmacro()
 
-if(DEFINED HOROVOD_GPU_ALLREDUCE OR DEFINED HOROVOD_GPU_ALLGATHER OR DEFINED HOROVOD_GPU_BROADCAST OR DEFINED HOROVOD_GPU_ALLTOALL)
+if(DEFINED HOROVOD_GPU_ALLREDUCE OR DEFINED HOROVOD_GPU_ALLGATHER OR DEFINED HOROVOD_GPU_BROADCAST OR DEFINED HOROVOD_GPU_ALLTOALL OR DEFINED HOROVOD_GPU_REDUCESCATTER)
   if(NOT DEFINED HOROVOD_GPU OR HOROVOD_GPU STREQUAL "CUDA")
     add_cuda()
   elseif(HOROVOD_GPU STREQUAL "ROCM")
@@ -215,7 +216,7 @@ if(DEFINED HOROVOD_GPU_ALLREDUCE OR DEFINED HOROVOD_GPU_ALLGATHER OR DEFINED HOR
 endif()
 
 # NCCL
-if(HOROVOD_GPU_ALLREDUCE STREQUAL "N" OR HOROVOD_GPU_ALLGATHER STREQUAL "N" OR HOROVOD_GPU_BROADCAST STREQUAL "N" OR HOROVOD_GPU_ALLTOALL STREQUAL "N")
+if(HOROVOD_GPU_ALLREDUCE STREQUAL "N" OR HOROVOD_GPU_ALLGATHER STREQUAL "N" OR HOROVOD_GPU_BROADCAST STREQUAL "N" OR HOROVOD_GPU_ALLTOALL STREQUAL "N" OR HOROVOD_GPU_REDUCESCATTER STREQUAL "N")
   if(HAVE_ROCM)
     find_package(rccl REQUIRED)
     include_directories(SYSTEM ${RCCL_INCLUDE_DIRS})
@@ -256,7 +257,7 @@ if(DEFINED CCL_ROOT)
 endif()
 
 set(HOROVOD_ALLOW_MIXED_GPU_IMPL $ENV{HOROVOD_ALLOW_MIXED_GPU_IMPL})
-if(HOROVOD_GPU_ALLREDUCE STREQUAL "N" AND (HOROVOD_GPU_ALLGATHER STREQUAL "M" OR HOROVOD_GPU_BROADCAST STREQUAL "M" OR HOROVOD_GPU_ALLTOALL STREQUAL "M") AND
+if(HOROVOD_GPU_ALLREDUCE STREQUAL "N" AND (HOROVOD_GPU_ALLGATHER STREQUAL "M" OR HOROVOD_GPU_BROADCAST STREQUAL "M" OR HOROVOD_GPU_ALLTOALL STREQUAL "M" OR HOROVOD_GPU_REDUCESCATTER STREQUAL "M") AND
     NOT HOROVOD_ALLOW_MIXED_GPU_IMPL STREQUAL "1")
   message(FATAL_ERROR "You should not mix NCCL and MPI GPU due to a possible deadlock.\n"
                       "If you are sure you want to mix them, set the "

docs/concepts.rst

Lines changed: 4 additions & 0 deletions
@@ -31,6 +31,10 @@ a training script on 4 servers, each having 4 GPUs. If we launched one copy of t
 .. image:: http://mpitutorial.com/tutorials/mpi-broadcast-and-collective-communication/broadcast_pattern.png
    :alt: Broadcast Illustration
 
+* *Reducescatter* is an operation that aggregates data among multiple processes and scatters the data across them. *Reducescatter* is used to average dense tensors then split them across processes. Here's an illustration from the `Nvidia developer guide <https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#reducescatter>`__:
+
+  .. image:: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/_images/reducescatter.png
+     :alt: Reducescatter Illustration
 
 * *Alltoall* is an operation to exchange data between all processes. *Alltoall* may be useful to implement neural networks with advanced architectures that span multiple devices.
 
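
As a supplement to the illustration referenced above, here is a small single-process sketch of the reduce-scatter semantics: aggregate the inputs of all participants, then hand each participant one slice of the result. It is simulated with NumPy; the even split along the first axis mirrors the NCCL picture and is an assumption about the exact partitioning, not a statement about Horovod's implementation.

```python
# Single-process NumPy simulation of reduce-scatter semantics.
import numpy as np

def simulated_reducescatter(inputs):
    """inputs: list with one identically shaped array per simulated rank."""
    total = np.sum(inputs, axis=0)               # the "reduce" step
    return np.array_split(total, len(inputs))    # the "scatter" step

inputs = [np.full((8, 10), rank, dtype=np.float32) for rank in range(4)]
pieces = simulated_reducescatter(inputs)
for rank, piece in enumerate(pieces):
    print(f"rank {rank} receives shape {piece.shape}")  # (2, 10) each
```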

docs/gpus.rst

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ use it instead:
     $ HOROVOD_GPU_ALLREDUCE=MPI pip install --no-cache-dir horovod
 
 
-Additionally, if your MPI vendor's implementation supports *allgather* and *broadcast* operations on GPU, you can
+Additionally, if your MPI vendor's implementation supports *allgather*, *broadcast*, and *reducescatter* operations on GPU, you can
 configure Horovod to use them as well:
 
 .. code-block:: bash

docs/install.rst

Lines changed: 3 additions & 1 deletion
@@ -245,7 +245,9 @@ Possible values are given in curly brackets: {}.
 * ``HOROVOD_GPU_ALLREDUCE`` - {NCCL, MPI}. Framework to use for GPU tensor allreduce.
 * ``HOROVOD_GPU_ALLGATHER`` - {NCCL, MPI}. Framework to use for GPU tensor allgather.
 * ``HOROVOD_GPU_BROADCAST`` - {NCCL, MPI}. Framework to use for GPU tensor broadcast.
-* ``HOROVOD_ALLOW_MIXED_GPU_IMPL`` - {1}. Allow Horovod to install with NCCL allreduce and MPI GPU allgather / broadcast. Not recommended due to a possible deadlock.
+* ``HOROVOD_GPU_ALLTOALL`` - {NCCL, MPI}. Framework to use for GPU tensor alltoall.
+* ``HOROVOD_GPU_REDUCESCATTER`` - {NCCL, MPI}. Framework to use for GPU tensor reducescatter.
+* ``HOROVOD_ALLOW_MIXED_GPU_IMPL`` - {1}. Allow Horovod to install with NCCL allreduce and MPI GPU allgather / broadcast / alltoall / reducescatter. Not recommended due to a possible deadlock.
 * ``HOROVOD_CPU_OPERATIONS`` - {MPI, GLOO, CCL}. Framework to use for CPU tensor allreduce, allgather, and broadcast.
 * ``HOROVOD_CMAKE`` - path to the CMake binary used to build Horovod.
 * ``HOROVOD_WITH_TENSORFLOW`` - {1}. Require Horovod to install with TensorFlow support enabled.
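
Because these flags only take effect at build time, it can be useful to check afterwards which controllers and collective backends the installed wheel was actually compiled against. A small sketch, assuming the build-info helpers `mpi_built()`, `gloo_built()`, and `nccl_built()` are re-exported by `horovod.tensorflow` as in other Horovod releases:

```python
# Sketch: inspect what the installed Horovod build supports.
import horovod.tensorflow as hvd

print("MPI built: ", hvd.mpi_built())
print("Gloo built:", hvd.gloo_built())
print("NCCL built:", hvd.nccl_built())  # True if the build linked against NCCL
```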

horovod/_keras/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -188,6 +188,10 @@ def broadcast(backend, value, root_rank, name):
     return _eval(backend, hvd.broadcast(tf.constant(value, name=name), root_rank))
 
 
+def reducescatter(backend, value, name, op):
+    return _eval(backend, hvd.reducescatter(tf.constant(value, name=name), op=op))
+
+
 def load_model(keras, wrap_optimizer, optimizer_modules, filepath, custom_optimizers, custom_objects):
     horovod_objects = {
         subclass.__name__.lower(): wrap_optimizer(subclass)
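
The helper takes the Keras `backend` as an explicit first argument so the same implementation can serve both the `keras` and `tf.keras` frontends. A hypothetical framework-level binding might look like the sketch below; the module paths and the wrapper's default arguments are illustrative, not part of this diff.

```python
# Illustrative only: how a framework module might expose the shared helper.
# `_impl` stands for the shared horovod._keras module shown above and `K` for
# the chosen Keras backend.
import tensorflow.keras.backend as K
import horovod._keras as _impl
import horovod.tensorflow as hvd


def reducescatter(value, name=None, op=hvd.Sum):
    """Reduce `value` across ranks and return this rank's slice as a NumPy array."""
    return _impl.reducescatter(K, value, name, op)
```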

horovod/common/common.cc

Lines changed: 2 additions & 2 deletions
@@ -101,7 +101,7 @@ int TensorShape::dims() const {
 
 int64_t TensorShape::dim_size(int idx) const {
   assert(idx >= 0);
-  assert(idx < shape_.size());
+  assert(idx < (int)shape_.size());
   return shape_[idx];
 }
 
@@ -165,7 +165,7 @@ void parse_and_set_affinity(const char* affinity, int local_size, int local_rank
     auto core_id_str = strsep(&tmp, ",");
     errno = 0;
     auto core_id = std::strtol(core_id_str, &endptr, 10);
-    if (errno == ERANGE && (core_id == LONG_MAX || core_id == LONG_MIN)
+    if ((errno == ERANGE && (core_id == LONG_MAX || core_id == LONG_MIN))
         || (errno != 0 && core_id == 0)){
       LOG(ERROR) << "Core ID value is invalid in " << HOROVOD_THREAD_AFFINITY
                  << "=" << affinity;

horovod/common/common.h

Lines changed: 2 additions & 0 deletions
@@ -83,6 +83,7 @@ namespace common {
 #define MEMCPY_IN_SHARED_BUFFER "MEMCPY_IN_SHARED_BUFFER"
 #define MPI_ALLREDUCE "MPI_ALLREDUCE"
 #define MPI_ADASUM_ALLREDUCE "MPI_ADASUM_ALLREDUCE"
+#define MPI_REDUCESCATTER "MPI_REDUCESCATTER"
 #define MEMCPY_OUT_HOST_BUFFER "MEMCPY_OUT_HOST_BUFFER"
 #define NCCL_ALLREDUCE "NCCL_ALLREDUCE"
 #define MEMCPY_OUT_FUSION_BUFFER "MEMCPY_OUT_FUSION_BUFFER"
@@ -102,6 +103,7 @@ namespace common {
 #define GLOO_ALLREDUCE "GLOO_ALLREDUCE"
 #define GLOO_ALLGATHER "GLOO_ALLGATHER"
 #define GLOO_BCAST "GLOO_BCAST"
+#define GLOO_REDUCESCATTER "GLOO_REDUCESCATTER"
 #define HOROVOD_ELASTIC "HOROVOD_ELASTIC"
 
 // Horovod knobs.

horovod/common/controller.cc

Lines changed: 25 additions & 4 deletions
@@ -536,11 +536,12 @@ Response Controller::ConstructResponse(const std::string& name, int joined_size)
     }
   }
 
-  // If we are doing an allreduce or broadcast, check that all tensor shapes are
-  // identical.
+  // If we are doing an allreduce, broadcast, or reducescatter check that all
+  // tensor shapes are identical.
   if (message_type == Request::ALLREDUCE ||
       message_type == Request::ADASUM ||
-      message_type == Request::BROADCAST) {
+      message_type == Request::BROADCAST ||
+      message_type == Request::REDUCESCATTER) {
     TensorShape tensor_shape;
     for (auto dim : requests[0].tensor_shape()) {
       tensor_shape.AddDim(dim);
@@ -673,6 +674,19 @@ Response Controller::ConstructResponse(const std::string& name, int joined_size)
     }
   }
 
+  if (message_type == Request::REDUCESCATTER) {
+    if (joined_size > 0) {
+      error = true;
+      error_message_stream << "Reducescatter is not supported with Join at this time.";
+    }
+
+    TensorShape tensor_shape;
+    for (auto dim : requests[0].tensor_shape()) {
+      tensor_shape.AddDim(dim);
+    }
+    tensor_sizes.push_back(tensor_shape.num_elements());
+  }
+
   if (message_type == Request::ALLREDUCE || message_type == Request::ADASUM) {
     TensorShape tensor_shape;
     for (auto dim : requests[0].tensor_shape()) {
@@ -756,6 +770,12 @@ Response Controller::ConstructResponse(const std::string& name, int joined_size)
     response.set_response_type(Response::BROADCAST);
   } else if (message_type == Request::ALLTOALL) {
     response.set_response_type(Response::ALLTOALL);
+  } else if (message_type == Request::REDUCESCATTER) {
+    response.set_response_type(Response::REDUCESCATTER);
+    for (auto dim : tensor_sizes) {
+      response.add_tensor_size(dim);
+    }
+    response.set_tensor_type(data_type);
   } else if (message_type == Request::ADASUM) {
     response.set_response_type(Response::ADASUM);
     for (auto dim : tensor_sizes) {
@@ -815,7 +835,8 @@ void Controller::FuseResponses(std::deque<Response>& responses,
     responses.pop_front();
     int64_t tensor_size = 0;
     if (response.response_type() == Response::ResponseType::ALLREDUCE ||
-        response.response_type() == Response::ResponseType::ADASUM) {
+        response.response_type() == Response::ResponseType::ADASUM ||
+        response.response_type() == Response::ResponseType::REDUCESCATTER) {
       // Attempt to add more responses to this fused response.
 
       tensor_size = response.tensor_sizes()[0] * GetTypeSize(response.tensor_type());
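
The controller-side changes above boil down to two pieces of bookkeeping: reducescatter requests must carry identical tensor shapes on every rank, and the element count of that shape feeds the fusion-size accounting in `FuseResponses`. A rough Python analogue of that logic, using hypothetical names:

```python
# Rough Python analogue (hypothetical names) of the controller bookkeeping
# added for REDUCESCATTER: validate that all ranks sent the same shape, then
# record the element count that the fusion logic multiplies by the dtype size.
import numpy as np

def construct_reducescatter_response(request_shapes, dtype=np.float32):
    first = request_shapes[0]
    for rank, shape in enumerate(request_shapes[1:], start=1):
        if shape != first:
            raise ValueError(
                f"Mismatched reducescatter shapes: rank 0 sent {first}, "
                f"rank {rank} sent {shape}")
    num_elements = int(np.prod(first))
    fusion_bytes = num_elements * np.dtype(dtype).itemsize
    return {"tensor_sizes": [num_elements], "fusion_bytes": fusion_bytes}

print(construct_reducescatter_response([(8, 10), (8, 10), (8, 10), (8, 10)]))
```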

horovod/common/message.cc

Lines changed: 6 additions & 0 deletions
@@ -102,6 +102,9 @@ const std::string& Request::RequestType_Name(RequestType value) {
   case RequestType::BROADCAST:
     static const std::string broadcast("BROADCAST");
     return broadcast;
+  case RequestType::REDUCESCATTER:
+    static const std::string reducescatter("REDUCESCATTER");
+    return reducescatter;
   case RequestType::JOIN:
     static const std::string join("JOIN");
     return join;
@@ -294,6 +297,9 @@ const std::string& Response::ResponseType_Name(ResponseType value) {
   case ResponseType::BROADCAST:
     static const std::string broadcast("BROADCAST");
     return broadcast;
+  case ResponseType::REDUCESCATTER:
+    static const std::string reducescatter("REDUCESCATTER");
+    return reducescatter;
   case ResponseType::JOIN:
     static const std::string join("JOIN");
     return join;
