Commit bdce24b

Fix affinity, convert oneccl.md to oneccl.rst (horovod#2350)

Signed-off-by: Yana Shchyokotova <[email protected]>
1 parent bfe9178

File tree

9 files changed: +518 additions, −86 deletions


.buildkite/gen-pipeline.sh

Lines changed: 2 additions & 2 deletions

@@ -18,8 +18,8 @@ tests=$(if [[ "${BUILDKITE_BRANCH:-}" == "${BUILDKITE_PIPELINE_DEFAULT_BRANCH:-}
   printf "test-cpu-gloo-py3_8-tf2_3_0-keras2_3_1-torch1_6_0-mxnet1_5_0-pyspark3_0_1 "
   printf "test-cpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_7 "
   printf "test-cpu-mpich-py3_6-tf1_15_0-keras2_3_1-torch1_4_0-mxnet1_5_0-pyspark2_4_7 "
-  # printf "test-cpu-oneccl-py3_6-tf1_15_0-keras2_3_1-torch1_4_0-mxnet1_5_0-pyspark2_4_7 "
-  # printf "test-cpu-oneccl-ofi-py3_6-tf1_15_0-keras2_3_1-torch1_4_0-mxnet1_5_0-pyspark2_4_7 "
+  printf "test-cpu-oneccl-py3_6-tf1_15_0-keras2_3_1-torch1_4_0-mxnet1_5_0-pyspark2_4_7 "
+  printf "test-cpu-oneccl-ofi-py3_6-tf1_15_0-keras2_3_1-torch1_4_0-mxnet1_5_0-pyspark2_4_7 "
   printf "test-gpu-openmpi-py3_6-tf1_15_0-keras2_2_4-torch1_3_0-mxnet1_4_1-pyspark2_4_7 "
   printf "test-gpu-gloo-py3_6-tf2_0_0-keras2_3_1-torch1_4_0-mxnet1_4_1-pyspark2_4_7 "
   printf "test-gpu-openmpi-gloo-py3_6-tf2_2_0-keras2_3_1-torch1_5_0-mxnet1_4_1-pyspark2_4_7 "

Dockerfile.test.cpu

Lines changed: 4 additions & 3 deletions

@@ -88,20 +88,21 @@ RUN if [[ ${MPI_KIND} == "OpenMPI" ]]; then \
     chmod +x /usr/local/oneccl/bin/mpigxx && \
     cp /tmp/oneCCL-master/mpi/lib/libmpicxx.so /usr/local/oneccl/lib && \
     chmod +x /usr/local/oneccl/lib/libmpicxx.so && \
+    cp /tmp/oneCCL-master/mpi/lib/libmpifort.so /usr/local/oneccl/lib && \
+    chmod +x /usr/local/oneccl/lib/libmpifort.so && \
     sed -i 's/if \[ -z \"\${I_MPI_ROOT}\" \]/if [ -z \"${I_MPI_ROOT:-}\" ]/g' /usr/local/oneccl/env/setvars.sh && \
     sed -i 's/ \$1/ \${1:-}/g' /usr/local/oneccl/env/setvars.sh && \
     echo ". /usr/local/oneccl/env/setvars.sh" > /oneccl_env && \
     chmod +x /oneccl_env && \
-    echo "export CCL_ATL_TRANSPORT=ofi; export CCL_ATL_SHM=0; \
+    echo "export CCL_ATL_TRANSPORT=ofi; \
     echo \"\$(env)\"; \
     echo \"mpirun is \$(which mpirun)\"; \
     echo \"LD_LIBRARY_PATH is \$(echo \$LD_LIBRARY_PATH)\"; \
     echo \"oneCCL links with \$(ldd /usr/local/oneccl/lib/libccl.so)\"; \
     mpirun -np 2 -hosts localhost \$@" > /mpirun_command_ofi && \
     chmod +x /mpirun_command_ofi && \
     cp /mpirun_command_ofi /mpirun_command_mpi && \
-    sed -i 's/export CCL_ATL_TRANSPORT=ofi;//g' /mpirun_command_mpi && \
-    sed -i 's/export CCL_ATL_SHM=0;//g' /mpirun_command_mpi && \
+    sed -i 's/export CCL_ATL_TRANSPORT=ofi;/export CCL_ATL_TRANSPORT=mpi;/g' /mpirun_command_mpi && \
     echo "-L/usr/local/oneccl/lib -lmpi -I/usr/local/oneccl/include" > /mpicc_oneccl && \
     chmod +x /mpicc_oneccl && \
     echo "/mpirun_command_mpi" > /mpirun_command; \
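Previously the MPI-transport variant of the mpirun command file was produced by deleting the `export CCL_ATL_TRANSPORT=ofi;` setting (together with the now-removed `CCL_ATL_SHM=0`); the fixed `sed` call rewrites the transport to `mpi` instead, so the command file always sets it explicitly. A minimal sketch of that rewrite, using hypothetical files under `/tmp` in place of `/mpirun_command_ofi` and `/mpirun_command_mpi`:

```shell
# Sketch only: /tmp paths stand in for the real /mpirun_command_* files.
cat > /tmp/mpirun_command_ofi <<'EOF'
export CCL_ATL_TRANSPORT=ofi; mpirun -np 2 -hosts localhost "$@"
EOF

# Derive the MPI-transport variant from the OFI one, as the Dockerfile does:
cp /tmp/mpirun_command_ofi /tmp/mpirun_command_mpi
sed -i 's/export CCL_ATL_TRANSPORT=ofi;/export CCL_ATL_TRANSPORT=mpi;/g' /tmp/mpirun_command_mpi

grep CCL_ATL_TRANSPORT /tmp/mpirun_command_mpi
# -> export CCL_ATL_TRANSPORT=mpi; mpirun -np 2 -hosts localhost "$@"
```

Rewriting rather than deleting keeps the transport choice explicit in the derived command file instead of falling back to oneCCL's default selection.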

docs/index.rst

Lines changed: 2 additions & 0 deletions

@@ -119,6 +119,8 @@ Guides

    mpi_include

+   oneccl_include
+
    conda_include

    docker_include

docs/install.rst

Lines changed: 1 addition & 1 deletion

@@ -190,7 +190,7 @@ oneCCL
 ~~~~~~

 oneCCL is an Intel library for accelerated collective operations on CPU. See
-`Horovod with Intel(R) oneCCL <oneccl.md>`_ for more details.
+`Horovod with Intel(R) oneCCL <oneccl.rst>`_ for more details.

 Set ``HOROVOD_CPU_OPERATIONS=CCL`` to use oneCCL.
docs/oneccl.md

Lines changed: 0 additions & 78 deletions
This file was deleted.

docs/oneccl.rst

Lines changed: 90 additions & 0 deletions

@@ -0,0 +1,90 @@

.. inclusion-marker-start-do-not-remove


Horovod with Intel(R) oneCCL
============================

To use Horovod with the Intel(R) oneAPI Collective Communications Library (oneCCL), follow the steps below.

1. Install `oneCCL <https://github.com/intel/oneccl>`_.

   To install oneCCL, follow `these steps <https://github.com/intel/oneccl/blob/master/README.md>`_.

   Source ``setvars.sh`` to start using oneCCL.

   .. code-block:: bash

       source <install_dir>/env/setvars.sh

2. Install the `Intel(R) MPI Library <https://software.intel.com/en-us/mpi-library>`_.

   To install the Intel MPI Library, follow `these instructions <https://software.intel.com/en-us/mpi-library/documentation/get-started>`_.

   Source the ``mpivars.sh`` script to establish the proper environment settings.

   .. code-block:: bash

       source <installdir_MPI>/intel64/bin/mpivars.sh release_mt

3. Set the ``HOROVOD_CPU_OPERATIONS`` environment variable.

   .. code-block:: bash

       export HOROVOD_CPU_OPERATIONS=CCL

4. Install Horovod from source:

   .. code-block:: bash

       python setup.py build
       python setup.py install

   or via pip:

   .. code-block:: bash

       pip install horovod

**Advanced:** You can set the affinity of the Horovod background thread with the ``HOROVOD_THREAD_AFFINITY`` environment variable.
See the instructions below.

If there are N Horovod ranks per node, this variable should contain a value for every rank, separated by commas:

.. code-block:: bash

    export HOROVOD_THREAD_AFFINITY=c0,c1,...,c(N-1)

where c0,...,c(N-1) are the core IDs to pin each rank's background thread to.

Set the number of oneCCL workers:

.. code-block:: bash

    export CCL_WORKER_COUNT=X

where X is the number of threads you would like to dedicate to driving communication. This means that for every rank there are X oneCCL
workers available.

Set the oneCCL workers' affinity:

.. code-block:: bash

    export CCL_WORKER_AFFINITY=c0,c1,..,c(X-1)

where c0,c1,..,c(X-1) are the core IDs dedicated to oneCCL workers (the X 'last' cores are used by default). This variable sets the affinity for all
oneCCL workers (``CCL_WORKER_COUNT`` * number of ranks per node) available to the ranks running on one node.

For instance, suppose we have 2 nodes, each with 2 sockets: socket0 has CPUs 0-17,36-53 and socket1 has CPUs 18-35,54-71. We decide to pin the CCL
workers to the last two cores of each socket, while pinning the Horovod background thread to one of the hyper-thread siblings of the CCL workers'
cores. All these cores are excluded from Intel MPI pinning via ``I_MPI_PIN_PROCESSOR_EXCLUDE_LIST`` so that they are dedicated to CCL and Horovod
tasks only, avoiding conflicts with the framework's computational threads.

.. code-block:: bash

    export I_MPI_PIN_PROCESSOR_EXCLUDE_LIST="16,17,34,35,52,53,70,71"
    export I_MPI_PIN_DOMAIN=socket
    export HOROVOD_THREAD_AFFINITY="53,71"
    export CCL_WORKER_COUNT=2
    export CCL_WORKER_AFFINITY="16,17,34,35"
    mpirun -n 4 -ppn 2 -hostfile hosts python ./run_example.py

.. inclusion-marker-end-do-not-remove

docs/oneccl_include.rst

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@

.. include:: ./oneccl.rst
   :start-after: inclusion-marker-start-do-not-remove
   :end-before: inclusion-marker-end-do-not-remove

horovod/common/common.cc

Lines changed: 2 additions & 2 deletions

@@ -156,12 +156,12 @@ void parse_and_set_affinity(const char* affinity, int local_size, int local_rank
   char* affinity_copy = (char*)calloc(affinity_len + 1, sizeof(char));
   memcpy(affinity_copy, affinity, affinity_len);
   char* tmp = affinity_copy;
-  char *endptr;
+  char* endptr;

   std::vector<int> core_ids(local_size);
   int count = 0;

-  while (*tmp != 0 && count < local_size) {
+  while (tmp && count < local_size) {
     auto core_id_str = strsep(&tmp, ",");
     errno = 0;
     auto core_id = std::strtol(core_id_str, &endptr, 10);
