
Commit 30b3a73

add new self-hosted CI runner

1 parent fdd1bb9

7 files changed: +74 -20 lines


.github/workflows/unit_test.yml

Lines changed: 48 additions & 1 deletion
@@ -47,11 +47,58 @@ jobs:
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v2
+        env:
+          AGENT_TOOLSDIRECTORY: /opt/hostedtoolcache
         with:
           python-version: ${{ matrix.python-version }}
       - name: do_benchmark
         run: |
           python -m pip install .
           python -m pip install ".[test,k8s]"
           ./ding/scripts/install-k8s-tools.sh
-          make benchmark
+          make benchmark
+
+  test_multiprocess:
+    runs-on: self-hosted
+    if: "!contains(github.event.head_commit.message, 'ci skip')"
+    strategy:
+      matrix:
+        python-version: ["3.7", "3.8", "3.9"]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: do_multiprocesstest
+        timeout-minutes: 40
+        run: |
+          python -m pip install box2d-py
+          python -m pip install .
+          python -m pip install ".[test,k8s]"
+          ./ding/scripts/install-k8s-tools.sh
+          make multiprocesstest
+
+  test_cuda:
+    runs-on: self-hosted
+    if: "!contains(github.event.head_commit.message, 'ci skip')"
+    strategy:
+      matrix:
+        python-version: ["3.7", "3.8", "3.9"]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v3
+        env:
+          AGENT_TOOLSDIRECTORY: /opt/hostedtoolcache
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: do_unittest
+        timeout-minutes: 40
+        run: |
+          python -m pip install torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
+          python -m pip install box2d-py
+          python -m pip install .
+          python -m pip install ".[test,k8s]"
+          ./ding/scripts/install-k8s-tools.sh
+          make cudatest
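
The new test_cuda job pins torch==1.12.1+cu113, so the self-hosted runner's driver must support CUDA 11.3. A quick sanity check one might run on the runner before make cudatest (a sketch, not part of this commit):

    # Sketch, not part of this commit: confirm the pinned cu113 wheel
    # can see the runner's GPUs before running make cudatest.
    import torch

    print(torch.__version__)          # expected: 1.12.1+cu113
    print(torch.version.cuda)         # expected: 11.3
    assert torch.cuda.is_available(), "CUDA runtime not visible to torch"
    print(torch.cuda.device_count())  # several perf tests need >= 2 GPUs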

Makefile

Lines changed: 10 additions & 1 deletion
@@ -57,11 +57,20 @@ benchmark:
 		--durations=0 \
 		-sv -m benchmark
 
+multiprocesstest:
+	pytest ${TEST_DIR} \
+		--cov-report=xml \
+		--cov-report term-missing \
+		--cov=${COV_DIR} \
+		${DURATIONS_COMMAND} \
+		${WORKERS_COMMAND} \
+		-sv -m multiprocesstest
+
 test: unittest # just for compatibility, can be changed later
 
 cpu_test: unittest algotest benchmark
 
-all_test: unittest algotest cudatest benchmark
+all_test: unittest algotest cudatest benchmark multiprocesstest
 
 format:
 	yapf --in-place --recursive -p --verbose --style .style.yapf ${FORMAT_DIR}
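
The new multiprocesstest target reuses the coverage flags of the existing targets and differs only in the -m expression, so pytest collects exactly the tests tagged with that marker. A hypothetical test that the target would pick up (name and body are illustrative only):

    # Hypothetical example: any test carrying this marker is collected
    # by make multiprocesstest (i.e. pytest ... -sv -m multiprocesstest).
    import pytest
    from multiprocessing import get_context

    @pytest.mark.multiprocesstest
    def test_spawn_and_join_worker():
        ctx = get_context("spawn")
        p = ctx.Process(target=print, args=("worker ran",))
        p.start()
        p.join()
        assert p.exitcode == 0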

ding/framework/message_queue/perfs/tests/test_perf_nng.py

Lines changed: 2 additions & 1 deletion
@@ -6,7 +6,8 @@
 
 
 @pytest.mark.benchmark
-# @pytest.mark.multiprocesstest
+@pytest.mark.multiprocesstest
+@pytest.mark.cudatest
 def test_nng():
     if torch.cuda.is_available() and torch.cuda.device_count() >= 2:
         address = socket.gethostbyname(socket.gethostname())

ding/framework/message_queue/perfs/tests/test_perf_shm.py

Lines changed: 2 additions & 2 deletions
@@ -6,15 +6,15 @@
 
 @pytest.mark.mqbenchmark
 @pytest.mark.cudatest
-# @pytest.mark.multiprocesstest
+@pytest.mark.multiprocesstest
 def test_shm_numpy_shm():
     if torch.cuda.is_available():
         shm_perf_main("shm")
 
 
 @pytest.mark.mqbenchmark
 @pytest.mark.cudatest
-# @pytest.mark.multiprocesstest
+@pytest.mark.multiprocesstest
 def test_shm_cuda_shared_tensor():
     if torch.cuda.is_available() and torch.cuda.device_count() >= 2:
         shm_perf_main("cuda_ipc")

ding/framework/message_queue/perfs/tests/test_perf_torchrpc_nccl.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 
 @pytest.mark.benchmark
 @pytest.mark.cudatest
-# @pytest.mark.multiprocesstest
+@pytest.mark.multiprocesstest
 def test_perf_torchrpc_nccl():
     address = socket.gethostbyname(socket.gethostname())
     init_method = "tcp://{}:{}".format(address, find_free_port(address))
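
All three perf tests share the same runtime guard: the body executes only when enough GPUs are visible, so on a CPU-only machine the test passes trivially instead of failing. Reduced to a sketch (test name illustrative):

    import socket

    import pytest
    import torch

    @pytest.mark.cudatest
    @pytest.mark.multiprocesstest
    def test_needs_two_gpus():
        # Guard used by the perf tests above: run the body only when
        # at least two CUDA devices are available.
        if torch.cuda.is_available() and torch.cuda.device_count() >= 2:
            address = socket.gethostbyname(socket.gethostname())
            ...  # two-GPU benchmark body would go here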

ding/framework/message_queue/tests/test_torch_rpc.py

Lines changed: 9 additions & 14 deletions
@@ -8,7 +8,6 @@
 from torch.distributed import rpc
 from multiprocessing import Pool, get_context
 from ding.compatibility import torch_ge_1121
-from ditk import logging
 from ding.utils.system_helper import find_free_port
 
 mq = None
@@ -26,7 +25,6 @@ def torchrpc(rank):
     mq = None
     address = socket.gethostbyname(socket.gethostname())
     recv_tensor_list = [None, None, None, None]
-    logging.getLogger().setLevel(logging.DEBUG)
     name_list = ["A", "B", "C", "D"]
 
     if rank == 0:
@@ -85,7 +83,6 @@ def torchrpc_cuda(rank):
     recv_tensor_list = [None, None, None, None]
     name_list = ["A", "B"]
     address = socket.gethostbyname(socket.gethostname())
-    logging.getLogger().setLevel(logging.DEBUG)
 
     if rank == 0:
         attach_to = name_list[1:]
@@ -95,7 +92,7 @@ def torchrpc_cuda(rank):
     peer_rank = int(rank == 0) or 0
     peer_name = name_list[peer_rank]
     device_map = DeviceMap(rank, [peer_name], [rank], [peer_rank])
-    logging.debug(device_map)
+    print(device_map)
 
     mq = TORCHRPCMQ(
         rpc_name=name_list[rank],
@@ -132,7 +129,6 @@ def torchrpc_args_parser(rank):
     global mq
     global recv_tensor_list
     from ding.framework.parallel import Parallel
-    logging.getLogger().setLevel(logging.DEBUG)
 
     params = Parallel._torchrpc_args_parser(
         n_parallel_workers=1,
@@ -143,30 +139,30 @@ def torchrpc_args_parser(rank):
         local_cuda_devices=None,
         cuda_device_map=None
     )[0]
-
-    logging.debug(params)
+    print(params)
 
     # 1. If attach_to is empty, init_rpc will not block.
     mq = TORCHRPCMQ(**params)
     mq.listen()
     assert mq._running
     mq.stop()
     assert not mq._running
-    logging.debug("[Pass] 1. If attach_to is empty, init_rpc will not block.")
+    print("[Pass] 1. If attach_to is empty, init_rpc will not block.")
 
     # 2. n_parallel_workers != len(node_ids)
     try:
         Parallel._torchrpc_args_parser(n_parallel_workers=999, attach_to=[], node_ids=[1, 2])[0]
     except RuntimeError as e:
-        logging.debug("[Pass] 2. n_parallel_workers != len(node_ids).")
+        print("[Pass] 2. n_parallel_workers != len(node_ids).")
+        pass
     else:
         assert False
 
     # 3. len(local_cuda_devices) != n_parallel_workers
     try:
         Parallel._torchrpc_args_parser(n_parallel_workers=8, node_ids=[1], local_cuda_devices=[1, 2, 3])[0]
     except RuntimeError as e:
-        logging.debug("[Pass] 3. len(local_cuda_devices) != n_parallel_workers.")
+        print("[Pass] 3. len(local_cuda_devices) != n_parallel_workers.")
     else:
         assert False
 
@@ -175,7 +171,7 @@ def torchrpc_args_parser(rank):
     try:
         Parallel._torchrpc_args_parser(n_parallel_workers=999, node_ids=[1], use_cuda=True)[0]
     except RuntimeError as e:
-        logging.debug("[Pass] 4. n_parallel_workers > gpu_nums.")
+        print("[Pass] 4. n_parallel_workers > gpu_nums.")
     else:
         assert False
 
@@ -186,8 +182,7 @@ def torchrpc_args_parser(rank):
     assert params['device_maps'].peer_name_list == ["Node_0", "Node_0", "Node_1"]
     assert params['device_maps'].our_device_list == [0, 1, 1]
     assert params['device_maps'].peer_device_list == [0, 2, 4]
-    # logging.debug(params['device_maps'])
-    logging.debug("[Pass] 5. Set custom device map.")
+    print("[Pass] 5. Set custom device map.")
 
     # 6. Set n_parallel_workers > 1
     params = Parallel._torchrpc_args_parser(n_parallel_workers=8, node_ids=[1])
@@ -201,7 +196,7 @@ def torchrpc_args_parser(rank):
     params = Parallel._torchrpc_args_parser(n_parallel_workers=2, node_ids=[1], use_cuda=True)
     assert params[0]['use_cuda']
     assert len(params[0]['device_maps'].peer_name_list) == DEFAULT_DEVICE_MAP_NUMS - 1
-    logging.debug("[Pass] 6. Set n_parallel_workers > 1.")
+    print("[Pass] 6. Set n_parallel_workers > 1.")
 
 
 @pytest.mark.unittest

pytest.ini

Lines changed: 2 additions & 0 deletions
@@ -10,5 +10,7 @@ markers =
     envpooltest
     other
     tmp
+    multiprocesstest
+    mqbenchmark
 
 norecursedirs = ding/hpc_rl/tests
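
Registering multiprocesstest and mqbenchmark here keeps pytest from warning about unknown marks on the newly enabled decorators (and from erroring under --strict-markers). Registered markers can also be stacked, so a single test can belong to several CI suites, as the perf tests above now do. A minimal sketch (test name illustrative):

    import pytest
    import torch

    # Sketch: a test carrying several registered markers is collected by
    # make cudatest, make multiprocesstest, and mqbenchmark runs alike.
    @pytest.mark.mqbenchmark
    @pytest.mark.cudatest
    @pytest.mark.multiprocesstest
    def test_example_perf():
        if torch.cuda.is_available():
            pass  # benchmark body would go here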
