
Commit 30b3a73

add new self-hosted CI runner

1 parent fdd1bb9

7 files changed: +74 -20 lines


.github/workflows/unit_test.yml

Lines changed: 48 additions & 1 deletion
@@ -47,11 +47,58 @@ jobs:
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v2
+        env:
+          AGENT_TOOLSDIRECTORY: /opt/hostedtoolcache
         with:
           python-version: ${{ matrix.python-version }}
       - name: do_benchmark
         run: |
           python -m pip install .
           python -m pip install ".[test,k8s]"
           ./ding/scripts/install-k8s-tools.sh
-          make benchmark
+          make benchmark
+
+  test_multiprocess:
+    runs-on: self-hosted
+    if: "!contains(github.event.head_commit.message, 'ci skip')"
+    strategy:
+      matrix:
+        python-version: ["3.7", "3.8", "3.9"]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: do_multiprocesstest
+        timeout-minutes: 40
+        run: |
+          python -m pip install box2d-py
+          python -m pip install .
+          python -m pip install ".[test,k8s]"
+          ./ding/scripts/install-k8s-tools.sh
+          make multiprocesstest
+
+  test_cuda:
+    runs-on: self-hosted
+    if: "!contains(github.event.head_commit.message, 'ci skip')"
+    strategy:
+      matrix:
+        python-version: ["3.7", "3.8", "3.9"]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v3
+        env:
+          AGENT_TOOLSDIRECTORY: /opt/hostedtoolcache
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: do_unittest
+        timeout-minutes: 40
+        run: |
+          python -m pip install torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
+          python -m pip install box2d-py
+          python -m pip install .
+          python -m pip install ".[test,k8s]"
+          ./ding/scripts/install-k8s-tools.sh
+          make cudatest
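
The new test_cuda job pins torch==1.12.1+cu113, so the self-hosted runner's driver must support CUDA 11.3. A quick sanity check one might run on the runner before make cudatest (a sketch, not part of this commit):

    # Sketch, not part of this commit: confirm the pinned cu113 wheel
    # can see the runner's GPUs before running make cudatest.
    import torch

    print(torch.__version__)          # expected: 1.12.1+cu113
    print(torch.version.cuda)         # expected: 11.3
    assert torch.cuda.is_available(), "CUDA runtime not visible to torch"
    print(torch.cuda.device_count())  # several perf tests need >= 2 GPUs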

Makefile

Lines changed: 10 additions & 1 deletion
@@ -57,11 +57,20 @@ benchmark:
 		--durations=0 \
 		-sv -m benchmark
 
+multiprocesstest:
+	pytest ${TEST_DIR} \
+		--cov-report=xml \
+		--cov-report term-missing \
+		--cov=${COV_DIR} \
+		${DURATIONS_COMMAND} \
+		${WORKERS_COMMAND} \
+		-sv -m multiprocesstest
+
 test: unittest # just for compatibility, can be changed later
 
 cpu_test: unittest algotest benchmark
 
-all_test: unittest algotest cudatest benchmark
+all_test: unittest algotest cudatest benchmark multiprocesstest
 
 format:
 	yapf --in-place --recursive -p --verbose --style .style.yapf ${FORMAT_DIR}
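
The new multiprocesstest target reuses the coverage flags of the existing targets and differs only in the -m expression, so pytest collects exactly the tests tagged with that marker. A hypothetical test that the target would pick up (name and body are illustrative only):

    # Hypothetical example: any test carrying this marker is collected
    # by make multiprocesstest (i.e. pytest ... -sv -m multiprocesstest).
    import pytest
    from multiprocessing import get_context

    @pytest.mark.multiprocesstest
    def test_spawn_and_join_worker():
        ctx = get_context("spawn")
        p = ctx.Process(target=print, args=("worker ran",))
        p.start()
        p.join()
        assert p.exitcode == 0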

ding/framework/message_queue/perfs/tests/test_perf_nng.py

Lines changed: 2 additions & 1 deletion
@@ -6,7 +6,8 @@
 
 
 @pytest.mark.benchmark
-# @pytest.mark.multiprocesstest
+@pytest.mark.multiprocesstest
+@pytest.mark.cudatest
 def test_nng():
     if torch.cuda.is_available() and torch.cuda.device_count() >= 2:
         address = socket.gethostbyname(socket.gethostname())

ding/framework/message_queue/perfs/tests/test_perf_shm.py

Lines changed: 2 additions & 2 deletions
@@ -6,15 +6,15 @@
 
 @pytest.mark.mqbenchmark
 @pytest.mark.cudatest
-# @pytest.mark.multiprocesstest
+@pytest.mark.multiprocesstest
 def test_shm_numpy_shm():
     if torch.cuda.is_available():
         shm_perf_main("shm")
 
 
 @pytest.mark.mqbenchmark
 @pytest.mark.cudatest
-# @pytest.mark.multiprocesstest
+@pytest.mark.multiprocesstest
 def test_shm_cuda_shared_tensor():
     if torch.cuda.is_available() and torch.cuda.device_count() >= 2:
         shm_perf_main("cuda_ipc")

ding/framework/message_queue/perfs/tests/test_perf_torchrpc_nccl.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 
 @pytest.mark.benchmark
 @pytest.mark.cudatest
-# @pytest.mark.multiprocesstest
+@pytest.mark.multiprocesstest
 def test_perf_torchrpc_nccl():
     address = socket.gethostbyname(socket.gethostname())
     init_method = "tcp://{}:{}".format(address, find_free_port(address))
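
All three perf tests share the same runtime guard: the body executes only when enough GPUs are visible, so on a CPU-only machine the test passes trivially instead of failing. Reduced to a sketch (test name illustrative):

    import socket

    import pytest
    import torch

    @pytest.mark.cudatest
    @pytest.mark.multiprocesstest
    def test_needs_two_gpus():
        # Guard used by the perf tests above: run the body only when
        # at least two CUDA devices are available.
        if torch.cuda.is_available() and torch.cuda.device_count() >= 2:
            address = socket.gethostbyname(socket.gethostname())
            ...  # two-GPU benchmark body would go here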

ding/framework/message_queue/tests/test_torch_rpc.py

Lines changed: 9 additions & 14 deletions
@@ -8,7 +8,6 @@
 from torch.distributed import rpc
 from multiprocessing import Pool, get_context
 from ding.compatibility import torch_ge_1121
-from ditk import logging
 from ding.utils.system_helper import find_free_port
 
 mq = None
@@ -26,7 +25,6 @@ def torchrpc(rank):
     mq = None
     address = socket.gethostbyname(socket.gethostname())
     recv_tensor_list = [None, None, None, None]
-    logging.getLogger().setLevel(logging.DEBUG)
     name_list = ["A", "B", "C", "D"]
 
     if rank == 0:
@@ -85,7 +83,6 @@ def torchrpc_cuda(rank):
     recv_tensor_list = [None, None, None, None]
     name_list = ["A", "B"]
     address = socket.gethostbyname(socket.gethostname())
-    logging.getLogger().setLevel(logging.DEBUG)
 
     if rank == 0:
         attach_to = name_list[1:]
@@ -95,7 +92,7 @@ def torchrpc_cuda(rank):
     peer_rank = int(rank == 0) or 0
     peer_name = name_list[peer_rank]
     device_map = DeviceMap(rank, [peer_name], [rank], [peer_rank])
-    logging.debug(device_map)
+    print(device_map)
 
     mq = TORCHRPCMQ(
         rpc_name=name_list[rank],
@@ -132,7 +129,6 @@ def torchrpc_args_parser(rank):
     global mq
     global recv_tensor_list
     from ding.framework.parallel import Parallel
-    logging.getLogger().setLevel(logging.DEBUG)
 
     params = Parallel._torchrpc_args_parser(
         n_parallel_workers=1,
@@ -143,30 +139,30 @@ def torchrpc_args_parser(rank):
         local_cuda_devices=None,
         cuda_device_map=None
     )[0]
-
-    logging.debug(params)
+    print(params)
 
     # 1. If attach_to is empty, init_rpc will not block.
     mq = TORCHRPCMQ(**params)
     mq.listen()
     assert mq._running
     mq.stop()
     assert not mq._running
-    logging.debug("[Pass] 1. If attach_to is empty, init_rpc will not block.")
+    print("[Pass] 1. If attach_to is empty, init_rpc will not block.")
 
     # 2. n_parallel_workers != len(node_ids)
     try:
         Parallel._torchrpc_args_parser(n_parallel_workers=999, attach_to=[], node_ids=[1, 2])[0]
     except RuntimeError as e:
-        logging.debug("[Pass] 2. n_parallel_workers != len(node_ids).")
+        print("[Pass] 2. n_parallel_workers != len(node_ids).")
+        pass
     else:
         assert False
 
     # 3. len(local_cuda_devices) != n_parallel_workers
     try:
         Parallel._torchrpc_args_parser(n_parallel_workers=8, node_ids=[1], local_cuda_devices=[1, 2, 3])[0]
     except RuntimeError as e:
-        logging.debug("[Pass] 3. len(local_cuda_devices) != n_parallel_workers.")
+        print("[Pass] 3. len(local_cuda_devices) != n_parallel_workers.")
     else:
         assert False
 
@@ -175,7 +171,7 @@ def torchrpc_args_parser(rank):
     try:
         Parallel._torchrpc_args_parser(n_parallel_workers=999, node_ids=[1], use_cuda=True)[0]
     except RuntimeError as e:
-        logging.debug("[Pass] 4. n_parallel_workers > gpu_nums.")
+        print("[Pass] 4. n_parallel_workers > gpu_nums.")
     else:
         assert False
 
@@ -186,8 +182,7 @@ def torchrpc_args_parser(rank):
     assert params['device_maps'].peer_name_list == ["Node_0", "Node_0", "Node_1"]
     assert params['device_maps'].our_device_list == [0, 1, 1]
     assert params['device_maps'].peer_device_list == [0, 2, 4]
-    # logging.debug(params['device_maps'])
-    logging.debug("[Pass] 5. Set custom device map.")
+    print("[Pass] 5. Set custom device map.")
 
     # 6. Set n_parallel_workers > 1
     params = Parallel._torchrpc_args_parser(n_parallel_workers=8, node_ids=[1])
@@ -201,7 +196,7 @@ def torchrpc_args_parser(rank):
     params = Parallel._torchrpc_args_parser(n_parallel_workers=2, node_ids=[1], use_cuda=True)
     assert params[0]['use_cuda']
     assert len(params[0]['device_maps'].peer_name_list) == DEFAULT_DEVICE_MAP_NUMS - 1
-    logging.debug("[Pass] 6. Set n_parallel_workers > 1.")
+    print("[Pass] 6. Set n_parallel_workers > 1.")
 
 
 @pytest.mark.unittest

pytest.ini

Lines changed: 2 additions & 0 deletions
@@ -10,5 +10,7 @@ markers =
     envpooltest
     other
     tmp
+    multiprocesstest
+    mqbenchmark
 
 norecursedirs = ding/hpc_rl/tests
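
Registering multiprocesstest and mqbenchmark here keeps pytest from warning about unknown marks on the newly enabled decorators (and from erroring under --strict-markers). Registered markers can also be stacked, so a single test can belong to several CI suites, as the perf tests above now do. A minimal sketch (test name illustrative):

    import pytest
    import torch

    # Sketch: a test carrying several registered markers is collected by
    # make cudatest, make multiprocesstest, and mqbenchmark runs alike.
    @pytest.mark.mqbenchmark
    @pytest.mark.cudatest
    @pytest.mark.multiprocesstest
    def test_example_perf():
        if torch.cuda.is_available():
            pass  # benchmark body would go here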
