mirror of
https://github.com/modelscope/modelscope.git
synced 2026-05-18 05:05:00 +02:00
[to #46106568]feat: parallel run ci case
This commit is contained in:
@@ -1,6 +1,3 @@
|
||||
echo "Testing envs"
|
||||
printenv
|
||||
echo "ENV END"
|
||||
if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
|
||||
pip install -r requirements/tests.txt
|
||||
git config --global --add safe.directory /Maas-lib
|
||||
@@ -23,7 +20,7 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
|
||||
awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
|
||||
awk -F: '/^[^#]/ { print $1 }' requirements/nlp.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
|
||||
awk -F: '/^[^#]/ { print $1 }' requirements/science.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
|
||||
pip install -r requirements/tests.txt
|
||||
|
||||
# test with install
|
||||
python setup.py install
|
||||
else
|
||||
|
||||
@@ -3,30 +3,32 @@ MODELSCOPE_CACHE_DIR_IN_CONTAINER=/modelscope_cache
|
||||
CODE_DIR=$PWD
|
||||
CODE_DIR_IN_CONTAINER=/Maas-lib
|
||||
echo "$USER"
|
||||
gpus='7 6 5 4 3 2 1 0'
|
||||
cpu_sets='0-7 8-15 16-23 24-30 31-37 38-44 45-51 52-58'
|
||||
gpus='0,1 2,3 4,5 6,7'
|
||||
cpu_sets='45-58 31-44 16-30 0-15'
|
||||
cpu_sets_arr=($cpu_sets)
|
||||
is_get_file_lock=false
|
||||
# export RUN_CASE_COMMAND='python tests/run.py --run_config tests/run_config.yaml'
|
||||
CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_BASE_COMMAND}
|
||||
CI_COMMAND='bash .dev_scripts/ci_container_test.sh python tests/run.py --parallel 2 --run_config tests/run_config.yaml'
|
||||
echo "ci command: $CI_COMMAND"
|
||||
idx=0
|
||||
for gpu in $gpus
|
||||
do
|
||||
exec {lock_fd}>"/tmp/gpu$gpu" || exit 1
|
||||
flock -n "$lock_fd" || { echo "WARN: gpu $gpu is in use!" >&2; continue; }
|
||||
flock -n "$lock_fd" || { echo "WARN: gpu $gpu is in use!" >&2; idx=$((idx+1)); continue; }
|
||||
echo "get gpu lock $gpu"
|
||||
CONTAINER_NAME="modelscope-ci-$gpu"
|
||||
|
||||
CONTAINER_NAME="modelscope-ci-$idx"
|
||||
let is_get_file_lock=true
|
||||
|
||||
# pull the image in case there is an update
|
||||
docker pull ${IMAGE_NAME}:${IMAGE_VERSION}
|
||||
if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
|
||||
echo 'debugging'
|
||||
docker run --rm --name $CONTAINER_NAME --shm-size=16gb \
|
||||
--cpuset-cpus=${cpu_sets_arr[$gpu]} \
|
||||
--gpus="device=$gpu" \
|
||||
--cpuset-cpus=${cpu_sets_arr[$idx]} \
|
||||
--gpus='"'"device=$gpu"'"' \
|
||||
-v $CODE_DIR:$CODE_DIR_IN_CONTAINER \
|
||||
-v $MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER \
|
||||
-v $MODELSCOPE_HOME_CACHE/$gpu:/root \
|
||||
-v $MODELSCOPE_HOME_CACHE/$idx:/root \
|
||||
-v /home/admin/pre-commit:/home/admin/pre-commit \
|
||||
-e CI_TEST=True \
|
||||
-e TEST_LEVEL=$TEST_LEVEL \
|
||||
@@ -41,16 +43,15 @@ do
|
||||
-e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
|
||||
-e MODEL_TAG_URL=$MODEL_TAG_URL \
|
||||
--workdir=$CODE_DIR_IN_CONTAINER \
|
||||
--net host \
|
||||
${IMAGE_NAME}:${IMAGE_VERSION} \
|
||||
$CI_COMMAND
|
||||
else
|
||||
docker run --rm --name $CONTAINER_NAME --shm-size=16gb \
|
||||
--cpuset-cpus=${cpu_sets_arr[$gpu]} \
|
||||
--gpus="device=$gpu" \
|
||||
--cpuset-cpus=${cpu_sets_arr[$idx]} \
|
||||
--gpus='"'"device=$gpu"'"' \
|
||||
-v $CODE_DIR:$CODE_DIR_IN_CONTAINER \
|
||||
-v $MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER \
|
||||
-v $MODELSCOPE_HOME_CACHE/$gpu:/root \
|
||||
-v $MODELSCOPE_HOME_CACHE/$idx:/root \
|
||||
-v /home/admin/pre-commit:/home/admin/pre-commit \
|
||||
-e CI_TEST=True \
|
||||
-e TEST_LEVEL=$TEST_LEVEL \
|
||||
@@ -64,7 +65,6 @@ do
|
||||
-e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
|
||||
-e MODEL_TAG_URL=$MODEL_TAG_URL \
|
||||
--workdir=$CODE_DIR_IN_CONTAINER \
|
||||
--net host \
|
||||
${IMAGE_NAME}:${IMAGE_VERSION} \
|
||||
$CI_COMMAND
|
||||
fi
|
||||
|
||||
@@ -20,7 +20,6 @@ class MogFaceDetector(TorchModel):
|
||||
|
||||
def __init__(self, model_path, device='cuda'):
|
||||
super().__init__(model_path)
|
||||
torch.set_grad_enabled(False)
|
||||
cudnn.benchmark = True
|
||||
self.model_path = model_path
|
||||
self.device = device
|
||||
|
||||
@@ -21,7 +21,6 @@ class MtcnnFaceDetector(TorchModel):
|
||||
|
||||
def __init__(self, model_path, device='cuda'):
|
||||
super().__init__(model_path)
|
||||
torch.set_grad_enabled(False)
|
||||
cudnn.benchmark = True
|
||||
self.model_path = model_path
|
||||
self.device = device
|
||||
|
||||
@@ -18,7 +18,6 @@ class RetinaFaceDetection(TorchModel):
|
||||
|
||||
def __init__(self, model_path, device='cuda'):
|
||||
super().__init__(model_path)
|
||||
torch.set_grad_enabled(False)
|
||||
cudnn.benchmark = True
|
||||
self.model_path = model_path
|
||||
self.cfg = Config.from_file(
|
||||
|
||||
@@ -24,7 +24,6 @@ class UlfdFaceDetector(TorchModel):
|
||||
|
||||
def __init__(self, model_path, device='cuda'):
|
||||
super().__init__(model_path)
|
||||
torch.set_grad_enabled(False)
|
||||
cudnn.benchmark = True
|
||||
self.model_path = model_path
|
||||
self.device = device
|
||||
|
||||
@@ -24,7 +24,6 @@ class FacialExpressionRecognition(TorchModel):
|
||||
|
||||
def __init__(self, model_path, device='cuda'):
|
||||
super().__init__(model_path)
|
||||
torch.set_grad_enabled(False)
|
||||
cudnn.benchmark = True
|
||||
self.model_path = model_path
|
||||
self.device = device
|
||||
|
||||
@@ -31,7 +31,6 @@ cfg_re50 = {
|
||||
class RetinaFaceDetection(object):
|
||||
|
||||
def __init__(self, model_path, device='cuda'):
|
||||
torch.set_grad_enabled(False)
|
||||
cudnn.benchmark = True
|
||||
self.model_path = model_path
|
||||
self.device = device
|
||||
|
||||
136
tests/run.py
136
tests/run.py
@@ -3,11 +3,13 @@
|
||||
|
||||
import argparse
|
||||
import datetime
|
||||
import math
|
||||
import multiprocessing
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import unittest
|
||||
from fnmatch import fnmatch
|
||||
from multiprocessing.managers import BaseManager
|
||||
@@ -158,6 +160,21 @@ def run_command_with_popen(cmd):
|
||||
sys.stdout.write(line)
|
||||
|
||||
|
||||
def async_run_command_with_popen(cmd, device_id):
    """Launch ``cmd`` as a child process without waiting for it to finish.

    The child receives a copy of the current environment in which
    ``CUDA_VISIBLE_DEVICES`` is set to ``device_id``. Its stderr is merged
    into stdout, and stdout is exposed as a line-buffered utf8 text pipe.

    Args:
        cmd: Command passed unchanged to ``subprocess.Popen``.
        device_id: Value written to ``CUDA_VISIBLE_DEVICES`` for the child;
            also used as the worker id in the log line.

    Returns:
        The started ``subprocess.Popen`` object.
    """
    logger.info('Worker id: %s args: %s' % (device_id, cmd))

    # Work on a private copy so the parent's environment stays untouched.
    child_env = dict(os.environ)
    child_env['CUDA_VISIBLE_DEVICES'] = '%s' % device_id

    popen_options = dict(
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr with stdout
        bufsize=1,
        universal_newlines=True,
        env=child_env,
        encoding='utf8',
    )
    return subprocess.Popen(cmd, **popen_options)
|
||||
|
||||
|
||||
def save_test_result(df, args):
|
||||
if args.result_dir is not None:
|
||||
file_name = str(int(datetime.datetime.now().timestamp() * 1000))
|
||||
@@ -199,6 +216,108 @@ def install_requirements(requirements):
|
||||
run_command(cmd)
|
||||
|
||||
|
||||
def wait_for_free_worker(workers):
    """Poll the worker slots until one is available and return its index.

    While waiting, output produced by the scanned workers is relayed to
    ``sys.stdout``. A worker that has exited has its remaining output
    relayed before its slot is released, so the tail of its log is not lost.

    Args:
        workers: List of slots. Each entry is either ``None`` (free) or a
            ``subprocess.Popen`` whose ``stdout`` is a text pipe. The pipe
            is expected to be non-blocking (the caller in this file sets
            that with ``os.set_blocking``); otherwise relaying output from
            a running worker blocks until that worker closes its stdout.

    Returns:
        Index of a free slot. If the slot belonged to a finished worker it
        has been reset to ``None`` in ``workers``.
    """
    while True:
        for idx, worker in enumerate(workers):
            if worker is None:
                logger.info('return free worker: %s' % (idx))
                return idx

            # Check liveness BEFORE reading: if the process has already
            # exited, everything it wrote is in the pipe and the read
            # loop below picks up all of it.
            finished = worker.poll() is not None

            # Relay whatever is currently available; iter() stops at the
            # first empty read (no data right now, or EOF).
            for line in iter(worker.stdout.readline, ''):
                sys.stdout.write(line)

            if finished:
                logger.info('Process end: %s' % (idx))
                workers[idx] = None
                return idx
        time.sleep(0.001)
|
||||
|
||||
|
||||
def wait_for_workers(workers):
    """Block until every worker in ``workers`` has exited.

    Output from the workers is relayed to ``sys.stdout`` while waiting.
    When a worker is found to have exited, its remaining output is relayed
    before its slot is cleared, so the tail of its log is not lost.

    Args:
        workers: List of slots. Each entry is either ``None`` or a
            ``subprocess.Popen`` whose ``stdout`` is a text pipe. The pipe
            is expected to be non-blocking (the caller in this file sets
            that with ``os.set_blocking``); otherwise relaying output from
            a running worker blocks until that worker closes its stdout.
            Slots of finished workers are set to ``None`` in place.
    """
    while True:
        for idx, worker in enumerate(workers):
            if worker is None:
                continue

            # Check liveness BEFORE reading: if the process has already
            # exited, everything it wrote is in the pipe and the read
            # loop below picks up all of it.
            finished = worker.poll() is not None

            # Relay whatever is currently available; iter() stops at the
            # first empty read (no data right now, or EOF).
            for line in iter(worker.stdout.readline, ''):
                sys.stdout.write(line)

            if finished:
                logger.info('Process idx: %s end!' % (idx))
                workers[idx] = None

        if all(worker is None for worker in workers):
            logger.info('All sub porcess is completed!')
            break
        time.sleep(0.001)
|
||||
|
||||
|
||||
def parallel_run_case_in_env(env_name, env, test_suite_env_map, isolated_cases,
                             result_dir, parallel):
    """Run all test suites mapped to ``env_name`` on up to ``parallel`` workers.

    Each isolated suite is started in its own ``tests/run.py`` subprocess.
    The remaining suites of this env are split into roughly equal chunks,
    at most ``parallel`` of them, and each chunk runs in one subprocess.
    The function returns only after every started worker has exited, even
    when there are no remaining (non-isolated) suites.

    Args:
        env_name: Name of the env whose suites should be run.
        env: Env configuration mapping; its optional ``requirements`` and
            ``dependencies`` entries are handed to ``install_requirements``
            and ``install_packages`` before any case is started.
        test_suite_env_map: Mapping of test suite file name -> env name.
        isolated_cases: Suite file names that each get a dedicated process.
        result_dir: Directory passed to the workers via ``--result_dir``.
        parallel: Number of worker slots; the slot index is passed to
            ``async_run_command_with_popen`` as the device id.
    """
    logger.info('Running case in env: %s' % env_name)
    # install requirements and deps # run_config['envs'][env]
    if 'requirements' in env:
        install_requirements(env['requirements'])
    if 'dependencies' in env:
        install_packages(env['dependencies'])

    # case worker processes, one slot per parallel worker
    worker_processes = [None] * parallel

    def _launch(cmd):
        # Wait for a slot, start the command on it, and switch its stdout
        # to non-blocking so the wait loops can poll output without stalling.
        worker_idx = wait_for_free_worker(worker_processes)
        worker_process = async_run_command_with_popen(cmd, worker_idx)
        os.set_blocking(worker_process.stdout.fileno(), False)
        worker_processes[worker_idx] = worker_process

    for test_suite_file in isolated_cases:  # run case in subprocess
        if test_suite_file in test_suite_env_map and test_suite_env_map[
                test_suite_file] == env_name:
            _launch([
                'python',
                'tests/run.py',
                '--pattern',
                test_suite_file,
                '--result_dir',
                result_dir,
            ])
        # else: case not in run list for this env.

    # run remain cases in shared processes.
    remain_suite_files = [
        suite for suite, suite_env in test_suite_env_map.items()
        if suite not in isolated_cases and suite_env == env_name
    ]
    if remain_suite_files:
        # roughly split case in parallel
        part_count = math.ceil(len(remain_suite_files) / parallel)
        for start in range(0, len(remain_suite_files), part_count):
            suites_chunk = remain_suite_files[start:start + part_count]
            _launch([
                'python', 'tests/run.py', '--result_dir', result_dir,
                '--suites'
            ] + suites_chunk)

    # Always wait, including when there were no remaining suites: isolated
    # workers may still be running, and the caller reads result_dir as soon
    # as this function returns.
    wait_for_workers(worker_processes)
|
||||
|
||||
|
||||
def run_case_in_env(env_name, env, test_suite_env_map, isolated_cases,
|
||||
result_dir):
|
||||
# install requirements and deps # run_config['envs'][env]
|
||||
@@ -264,8 +383,9 @@ def run_in_subprocess(args):
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_result_dir:
|
||||
for env in set(test_suite_env_map.values()):
|
||||
run_case_in_env(env, run_config['envs'][env], test_suite_env_map,
|
||||
isolated_cases, temp_result_dir)
|
||||
parallel_run_case_in_env(env, run_config['envs'][env],
|
||||
test_suite_env_map, isolated_cases,
|
||||
temp_result_dir, args.parallel)
|
||||
|
||||
result_dfs = []
|
||||
result_path = Path(temp_result_dir)
|
||||
@@ -312,6 +432,10 @@ class TimeCostTextTestResult(TextTestResult):
|
||||
self.stream.writeln(
|
||||
'Test case: %s stop at: %s, cost time: %s(seconds)' %
|
||||
(test.test_full_name, test.stop_time, test.time_cost))
|
||||
if torch.cuda.is_available(
|
||||
) and test.time_cost > 5.0: # print nvidia-smi
|
||||
cmd = ['nvidia-smi']
|
||||
run_command_with_popen(cmd)
|
||||
super(TimeCostTextTestResult, self).stopTest(test)
|
||||
|
||||
def addSuccess(self, test):
|
||||
@@ -383,6 +507,8 @@ def main(args):
|
||||
os.path.abspath(args.test_dir), args.pattern, args.list_tests)
|
||||
if not args.list_tests:
|
||||
result = runner.run(test_suite)
|
||||
logger.info('Running case completed, pid: %s, suites: %s' %
|
||||
(os.getpid(), args.suites))
|
||||
result = collect_test_results(result)
|
||||
df = test_cases_result_to_df(result)
|
||||
if args.result_dir is not None:
|
||||
@@ -417,6 +543,12 @@ if __name__ == '__main__':
|
||||
'--result_dir',
|
||||
default=None,
|
||||
help='Save result to directory, internal use only')
|
||||
parser.add_argument(
|
||||
'--parallel',
|
||||
default=1,
|
||||
type=int,
|
||||
help='Set case parallels, default single process, set with gpu number.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--suites',
|
||||
nargs='*',
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# isolate cases in env, we can install different dependencies in each env.
|
||||
isolated: # test cases that may require an excessive amount of GPU memory, which will be executed in a dedicated process.
|
||||
isolated: # test cases that may require an excessive amount of GPU memory or run for a long time, which will be executed in a dedicated process.
|
||||
- test_text_to_speech.py
|
||||
- test_multi_modal_embedding.py
|
||||
- test_ofa_tasks.py
|
||||
@@ -12,6 +12,33 @@ isolated: # test cases that may require excessive anmount of GPU memory, which
|
||||
- test_segmentation_pipeline.py
|
||||
- test_image_inpainting.py
|
||||
- test_mglm_text_summarization.py
|
||||
- test_team_transfer_trainer.py
|
||||
- test_image_denoise_trainer.py
|
||||
- test_dialog_intent_trainer.py
|
||||
- test_finetune_mplug.py
|
||||
- test_image_instance_segmentation_trainer.py
|
||||
- test_image_portrait_enhancement_trainer.py
|
||||
- test_translation_trainer.py
|
||||
- test_unifold.py
|
||||
- test_automatic_post_editing.py
|
||||
- test_mplug_tasks.py
|
||||
- test_movie_scene_segmentation.py
|
||||
- test_body_3d_keypoints.py
|
||||
- test_finetune_text_generation.py
|
||||
- test_clip_trainer.py
|
||||
- test_ofa_trainer.py
|
||||
- test_fill_mask.py
|
||||
- test_hand_2d_keypoints.py
|
||||
- test_referring_video_object_segmentation.py
|
||||
- test_easycv_trainer_hand_2d_keypoints.py
|
||||
- test_card_detection_scrfd_trainer.py
|
||||
- test_referring_video_object_segmentation_trainer.py
|
||||
- test_person_image_cartoon.py
|
||||
- test_image_style_transfer.py
|
||||
- test_ocr_detection.py
|
||||
- test_automatic_speech_recognition.py
|
||||
- test_image_matting.py
|
||||
- test_skin_retouching.py
|
||||
|
||||
envs:
|
||||
default: # default env, case not in other env will in default, pytorch.
|
||||
|
||||
@@ -94,7 +94,7 @@ class TestDialogIntentTrainer(unittest.TestCase):
|
||||
cfg.Model.update(config['Model'])
|
||||
if self.debugging:
|
||||
cfg.Trainer.save_checkpoint = False
|
||||
cfg.Trainer.num_epochs = 5
|
||||
cfg.Trainer.num_epochs = 1
|
||||
cfg.Trainer.batch_size_label = 64
|
||||
return cfg
|
||||
|
||||
|
||||
Reference in New Issue
Block a user