[to #46106568]feat: parallel run ci case

2026-05-18 05:05:00 +02:00 · 2022-11-17 08:51:23 +08:00
parent 10926a06d4
commit 90a5efa1c2
11 changed files with 178 additions and 28 deletions
--- a/.dev_scripts/ci_container_test.sh
+++ b/.dev_scripts/ci_container_test.sh
@@ -1,6 +1,3 @@
-echo "Testing envs"
-printenv
-echo "ENV END"
 if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
    pip install -r requirements/tests.txt
    git config --global --add safe.directory /Maas-lib
@@ -23,7 +20,7 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
    awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
    awk -F: '/^[^#]/ { print $1 }' requirements/nlp.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
    awk -F: '/^[^#]/ { print $1 }' requirements/science.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-     pip install -r requirements/tests.txt
+
    # test with install
    python setup.py install
 else
--- a/.dev_scripts/dockerci.sh
+++ b/.dev_scripts/dockerci.sh
@@ -3,30 +3,32 @@ MODELSCOPE_CACHE_DIR_IN_CONTAINER=/modelscope_cache
 CODE_DIR=$PWD
 CODE_DIR_IN_CONTAINER=/Maas-lib
 echo "$USER"
-gpus='7 6 5 4 3 2 1 0'
-cpu_sets='0-7 8-15 16-23 24-30 31-37 38-44 45-51 52-58'
+gpus='0,1 2,3 4,5 6,7'
+cpu_sets='45-58 31-44 16-30 0-15'
 cpu_sets_arr=($cpu_sets)
 is_get_file_lock=false
-# export RUN_CASE_COMMAND='python tests/run.py --run_config tests/run_config.yaml'
-CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_BASE_COMMAND}
+CI_COMMAND='bash .dev_scripts/ci_container_test.sh python tests/run.py --parallel 2 --run_config tests/run_config.yaml'
 echo "ci command: $CI_COMMAND"
+idx=0
 for gpu in $gpus
 do
  exec {lock_fd}>"/tmp/gpu$gpu" || exit 1
-  flock -n "$lock_fd" || { echo "WARN: gpu $gpu is in use!" >&2; continue; }
+  flock -n "$lock_fd" || { echo "WARN: gpu $gpu is in use!" >&2; idx=$((idx+1)); continue; }
  echo "get gpu lock $gpu"
-  CONTAINER_NAME="modelscope-ci-$gpu"
+
+  CONTAINER_NAME="modelscope-ci-$idx"
  let is_get_file_lock=true

  # pull image if there are update
  docker pull ${IMAGE_NAME}:${IMAGE_VERSION}
  if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
+    echo 'debugging'
    docker run --rm --name $CONTAINER_NAME --shm-size=16gb \
-              --cpuset-cpus=${cpu_sets_arr[$gpu]} \
-              --gpus="device=$gpu" \
+              --cpuset-cpus=${cpu_sets_arr[$idx]} \
+              --gpus='"'"device=$gpu"'"' \
              -v $CODE_DIR:$CODE_DIR_IN_CONTAINER \
              -v $MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER \
-              -v $MODELSCOPE_HOME_CACHE/$gpu:/root \
+              -v $MODELSCOPE_HOME_CACHE/$idx:/root \
              -v /home/admin/pre-commit:/home/admin/pre-commit \
              -e CI_TEST=True \
              -e TEST_LEVEL=$TEST_LEVEL \
@@ -41,16 +43,15 @@ do
              -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
              -e MODEL_TAG_URL=$MODEL_TAG_URL \
              --workdir=$CODE_DIR_IN_CONTAINER \
-              --net host  \
              ${IMAGE_NAME}:${IMAGE_VERSION} \
              $CI_COMMAND
  else
    docker run --rm --name $CONTAINER_NAME --shm-size=16gb \
-              --cpuset-cpus=${cpu_sets_arr[$gpu]} \
-              --gpus="device=$gpu" \
+              --cpuset-cpus=${cpu_sets_arr[$idx]} \
+              --gpus='"'"device=$gpu"'"' \
              -v $CODE_DIR:$CODE_DIR_IN_CONTAINER \
              -v $MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER \
-              -v $MODELSCOPE_HOME_CACHE/$gpu:/root \
+              -v $MODELSCOPE_HOME_CACHE/$idx:/root \
              -v /home/admin/pre-commit:/home/admin/pre-commit \
              -e CI_TEST=True \
              -e TEST_LEVEL=$TEST_LEVEL \
@@ -64,7 +65,6 @@ do
              -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
              -e MODEL_TAG_URL=$MODEL_TAG_URL \
              --workdir=$CODE_DIR_IN_CONTAINER \
-              --net host  \
              ${IMAGE_NAME}:${IMAGE_VERSION} \
              $CI_COMMAND
  fi
--- a/modelscope/models/cv/face_detection/mogface/models/detectors.py
+++ b/modelscope/models/cv/face_detection/mogface/models/detectors.py
@@ -20,7 +20,6 @@ class MogFaceDetector(TorchModel):

    def __init__(self, model_path, device='cuda'):
        super().__init__(model_path)
-        torch.set_grad_enabled(False)
        cudnn.benchmark = True
        self.model_path = model_path
        self.device = device
--- a/modelscope/models/cv/face_detection/mtcnn/models/detector.py
+++ b/modelscope/models/cv/face_detection/mtcnn/models/detector.py
@@ -21,7 +21,6 @@ class MtcnnFaceDetector(TorchModel):

    def __init__(self, model_path, device='cuda'):
        super().__init__(model_path)
-        torch.set_grad_enabled(False)
        cudnn.benchmark = True
        self.model_path = model_path
        self.device = device
--- a/modelscope/models/cv/face_detection/retinaface/detection.py
+++ b/modelscope/models/cv/face_detection/retinaface/detection.py
@@ -18,7 +18,6 @@ class RetinaFaceDetection(TorchModel):

    def __init__(self, model_path, device='cuda'):
        super().__init__(model_path)
-        torch.set_grad_enabled(False)
        cudnn.benchmark = True
        self.model_path = model_path
        self.cfg = Config.from_file(
--- a/modelscope/models/cv/face_detection/ulfd_slim/detection.py
+++ b/modelscope/models/cv/face_detection/ulfd_slim/detection.py
@@ -24,7 +24,6 @@ class UlfdFaceDetector(TorchModel):

    def __init__(self, model_path, device='cuda'):
        super().__init__(model_path)
-        torch.set_grad_enabled(False)
        cudnn.benchmark = True
        self.model_path = model_path
        self.device = device
--- a/modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py
+++ b/modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py
@@ -24,7 +24,6 @@ class FacialExpressionRecognition(TorchModel):

    def __init__(self, model_path, device='cuda'):
        super().__init__(model_path)
-        torch.set_grad_enabled(False)
        cudnn.benchmark = True
        self.model_path = model_path
        self.device = device
--- a/modelscope/models/cv/image_portrait_enhancement/retinaface/detection.py
+++ b/modelscope/models/cv/image_portrait_enhancement/retinaface/detection.py
@@ -31,7 +31,6 @@ cfg_re50 = {
 class RetinaFaceDetection(object):

    def __init__(self, model_path, device='cuda'):
-        torch.set_grad_enabled(False)
        cudnn.benchmark = True
        self.model_path = model_path
        self.device = device
--- a/tests/run.py
+++ b/tests/run.py
@@ -3,11 +3,13 @@

 import argparse
 import datetime
+import math
 import multiprocessing
 import os
 import subprocess
 import sys
 import tempfile
+import time
 import unittest
 from fnmatch import fnmatch
 from multiprocessing.managers import BaseManager
@@ -158,6 +160,21 @@ def run_command_with_popen(cmd):
            sys.stdout.write(line)


+def async_run_command_with_popen(cmd, device_id):
+    """Start cmd as a child process and return without waiting for it.
+
+    The child inherits os.environ with CUDA_VISIBLE_DEVICES set to
+    device_id; its stderr is merged into a line-buffered utf8 stdout pipe.
+    Returns the subprocess.Popen object.
+    """
+    logger.info('Worker id: %s args: %s' % (device_id, cmd))
+    child_env = dict(os.environ, CUDA_VISIBLE_DEVICES='%s' % device_id)
+    popen_kwargs = dict(stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                        bufsize=1, universal_newlines=True, env=child_env,
+                        encoding='utf8')
+    return subprocess.Popen(cmd, **popen_kwargs)
+
+
 def save_test_result(df, args):
    if args.result_dir is not None:
        file_name = str(int(datetime.datetime.now().timestamp() * 1000))
@@ -199,6 +216,108 @@ def install_requirements(requirements):
        run_command(cmd)


+def wait_for_free_worker(workers):
+    """Poll workers until a slot is free (None or exited); return its index."""
+    while True:
+        for idx, worker in enumerate(workers):
+            if worker is None:
+                logger.info('return free worker: %s' % (idx))
+                return idx
+            # Poll before draining so output written right before the
+            # process exits is echoed instead of being left in the pipe.
+            is_running = worker.poll() is None
+            for line in iter(worker.stdout.readline, ''):
+                sys.stdout.write(line)
+            if not is_running:  # worker process completed.
+                logger.info('Process end: %s' % (idx))
+                workers[idx] = None
+                return idx
+        time.sleep(0.001)
+
+
+def wait_for_workers(workers):
+    """Block until every worker process has exited, echoing their output.
+
+    Args:
+        workers: list whose items are None (empty slot) or Popen objects
+            with a non-blocking text stdout. Each finished worker's slot is
+            set to None in place.
+    """
+    while True:
+        for idx, worker in enumerate(workers):
+            if worker is None:
+                continue
+            # Poll before draining so output written right before the
+            # process exits is echoed instead of being left in the pipe.
+            is_running = worker.poll() is None
+            for line in iter(worker.stdout.readline, ''):
+                sys.stdout.write(line)
+            if not is_running:
+                logger.info('Process idx: %s end!' % (idx))
+                workers[idx] = None
+
+        if all(worker is None for worker in workers):
+            logger.info('All sub porcess is completed!')
+            break
+        # brief pause between polling rounds
+        time.sleep(0.001)
+
+
+def parallel_run_case_in_env(env_name, env, test_suite_env_map, isolated_cases,
+                             result_dir, parallel):
+    """Run the test suites mapped to env_name in parallel subprocesses.
+
+    Args:
+        env_name: name of the env whose suites are run.
+        env: env config; its optional 'requirements' and 'dependencies'
+            entries are installed before any case is started.
+        test_suite_env_map: dict mapping test suite file name to env name.
+        isolated_cases: suite files that each run in their own subprocess.
+        result_dir: directory handed to every subprocess via --result_dir.
+        parallel: number of worker slots; a subprocess gets the index of
+            its slot as device id. Values below 1 are treated as 1.
+    """
+    logger.info('Running case in env: %s' % env_name)
+    # install requirements and deps # run_config['envs'][env]
+    if 'requirements' in env:
+        install_requirements(env['requirements'])
+    if 'dependencies' in env:
+        install_packages(env['dependencies'])
+    slot_count = max(1, parallel)  # at least one slot must exist
+    worker_processes = [None] * slot_count  # case worker processes
+
+    def start_worker(cmd):
+        # Wait for a free slot, then occupy it with a subprocess whose
+        # stdout is non-blocking so the wait helpers can poll its output.
+        worker_idx = wait_for_free_worker(worker_processes)
+        worker_process = async_run_command_with_popen(cmd, worker_idx)
+        os.set_blocking(worker_process.stdout.fileno(), False)
+        worker_processes[worker_idx] = worker_process
+
+    for test_suite_file in isolated_cases:  # run case in subprocess
+        if test_suite_file in test_suite_env_map and test_suite_env_map[
+                test_suite_file] == env_name:
+            start_worker([
+                'python', 'tests/run.py', '--pattern', test_suite_file,
+                '--result_dir', result_dir
+            ])
+
+    # run remain cases in a process.
+    remain_suite_files = [
+        k for k, v in test_suite_env_map.items()
+        if k not in isolated_cases and v == env_name
+    ]
+    # roughly split case in parallel; max() keeps the range step non-zero
+    # when nothing remains, so running isolated cases are still waited for
+    # instead of being abandoned by an early return.
+    part_count = max(1, math.ceil(len(remain_suite_files) / slot_count))
+    for x in range(0, len(remain_suite_files), part_count):
+        start_worker(['python', 'tests/run.py', '--result_dir', result_dir,
+                      '--suites'] + remain_suite_files[x:x + part_count])
+
+    wait_for_workers(worker_processes)
+
+
 def run_case_in_env(env_name, env, test_suite_env_map, isolated_cases,
                    result_dir):
    # install requirements and deps # run_config['envs'][env]
@@ -264,8 +383,9 @@ def run_in_subprocess(args):

    with tempfile.TemporaryDirectory() as temp_result_dir:
        for env in set(test_suite_env_map.values()):
-            run_case_in_env(env, run_config['envs'][env], test_suite_env_map,
-                            isolated_cases, temp_result_dir)
+            parallel_run_case_in_env(env, run_config['envs'][env],
+                                     test_suite_env_map, isolated_cases,
+                                     temp_result_dir, args.parallel)

        result_dfs = []
        result_path = Path(temp_result_dir)
@@ -312,6 +432,10 @@ class TimeCostTextTestResult(TextTestResult):
        self.stream.writeln(
            'Test case: %s stop at: %s, cost time: %s(seconds)' %
            (test.test_full_name, test.stop_time, test.time_cost))
+        if torch.cuda.is_available(
+        ) and test.time_cost > 5.0:  # print nvidia-smi
+            cmd = ['nvidia-smi']
+            run_command_with_popen(cmd)
        super(TimeCostTextTestResult, self).stopTest(test)

    def addSuccess(self, test):
@@ -383,6 +507,8 @@ def main(args):
            os.path.abspath(args.test_dir), args.pattern, args.list_tests)
    if not args.list_tests:
        result = runner.run(test_suite)
+        logger.info('Running case completed, pid: %s, suites: %s' %
+                    (os.getpid(), args.suites))
        result = collect_test_results(result)
        df = test_cases_result_to_df(result)
        if args.result_dir is not None:
@@ -417,6 +543,12 @@ if __name__ == '__main__':
        '--result_dir',
        default=None,
        help='Save result to directory, internal use only')
+    parser.add_argument(
+        '--parallel',
+        default=1,
+        type=int,
+        help='Set case parallels, default single process, set with gpu number.'
+    )
    parser.add_argument(
        '--suites',
        nargs='*',
--- a/tests/run_config.yaml
+++ b/tests/run_config.yaml
@@ -1,5 +1,5 @@
 # isolate cases in env, we can install different dependencies in each env.
-isolated:  # test cases that may require excessive anmount of GPU memory, which will be executed in dedicagted process.
+isolated:  # test cases that may require an excessive amount of GPU memory or run for a long time, which will be executed in a dedicated process.
  - test_text_to_speech.py
  - test_multi_modal_embedding.py
  - test_ofa_tasks.py
@@ -12,6 +12,33 @@ isolated:  # test cases that may require excessive anmount of GPU memory, which
  - test_segmentation_pipeline.py
  - test_image_inpainting.py
  - test_mglm_text_summarization.py
+  - test_team_transfer_trainer.py
+  - test_image_denoise_trainer.py
+  - test_dialog_intent_trainer.py
+  - test_finetune_mplug.py
+  - test_image_instance_segmentation_trainer.py
+  - test_image_portrait_enhancement_trainer.py
+  - test_translation_trainer.py
+  - test_unifold.py
+  - test_automatic_post_editing.py
+  - test_mplug_tasks.py
+  - test_movie_scene_segmentation.py
+  - test_body_3d_keypoints.py
+  - test_finetune_text_generation.py
+  - test_clip_trainer.py
+  - test_ofa_trainer.py
+  - test_fill_mask.py
+  - test_hand_2d_keypoints.py
+  - test_referring_video_object_segmentation.py
+  - test_easycv_trainer_hand_2d_keypoints.py
+  - test_card_detection_scrfd_trainer.py
+  - test_referring_video_object_segmentation_trainer.py
+  - test_person_image_cartoon.py
+  - test_image_style_transfer.py
+  - test_ocr_detection.py
+  - test_automatic_speech_recognition.py
+  - test_image_matting.py
+  - test_skin_retouching.py

 envs:
  default: # default env, case not in other env will in default, pytorch.
--- a/tests/trainers/test_dialog_intent_trainer.py
+++ b/tests/trainers/test_dialog_intent_trainer.py
@@ -94,7 +94,7 @@ class TestDialogIntentTrainer(unittest.TestCase):
        cfg.Model.update(config['Model'])
        if self.debugging:
            cfg.Trainer.save_checkpoint = False
-            cfg.Trainer.num_epochs = 5
+            cfg.Trainer.num_epochs = 1
            cfg.Trainer.batch_size_label = 64
        return cfg