diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh
index 98e9f88d..129a6c25 100644
--- a/.dev_scripts/ci_container_test.sh
+++ b/.dev_scripts/ci_container_test.sh
@@ -1,11 +1,9 @@
-pip install -r requirements.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-pip install -r requirements/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-pip install -r requirements/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-pip install -r requirements/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-pip install -r requirements/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+awk -F: '/^[^#]/ { print $1 }' requirements/framework.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+awk -F: '/^[^#]/ { print $1 }' requirements/audio.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+awk -F: '/^[^#]/ { print $1 }' requirements/cv.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+awk -F: '/^[^#]/ { print $1 }' requirements/nlp.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
 pip install -r requirements/tests.txt
-# install numpy<=1.18 for tensorflow==1.15.x
-pip install "numpy<=1.18"
 
 git config --global --add safe.directory /Maas-lib
 
@@ -19,4 +17,10 @@ fi
 # test with install
 python setup.py install
 
-python tests/run.py
+# Run the command given as script arguments, or the default test suite when none is given.
+ci_command="python tests/run.py --subprocess"
+if [ $# -gt 0 ]; then
+    ci_command="$*"
+fi
+echo "Running case with command: $ci_command"
+$ci_command  # intentionally unquoted: the joined string is re-split into command words
diff --git a/.dev_scripts/citest.sh b/.dev_scripts/citest.sh
deleted file mode 100644
index c6e0905f..00000000
--- a/.dev_scripts/citest.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-pip install -r requirements.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-pip install -r requirements/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-pip install -r requirements/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-pip install -r requirements/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-pip install -r requirements/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-
-pip install -r requirements/tests.txt
-# install numpy<=1.18 for tensorflow==1.15.x
-pip install "numpy<=1.18"
-
-# linter test
-# use internal project for pre-commit due to the network problem
-pre-commit run --all-files
-if [ $? -ne 0 ]; then
-    echo "linter test failed, please run 'pre-commit run --all-files' to check"
-    exit -1
-fi
-
-PYTHONPATH=. python tests/run.py
diff --git a/.dev_scripts/dockerci.sh b/.dev_scripts/dockerci.sh
index 383eb909..af94b211 100644
--- a/.dev_scripts/dockerci.sh
+++ b/.dev_scripts/dockerci.sh
@@ -7,7 +7,9 @@ gpus='7 6 5 4 3 2 1 0'
 cpu_sets='0-7 8-15 16-23 24-30 31-37 38-44 45-51 52-58'
 cpu_sets_arr=($cpu_sets)
 is_get_file_lock=false
-CI_COMMAND=${CI_COMMAND:-'bash .dev_scripts/ci_container_test.sh'}
+# export RUN_CASE_BASE_COMMAND='python tests/run.py --run_config tests/run_config.yaml'
+CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_BASE_COMMAND}
+echo "ci command: $CI_COMMAND"
 for gpu in $gpus
 do
   exec {lock_fd}>"/tmp/gpu$gpu" || exit 1
@@ -15,6 +17,7 @@ do
   echo "get gpu lock $gpu"
   CONTAINER_NAME="modelscope-ci-$gpu"
   let is_get_file_lock=true
+
   # pull image if there are update
   docker pull ${IMAGE_NAME}:${IMAGE_VERSION}
   docker run --rm --name $CONTAINER_NAME --shm-size=16gb \
@@ -32,10 +35,13 @@ do
              -e TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST \
              -e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \
              -e TEST_LEVEL=$TEST_LEVEL \
+             -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
+             -e MODEL_TAG_URL=$MODEL_TAG_URL \
              --workdir=$CODE_DIR_IN_CONTAINER \
              --net host  \
              ${IMAGE_NAME}:${IMAGE_VERSION} \
              $CI_COMMAND
+
   if [ $? -ne 0 ]; then
     echo "Running test case failed, please check the log!"
     exit -1
diff --git a/.gitattributes b/.gitattributes
index 60ff0dd2..1a3015ec 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -4,4 +4,6 @@
 *.wav filter=lfs diff=lfs merge=lfs -text
 *.JPEG filter=lfs diff=lfs merge=lfs -text
 *.jpeg filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
 *.avi filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index b88d734a..f7b9c7ea 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -25,4 +25,4 @@ python:
   install:
     - requirements: requirements/docs.txt
     - requirements: requirements/readthedocs.txt
-    - requirements: requirements/runtime.txt
+    - requirements: requirements/framework.txt
diff --git a/configs/cv/configuration.json b/configs/cv/configuration.json
index 2b0da89d..ae07fa10 100644
--- a/configs/cv/configuration.json
+++ b/configs/cv/configuration.json
@@ -2,7 +2,6 @@
     "framework": "pytorch",
 
     "task": "image_classification",
-    "work_dir": "./work_dir",
 
     "model": {
         "type": "classification",
@@ -119,6 +118,7 @@
     },
 
     "train": {
+        "work_dir": "./work_dir",
         "dataloader": {
             "batch_size_per_gpu": 2,
             "workers_per_gpu": 1
diff --git a/data/test/audios/1ch_nihaomiya.wav b/data/test/audios/1ch_nihaomiya.wav
new file mode 100644
index 00000000..4618d412
--- /dev/null
+++ b/data/test/audios/1ch_nihaomiya.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f7f5a0a4efca1e83463cb44460c66b56fb7cd673eb6da37924637bc05ef758d
+size 1440044
diff --git a/data/test/images/face_emotion.jpg b/data/test/images/face_emotion.jpg
new file mode 100644
index 00000000..54f22280
--- /dev/null
+++ b/data/test/images/face_emotion.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:712b5525e37080d33f62d6657609dbef20e843ccc04ee5c788ea11aa7c08545e
+size 123341
diff --git a/data/test/images/face_human_hand_detection.jpg b/data/test/images/face_human_hand_detection.jpg
new file mode 100644
index 00000000..f94bb547
--- /dev/null
+++ b/data/test/images/face_human_hand_detection.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8fddc7be8381eb244cd692601f1c1e6cf3484b44bb4e73df0bc7de29352eb487
+size 23889
diff --git a/data/test/images/facial_expression_recognition.jpg b/data/test/images/facial_expression_recognition.jpg
new file mode 100644
index 00000000..a943fa72
--- /dev/null
+++ b/data/test/images/facial_expression_recognition.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bdb1cef5a5fd5f938a856311011c4820ddc45946a470b9929c61e59b6a065633
+size 161535
diff --git a/data/test/images/hand_keypoints.jpg b/data/test/images/hand_keypoints.jpg
new file mode 100644
index 00000000..cb445c26
--- /dev/null
+++ b/data/test/images/hand_keypoints.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c05d58edee7398de37b8e479410676d6b97cfde69cc003e8356a348067e71988
+size 7750
diff --git a/data/test/images/hand_static.jpg b/data/test/images/hand_static.jpg
new file mode 100644
index 00000000..43ae28b1
--- /dev/null
+++ b/data/test/images/hand_static.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94b8e281d77ee6d3ea2a8a0c9408ecdbd29fe75f33ea5399b6ea00070ba77bd6
+size 13090
diff --git a/data/test/images/image-text-retrieval.jpg b/data/test/images/image-text-retrieval.jpg
new file mode 100644
index 00000000..2d20374a
--- /dev/null
+++ b/data/test/images/image-text-retrieval.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b012c7e966f6550874ccb85ef9602d483aa89b8623dff9ffcdb0faab8f2ca9ab
+size 218143
diff --git a/data/test/images/image_panoptic_segmentation.jpg b/data/test/images/image_panoptic_segmentation.jpg
new file mode 100644
index 00000000..2a8d826b
--- /dev/null
+++ b/data/test/images/image_panoptic_segmentation.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59b1da30af12f76b691990363e0d221050a59cf53fc4a97e776bcb00228c6c2a
+size 245864
diff --git a/data/test/images/image_segmentation.jpg b/data/test/images/image_segmentation.jpg
new file mode 100644
index 00000000..a9c0875c
--- /dev/null
+++ b/data/test/images/image_segmentation.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af6fa61274e497ecc170de5adc4b8e7ac89eba2bc22a6aa119b08ec7adbe9459
+size 146140
diff --git a/data/test/images/image_semantic_segmentation.jpg b/data/test/images/image_semantic_segmentation.jpg
new file mode 100644
index 00000000..2a8d826b
--- /dev/null
+++ b/data/test/images/image_semantic_segmentation.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59b1da30af12f76b691990363e0d221050a59cf53fc4a97e776bcb00228c6c2a
+size 245864
diff --git a/data/test/images/keypoints_detect/test_img_face_2d_keypoints.png b/data/test/images/keypoints_detect/test_img_face_2d_keypoints.png
new file mode 100644
index 00000000..00311c33
--- /dev/null
+++ b/data/test/images/keypoints_detect/test_img_face_2d_keypoints.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:331ead75033fa2f01f6be72a2f8e34d581fcb593308067815d4bb136bb13b766
+size 54390
diff --git a/data/test/images/mog_face_detection.jpg b/data/test/images/mog_face_detection.jpg
new file mode 100644
index 00000000..c95881fe
--- /dev/null
+++ b/data/test/images/mog_face_detection.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9
+size 87228
diff --git a/data/test/images/mtcnn_face_detection.jpg b/data/test/images/mtcnn_face_detection.jpg
new file mode 100644
index 00000000..c95881fe
--- /dev/null
+++ b/data/test/images/mtcnn_face_detection.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9
+size 87228
diff --git a/data/test/images/multimodal_similarity.jpg b/data/test/images/multimodal_similarity.jpg
new file mode 100644
index 00000000..70a2b844
--- /dev/null
+++ b/data/test/images/multimodal_similarity.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f24abbba43782d733dedbb0b4f416635af50263862e5632963ac9263e430555
+size 88542
diff --git a/data/test/images/product_segmentation.jpg b/data/test/images/product_segmentation.jpg
new file mode 100644
index 00000000..c188a69e
--- /dev/null
+++ b/data/test/images/product_segmentation.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a16038f7809127eb3e03cbae049592d193707e095309daca78f7d108d67fe4ec
+size 108357
diff --git a/data/test/images/retina_face_detection.jpg b/data/test/images/retina_face_detection.jpg
new file mode 100644
index 00000000..c95881fe
--- /dev/null
+++ b/data/test/images/retina_face_detection.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9
+size 87228
diff --git a/data/test/images/shop_segmentation.jpg b/data/test/images/shop_segmentation.jpg
new file mode 100644
index 00000000..ec02881d
--- /dev/null
+++ b/data/test/images/shop_segmentation.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5ecc371c8b0ca09d0e11df89bc549000937eafc451929586426fe657ade25a0
+size 238607
diff --git a/data/test/images/text_driven_segmentation.jpg b/data/test/images/text_driven_segmentation.jpg
new file mode 100644
index 00000000..e3320b1f
--- /dev/null
+++ b/data/test/images/text_driven_segmentation.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c7d2f279e3b317f1d0de18410a0585e122166fa2464c17b88a0c813f6c58bd4
+size 67861
diff --git a/data/test/images/ulfd_face_detection.jpg b/data/test/images/ulfd_face_detection.jpg
new file mode 100644
index 00000000..c95881fe
--- /dev/null
+++ b/data/test/images/ulfd_face_detection.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9
+size 87228
diff --git a/data/test/regression/fill_mask_bert_zh.bin b/data/test/regression/fill_mask_bert_zh.bin
new file mode 100644
index 00000000..17c28b81
--- /dev/null
+++ b/data/test/regression/fill_mask_bert_zh.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:541183383bb06aa3ca2c44a68cd51c1be5e3e984a1dee2c58092b9552660f3ce
+size 61883
diff --git a/data/test/regression/fill_mask_sbert_en.bin b/data/test/regression/fill_mask_sbert_en.bin
new file mode 100644
index 00000000..09aaf300
--- /dev/null
+++ b/data/test/regression/fill_mask_sbert_en.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f0afcd9d2aa5ac9569114203bd9db4f1a520c903a88fd4854370cdde0e7eab7
+size 119940
diff --git a/data/test/regression/fill_mask_sbert_zh.bin b/data/test/regression/fill_mask_sbert_zh.bin
new file mode 100644
index 00000000..812f7ba2
--- /dev/null
+++ b/data/test/regression/fill_mask_sbert_zh.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4fd6fa6b23c2fdaf876606a767d9b64b1924e1acddfc06ac42db73ba86083280
+size 119940
diff --git a/data/test/regression/fill_mask_veco_en.bin b/data/test/regression/fill_mask_veco_en.bin
new file mode 100644
index 00000000..be3fddc8
--- /dev/null
+++ b/data/test/regression/fill_mask_veco_en.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d37672a0e299a08d2daf5c7fc29bfce96bb15701fe5e5e68f068861ac2ee705
+size 119619
diff --git a/data/test/regression/fill_mask_veco_zh.bin b/data/test/regression/fill_mask_veco_zh.bin
new file mode 100644
index 00000000..c0d27e20
--- /dev/null
+++ b/data/test/regression/fill_mask_veco_zh.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c692e0753cfe349e520511427727a8252f141fa10e85f9a61562845e8d731f9a
+size 119619
diff --git a/data/test/regression/sbert_nli.bin b/data/test/regression/sbert_nli.bin
new file mode 100644
index 00000000..a5f680bb
--- /dev/null
+++ b/data/test/regression/sbert_nli.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44e3925c15d86d8596baeb6bd1d153d86f57b7489798b2cf988a1248e110fd62
+size 62231
diff --git a/data/test/regression/sbert_sen_sim.bin b/data/test/regression/sbert_sen_sim.bin
new file mode 100644
index 00000000..a59cbe0b
--- /dev/null
+++ b/data/test/regression/sbert_sen_sim.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ff17a0272752de4c88d4254b2e881f97f8ef022f03609d03ee1de0ae964368a
+size 62235
diff --git a/data/test/regression/sbert_ws_en.bin b/data/test/regression/sbert_ws_en.bin
new file mode 100644
index 00000000..4eb562d6
--- /dev/null
+++ b/data/test/regression/sbert_ws_en.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9103ce2bc89212f67fb49ce70783b7667e376900d0f70fb8f5c4432eb74bc572
+size 60801
diff --git a/data/test/regression/sbert_ws_zh.bin b/data/test/regression/sbert_ws_zh.bin
new file mode 100644
index 00000000..555f640d
--- /dev/null
+++ b/data/test/regression/sbert_ws_zh.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d4dee34c7e83b77db04fb2f0d1200bfd37c7c24954c58e185da5cb96445975c
+size 60801
diff --git a/data/test/regression/sbert_zero_shot.bin b/data/test/regression/sbert_zero_shot.bin
new file mode 100644
index 00000000..23d40946
--- /dev/null
+++ b/data/test/regression/sbert_zero_shot.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e3ecc2c30d382641d561f84849b199c12bb1a9418e8099a191153f6f5275a85
+size 61589
diff --git a/data/test/videos/Walking.54138969.mp4 b/data/test/videos/Walking.54138969.mp4
new file mode 100644
index 00000000..d4355290
--- /dev/null
+++ b/data/test/videos/Walking.54138969.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7663f9a32ea57086bf66c4b9e9ebe0fd418986c67716c7be02ca917e72ddc0ba
+size 8155895
diff --git a/data/test/videos/action_detection_test_video.mp4 b/data/test/videos/action_detection_test_video.mp4
new file mode 100644
index 00000000..e2ea1d80
--- /dev/null
+++ b/data/test/videos/action_detection_test_video.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b7c3bc7c82ea5fee9d83130041df01046d89143ff77058b04577455ff6fdc92
+size 3191059
diff --git a/data/test/videos/mask_dir/mask_00000_00320.png b/data/test/videos/mask_dir/mask_00000_00320.png
new file mode 100644
index 00000000..2eae71a1
--- /dev/null
+++ b/data/test/videos/mask_dir/mask_00000_00320.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b158f6029d9763d7f84042f7c5835f398c688fdbb6b3f4fe6431101d4118c66c
+size 2766
diff --git a/data/test/videos/mask_dir/mask_00321_00633.png b/data/test/videos/mask_dir/mask_00321_00633.png
new file mode 100644
index 00000000..89633eb6
--- /dev/null
+++ b/data/test/videos/mask_dir/mask_00321_00633.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0dcf46b93077e2229ab69cd6ddb80e2689546c575ee538bb2033fee1124ef3e3
+size 2761
diff --git a/data/test/videos/movie_scene_segmentation_test_video.mp4 b/data/test/videos/movie_scene_segmentation_test_video.mp4
new file mode 100644
index 00000000..21ea3cb1
--- /dev/null
+++ b/data/test/videos/movie_scene_segmentation_test_video.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03002807dc2aa180c3ae104e764c7a4d6c421d186a5d552f97d338467ae6c443
+size 12722029
diff --git a/data/test/videos/video_inpainting_test.mp4 b/data/test/videos/video_inpainting_test.mp4
new file mode 100644
index 00000000..61f96fac
--- /dev/null
+++ b/data/test/videos/video_inpainting_test.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c9870df5a86acaaec67063183dace795479cd0f05296f13058995f475149c56
+size 2957783
diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu
index 97881007..a9a409b5 100644
--- a/docker/Dockerfile.ubuntu
+++ b/docker/Dockerfile.ubuntu
@@ -34,7 +34,8 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${a
     cp /tmp/resources/conda.tuna  ~/.condarc && \
     source /root/.bashrc && \
     conda install --yes python==${PYTHON_VERSION} && \
-    pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+    pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \
+    pip config set install.trusted-host pypi.tuna.tsinghua.edu.cn
 
 ARG USE_GPU=True
 
@@ -42,15 +43,15 @@ ARG USE_GPU=True
 ARG TORCH_VERSION=1.12.0
 ARG CUDATOOLKIT_VERSION=11.3
 RUN if [ "$USE_GPU" = "True" ] ; then \
-        conda install --yes pytorch==$TORCH_VERSION torchvision torchaudio cudatoolkit=$CUDATOOLKIT_VERSION -c pytorch && conda clean --yes --all; \
+        pip install --no-cache-dir torch==$TORCH_VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113; \
     else \
-        conda install pytorch==$TORCH_VERSION torchvision torchaudio cpuonly -c pytorch; \
+        pip install --no-cache-dir torch==$TORCH_VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu; \
     fi
 
 # install tensorflow
 ARG TENSORFLOW_VERSION=1.15.5
 RUN if [ "$USE_GPU" = "True" ] ; then \
-        pip install --no-cache-dir --use-deprecated=legacy-resolver tensorflow==$TENSORFLOW_VERSION -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \
+        pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \
     else \
         pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION; \
     fi
@@ -64,7 +65,7 @@ RUN if [ "$USE_GPU" = "True" ] ; then \
 # install modelscope
 COPY requirements /var/modelscope
 RUN pip install --no-cache-dir --upgrade pip && \
-    pip install --no-cache-dir -r /var/modelscope/runtime.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
+    pip install --no-cache-dir -r /var/modelscope/framework.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip install --no-cache-dir -r /var/modelscope/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip install --no-cache-dir -r /var/modelscope/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip install --no-cache-dir -r /var/modelscope/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
@@ -75,7 +76,7 @@ RUN pip install --no-cache-dir --upgrade pip && \
 ENV SHELL=/bin/bash
 
 # install special package
-RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 numpy==1.18.5 datasets==2.1.0
+RUN pip install --no-cache-dir 'mmcls>=0.21.0' 'mmdet>=2.25.0' 'decord>=0.6.0' datasets==2.1.0 numpy==1.18.5 ipykernel fairseq
 
 RUN if [ "$USE_GPU" = "True" ] ; then \
         pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \
diff --git a/modelscope/exporters/__init__.py b/modelscope/exporters/__init__.py
new file mode 100644
index 00000000..a597114f
--- /dev/null
+++ b/modelscope/exporters/__init__.py
@@ -0,0 +1,4 @@
+from .base import Exporter
+from .builder import build_exporter
+from .nlp import SbertForSequenceClassificationExporter
+from .torch_model_exporter import TorchModelExporter
diff --git a/modelscope/exporters/base.py b/modelscope/exporters/base.py
new file mode 100644
index 00000000..f19d2bbb
--- /dev/null
+++ b/modelscope/exporters/base.py
@@ -0,0 +1,53 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from abc import ABC, abstractmethod
+
+from modelscope.models import Model
+from modelscope.utils.config import Config, ConfigDict
+from modelscope.utils.constant import ModelFile
+from .builder import build_exporter
+
+
+class Exporter(ABC):
+    """Exporter base class to output a model to onnx, torch_script, graphdef, etc.
+    """
+
+    def __init__(self):
+        self.model = None
+
+    @classmethod
+    def from_model(cls, model: Model, **kwargs):
+        """Build the Exporter instance matching the model's configured task and model type.
+
+        @param model: A model instance. The configuration file in its model_dir is used to create
+            the exporter instance, and the model itself is attached to the exporter as `model`.
+        @param kwargs: Extra kwargs passed to build_exporter as the default arguments.
+        @return: The Exporter instance.
+        """
+        cfg = Config.from_file(
+            os.path.join(model.model_dir, ModelFile.CONFIGURATION))
+        task_name = cfg.task
+        model_cfg = cfg.model
+        if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'):
+            model_cfg.type = model_cfg.model_type
+        export_cfg = ConfigDict({'type': model_cfg.type})
+        if hasattr(cfg, 'export'):
+            export_cfg.update(cfg.export)
+        exporter = build_exporter(export_cfg, task_name, kwargs)
+        exporter.model = model
+        return exporter
+
+    @abstractmethod
+    def export_onnx(self, outputs: str, opset=11, **kwargs):
+        """Export the model as onnx format files.
+
+        In some cases, several files may be generated,
+        so implementations should return a dict mapping each generated name to its file path.
+
+        @param outputs: The output dir.
+        @param opset: The version of the ONNX operator set to use.
+        @param kwargs: Extra arguments for the concrete implementation,
+            e.g. arguments for generating dummy inputs (like input shape); defined by subclasses.
+        @return: A dict containing the model name with the model file path.
+        """
+        pass
diff --git a/modelscope/exporters/builder.py b/modelscope/exporters/builder.py
new file mode 100644
index 00000000..90699c12
--- /dev/null
+++ b/modelscope/exporters/builder.py
@@ -0,0 +1,21 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from modelscope.utils.config import ConfigDict
+from modelscope.utils.registry import Registry, build_from_cfg
+
+EXPORTERS = Registry('exporters')
+
+
+def build_exporter(cfg: ConfigDict,
+                   task_name: str = None,
+                   default_args: dict = None):
+    """Build an exporter from the given config dict via the EXPORTERS registry.
+
+    Args:
+        cfg (:obj:`ConfigDict`): Config dict for the exporter object.
+        task_name (str, optional): Task name, used as the registry group key;
+            refer to :obj:`Tasks` for more details.
+        default_args (dict, optional): Default initialization arguments.
+    """
+    return build_from_cfg(
+        cfg, EXPORTERS, group_key=task_name, default_args=default_args)
diff --git a/modelscope/exporters/nlp/__init__.py b/modelscope/exporters/nlp/__init__.py
new file mode 100644
index 00000000..fdfd2711
--- /dev/null
+++ b/modelscope/exporters/nlp/__init__.py
@@ -0,0 +1,2 @@
+from .sbert_for_sequence_classification_exporter import \
+    SbertForSequenceClassificationExporter
diff --git a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py
new file mode 100644
index 00000000..dc1e2b92
--- /dev/null
+++ b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py
@@ -0,0 +1,81 @@
+import os
+from collections import OrderedDict
+from typing import Any, Dict, Mapping, Tuple
+
+from torch.utils.data.dataloader import default_collate
+
+from modelscope.exporters.builder import EXPORTERS
+from modelscope.exporters.torch_model_exporter import TorchModelExporter
+from modelscope.metainfo import Models
+from modelscope.preprocessors import Preprocessor, build_preprocessor
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModeKeys, Tasks
+
+
+@EXPORTERS.register_module(
+    Tasks.sentence_similarity, module_name=Models.structbert)
+@EXPORTERS.register_module(
+    Tasks.sentiment_classification, module_name=Models.structbert)
+@EXPORTERS.register_module(Tasks.nli, module_name=Models.structbert)
+@EXPORTERS.register_module(
+    Tasks.zero_shot_classification, module_name=Models.structbert)
+class SbertForSequenceClassificationExporter(TorchModelExporter):
+
+    def generate_dummy_inputs(self,
+                              shape: Tuple = None,
+                              **kwargs) -> Dict[str, Any]:
+        """Generate dummy inputs for model exportation to onnx or other formats by tracing.
+
+        @param shape: A tuple of input shape which should have at most two dimensions.
+            shape = (1, ) batch_size=1, sequence_length will be taken from the preprocessor.
+            shape = (8, 128) batch_size=8, sequence_length=128, which will override the preprocessor config.
+        @return: Dummy inputs: batch_size preprocessed unk-token samples merged by default_collate.
+        """
+
+        cfg = Config.from_file(
+            os.path.join(self.model.model_dir, 'configuration.json'))
+        field_name = Tasks.find_field_by_task(cfg.task)
+        if 'type' not in cfg.preprocessor and 'val' in cfg.preprocessor:
+            cfg = cfg.preprocessor.val
+        else:
+            cfg = cfg.preprocessor
+
+        batch_size = 1
+        sequence_length = {}  # extra preprocessor config; stays empty unless shape has two dims
+        if shape is not None:
+            if len(shape) == 1:
+                batch_size = shape[0]
+            elif len(shape) == 2:
+                batch_size, max_length = shape
+                sequence_length = {'sequence_length': max_length}
+
+        cfg.update({
+            'model_dir': self.model.model_dir,
+            'mode': ModeKeys.TRAIN,
+            **sequence_length
+        })
+        preprocessor: Preprocessor = build_preprocessor(cfg, field_name)
+        if preprocessor.pair:  # sentence-pair input: use the unk token for both sequences
+            first_sequence = preprocessor.tokenizer.unk_token
+            second_sequence = preprocessor.tokenizer.unk_token
+        else:
+            first_sequence = preprocessor.tokenizer.unk_token
+            second_sequence = None
+
+        batched = []
+        for _ in range(batch_size):
+            batched.append(preprocessor((first_sequence, second_sequence)))
+        return default_collate(batched)
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        dynamic_axis = {0: 'batch', 1: 'sequence'}  # axis index -> axis name, shared by all inputs
+        return OrderedDict([
+            ('input_ids', dynamic_axis),
+            ('attention_mask', dynamic_axis),
+            ('token_type_ids', dynamic_axis),
+        ])
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict({'logits': {0: 'batch'}})
diff --git a/modelscope/exporters/torch_model_exporter.py b/modelscope/exporters/torch_model_exporter.py
new file mode 100644
index 00000000..98a23fe5
--- /dev/null
+++ b/modelscope/exporters/torch_model_exporter.py
@@ -0,0 +1,247 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from contextlib import contextmanager
+from itertools import chain
+from typing import Any, Dict, Mapping
+
+import torch
+from torch import nn
+from torch.onnx import export as onnx_export
+from torch.onnx.utils import _decide_input_format
+
+from modelscope.models import TorchModel
+from modelscope.pipelines.base import collate_fn
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.logger import get_logger
+from modelscope.utils.regress_test_utils import compare_arguments_nested
+from modelscope.utils.tensor_utils import torch_nested_numpify
+from .base import Exporter
+
+logger = get_logger(__name__)
+
+
+class TorchModelExporter(Exporter):
+    """Base exporter for torch models.
+
+    Provides default implementations for exporting to onnx and torch script by tracing.
+    A model-specific exporter may override export_onnx/export_torch_script, and should
+    implement generate_dummy_inputs/inputs/outputs, which return None in this base class.
+    """
+
+    def export_onnx(self, outputs: str, opset=11, **kwargs):
+        """Export the model as onnx format files.
+
+        In some cases, several files may be generated,
+        so a dict mapping each generated name to its file path is returned.
+
+        @param opset: The version of the ONNX operator set to use.
+        @param outputs: The output dir.
+        @param kwargs: In this default implementation,
+            you can pass the arguments needed by _torch_export_onnx, other unrecognized args
+            will be carried to generate_dummy_inputs as extra arguments (such as input shape).
+        @return: A dict containing the model key - model file path pairs.
+        """
+        model = self.model
+        if not isinstance(model, nn.Module) and hasattr(model, 'model'):
+            model = model.model
+        onnx_file = os.path.join(outputs, ModelFile.ONNX_MODEL_FILE)
+        self._torch_export_onnx(model, onnx_file, opset=opset, **kwargs)
+        return {'model': onnx_file}
+
+    def export_torch_script(self, outputs: str, **kwargs):
+        """Export the model as torch script files.
+
+        In some cases, several files may be generated,
+        so a dict mapping each generated name to its file path is returned.
+
+        @param outputs: The output dir.
+        @param kwargs: In this default implementation,
+            you can pass the arguments needed by _torch_export_torch_script, other unrecognized args
+            will be carried to generate_dummy_inputs as extra arguments (like input shape).
+        @return: A dict containing the model name with the model file path.
+        """
+        model = self.model
+        if not isinstance(model, nn.Module) and hasattr(model, 'model'):
+            model = model.model
+        ts_file = os.path.join(outputs, ModelFile.TS_MODEL_FILE)
+        # generate ts by tracing
+        self._torch_export_torch_script(model, ts_file, **kwargs)
+        return {'model': ts_file}
+
+    def generate_dummy_inputs(self, **kwargs) -> Dict[str, Any]:
+        """Generate dummy inputs for model exportation to onnx or other formats by tracing.
+        @return: Dummy inputs; None in this base class, which makes the export methods raise.
+        """
+        return None
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        """Return an ordered dict containing the model's input argument names with their dynamic axes.
+
+        For details about dynamic axes, see the dynamic_axes argument of the torch.onnx.export function.
+        """
+        return None
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        """Return an ordered dict containing the model's output argument names with their dynamic axes.
+
+        For details about dynamic axes, see the dynamic_axes argument of the torch.onnx.export function.
+        """
+        return None
+
+    def _torch_export_onnx(self,
+                           model: nn.Module,
+                           output: str,
+                           opset: int = 11,
+                           device: str = 'cpu',
+                           validation: bool = True,
+                           rtol: float = None,
+                           atol: float = None,
+                           **kwargs):
+        """Export the model to an onnx format file.
+
+        @param model: A torch.nn.Module instance to export.
+        @param output: The output file.
+        @param opset: The version of the ONNX operator set to use.
+        @param device: The device used to forward.
+        @param validation: Whether validate the export file.
+        @param rtol: The rtol used to regress the outputs.
+        @param atol: The atol used to regress the outputs.
+        @raise: NotImplementedError if dummy inputs/inputs/outputs are unset, RuntimeError on mismatch.
+        """
+        dummy_inputs = self.generate_dummy_inputs(**kwargs)
+        inputs = self.inputs
+        outputs = self.outputs
+        if dummy_inputs is None or inputs is None or outputs is None:
+            raise NotImplementedError(
+                'Model property dummy_inputs,inputs,outputs must be set.')
+
+        with torch.no_grad():
+            model.eval()
+            device = torch.device(device)
+            model.to(device)
+            dummy_inputs = collate_fn(dummy_inputs, device)
+
+            if isinstance(dummy_inputs, Mapping):
+                dummy_inputs = dict(dummy_inputs)
+            onnx_outputs = list(outputs.keys())
+
+            with replace_call():
+                onnx_export(
+                    model,
+                    (dummy_inputs, ),
+                    f=output,
+                    input_names=list(inputs.keys()),
+                    output_names=onnx_outputs,
+                    dynamic_axes={
+                        name: axes
+                        for name, axes in chain(inputs.items(),
+                                                outputs.items())
+                    },
+                    do_constant_folding=True,
+                    opset_version=opset,
+                )
+
+        if validation:
+            try:
+                import onnx
+                import onnxruntime as ort
+            except ImportError:
+                logger.warning(
+                    'Cannot validate the exported onnx file, because '
+                    'the installation of onnx or onnxruntime cannot be found')
+                return
+            onnx_model = onnx.load(output)
+            onnx.checker.check_model(onnx_model)
+            ort_session = ort.InferenceSession(output)
+            with torch.no_grad():
+                model.eval()
+                outputs_origin = model.forward(
+                    *_decide_input_format(model, dummy_inputs))
+            if isinstance(outputs_origin, Mapping):
+                outputs_origin = torch_nested_numpify(
+                    list(outputs_origin.values()))
+            outputs = ort_session.run(
+                onnx_outputs,
+                torch_nested_numpify(dummy_inputs),
+            )
+
+            tols = {}
+            if rtol is not None:
+                tols['rtol'] = rtol
+            if atol is not None:
+                tols['atol'] = atol
+            if not compare_arguments_nested('Onnx model output match failed',
+                                            outputs, outputs_origin, **tols):
+                raise RuntimeError(
+                    'export onnx failed because of validation error.')
+
+    def _torch_export_torch_script(self,
+                                   model: nn.Module,
+                                   output: str,
+                                   device: str = 'cpu',
+                                   validation: bool = True,
+                                   rtol: float = None,
+                                   atol: float = None,
+                                   **kwargs):
+        """Export the model to a torch script file.
+
+        @param model: A torch.nn.Module instance to export.
+        @param output: The output file.
+        @param device: The device used to forward; the model and the dummy inputs are moved to it.
+        @param validation: Whether validate the export file.
+        @param rtol: The rtol used to regress the outputs.
+        @param atol: The atol used to regress the outputs.
+        """
+        model.eval()
+        model.to(device)  # the dummy inputs are moved to `device` below, so the model must be too
+        dummy_inputs = self.generate_dummy_inputs(**kwargs)
+        if dummy_inputs is None:
+            raise NotImplementedError(
+                'Model property dummy_inputs must be set.')
+        dummy_inputs = collate_fn(dummy_inputs, device)
+        if isinstance(dummy_inputs, Mapping):
+            dummy_inputs = tuple(dummy_inputs.values())
+        with torch.no_grad():
+            model.eval()
+            with replace_call():
+                traced_model = torch.jit.trace(
+                    model, dummy_inputs, strict=False)
+        torch.jit.save(traced_model, output)
+
+        if validation:
+            ts_model = torch.jit.load(output)
+            with torch.no_grad():
+                model.eval()
+                ts_model.eval()
+                outputs = ts_model.forward(*dummy_inputs)
+                outputs = torch_nested_numpify(outputs)
+                outputs_origin = model.forward(*dummy_inputs)
+                outputs_origin = torch_nested_numpify(outputs_origin)
+            tols = {}
+            if rtol is not None:
+                tols['rtol'] = rtol
+            if atol is not None:
+                tols['atol'] = atol
+            if not compare_arguments_nested(
+                    'Torch script model output match failed', outputs,
+                    outputs_origin, **tols):
+                raise RuntimeError(
+                    'export torch script failed because of validation error.')
+
+
+@contextmanager
+def replace_call():
+    """Temporarily restore torch's default __call__ on TorchModel while tracing.
+
+    The Model class of modelscope overrides the call method. When exporting to onnx or torchscript, torch
+    prepares the parameters as the prototype of the forward method and traces the call method, which causes
+    problems. The override is swapped for TorchModel._call_impl and put back on exit, even on error.
+    """
+    TorchModel.call_origin, TorchModel.__call__ = TorchModel.__call__, TorchModel._call_impl
+    try:
+        yield
+    finally:
+        TorchModel.__call__ = TorchModel.call_origin
+        del TorchModel.call_origin
diff --git a/modelscope/fileio/__init__.py b/modelscope/fileio/__init__.py
index 5fd10f85..385cd02c 100644
--- a/modelscope/fileio/__init__.py
+++ b/modelscope/fileio/__init__.py
@@ -1,2 +1,4 @@
-from .file import File
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from .file import File, LocalStorage
 from .io import dump, dumps, load
diff --git a/modelscope/fileio/file.py b/modelscope/fileio/file.py
index 343cad9a..3fff80c8 100644
--- a/modelscope/fileio/file.py
+++ b/modelscope/fileio/file.py
@@ -240,7 +240,7 @@ class File(object):
     @staticmethod
     def _get_storage(uri):
         assert isinstance(uri,
-                          str), f'uri should be str type, buf got {type(uri)}'
+                          str), f'uri should be str type, but got {type(uri)}'
 
         if '://' not in uri:
             # local path
diff --git a/modelscope/fileio/format/__init__.py b/modelscope/fileio/format/__init__.py
index 52e64279..68518266 100644
--- a/modelscope/fileio/format/__init__.py
+++ b/modelscope/fileio/format/__init__.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from .base import FormatHandler
 from .json import JsonHandler
 from .yaml import YamlHandler
diff --git a/modelscope/fileio/format/json.py b/modelscope/fileio/format/json.py
index 977a8b8c..9979c023 100644
--- a/modelscope/fileio/format/json.py
+++ b/modelscope/fileio/format/json.py
@@ -1,5 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import json
 import numpy as np
 
 from .base import FormatHandler
@@ -22,14 +21,16 @@ def set_default(obj):
 
 
 class JsonHandler(FormatHandler):
+    """JSON format handler that parses and serializes through the jsonplus package."""
 
     def load(self, file):
-        return json.load(file)
+        import jsonplus  # imported inside the method rather than at module level
+        return jsonplus.loads(file.read())
 
     def dump(self, obj, file, **kwargs):
-        kwargs.setdefault('default', set_default)
-        json.dump(obj, file, **kwargs)
+        file.write(self.dumps(obj, **kwargs))  # dumps() sets the `default` hook
 
     def dumps(self, obj, **kwargs):
+        import jsonplus  # imported inside the method rather than at module level
         kwargs.setdefault('default', set_default)
-        return json.dumps(obj, **kwargs)
+        return jsonplus.dumps(obj, **kwargs)
diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py
index 09bff2c1..8dcfa5b0 100644
--- a/modelscope/hub/api.py
+++ b/modelscope/hub/api.py
@@ -1,7 +1,8 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 import pickle
 import shutil
-import subprocess
 from collections import defaultdict
 from http import HTTPStatus
 from http.cookiejar import CookieJar
@@ -16,8 +17,7 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA,
                                       API_RESPONSE_FIELD_MESSAGE,
                                       API_RESPONSE_FIELD_USERNAME,
                                       DEFAULT_CREDENTIALS_PATH)
-from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH,
-                                          HUB_DATASET_ENDPOINT)
+from modelscope.utils.config_ds import DOWNLOADED_DATASETS_PATH
 from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
                                        DEFAULT_MODEL_REVISION,
                                        DatasetFormations, DatasetMetaFormats,
@@ -26,7 +26,8 @@ from modelscope.utils.logger import get_logger
 from .errors import (InvalidParameter, NotExistError, RequestError,
                      datahub_raise_on_error, handle_http_response, is_ok,
                      raise_on_error)
-from .utils.utils import get_endpoint, model_id_to_group_owner_name
+from .utils.utils import (get_dataset_hub_endpoint, get_endpoint,
+                          model_id_to_group_owner_name)
 
 logger = get_logger()
 
@@ -35,7 +36,8 @@ class HubApi:
 
     def __init__(self, endpoint=None, dataset_endpoint=None):
         self.endpoint = endpoint if endpoint is not None else get_endpoint()
-        self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else HUB_DATASET_ENDPOINT
+        self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else get_dataset_hub_endpoint(
+        )
 
     def login(
         self,
@@ -376,6 +378,27 @@ class HubApi:
                       f'ststoken?Revision={revision}'
         return self.datahub_remote_call(datahub_url)
 
+    def get_dataset_access_config_session(
+            self,
+            cookies: CookieJar,
+            dataset_name: str,
+            namespace: str,
+            revision: Optional[str] = DEFAULT_DATASET_REVISION):
+        """Fetch the dataset's `ststoken` response with the caller's cookies.
+        The decoded JSON goes through `raise_on_error`; its `Data` field is returned.
+        """
+        sts_url = (f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/'
+                   f'ststoken?Revision={revision}')
+        cookie_dict = requests.utils.dict_from_cookiejar(cookies)
+        payload = requests.get(url=sts_url, cookies=cookie_dict).json()
+        raise_on_error(payload)
+        return payload['Data']
+
+    def on_dataset_download(self, dataset_name: str, namespace: str) -> None:
+        # POST to the dataset's `download/increase` endpoint; raise on an HTTP error status.
+        path = f'api/v1/datasets/{namespace}/{dataset_name}/download/increase'
+        requests.post(f'{self.endpoint}/{path}').raise_for_status()
+
     @staticmethod
     def datahub_remote_call(url):
         r = requests.get(url)
@@ -383,6 +406,9 @@ class HubApi:
         datahub_raise_on_error(url, resp)
         return resp['Data']
 
+    def check_cookies_upload_data(self, use_cookies) -> CookieJar:
+        return self._check_cookie(use_cookies=use_cookies)  # public pass-through to _check_cookie
+
 
 class ModelScopeConfig:
     path_credential = expanduser(DEFAULT_CREDENTIALS_PATH)
diff --git a/modelscope/hub/constants.py b/modelscope/hub/constants.py
index 014a1e59..c8664597 100644
--- a/modelscope/hub/constants.py
+++ b/modelscope/hub/constants.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from pathlib import Path
 
 MODELSCOPE_URL_SCHEME = 'http://'
diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py
index ecd4e1da..c095a6ec 100644
--- a/modelscope/hub/errors.py
+++ b/modelscope/hub/errors.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from http import HTTPStatus
 
 from requests.exceptions import HTTPError
@@ -49,8 +51,8 @@ def handle_http_response(response, logger, cookies, model_id):
     except HTTPError:
         if cookies is None:  # code in [403] and
             logger.error(
-                f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be private. \
-                  Please login first.')
+                f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \
+                private. Please login first.')
         raise
 
 
@@ -60,7 +62,7 @@ def raise_on_error(rsp):
     Args:
         rsp (_type_): The server response
     """
-    if rsp['Code'] == HTTPStatus.OK and rsp['Success']:
+    if rsp['Code'] == HTTPStatus.OK:
         return True
     else:
         raise RequestError(rsp['Message'])
diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py
index 5f15272c..1cc5645b 100644
--- a/modelscope/hub/file_download.py
+++ b/modelscope/hub/file_download.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import copy
 import os
 import sys
diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py
index 08eec3ff..486f8df3 100644
--- a/modelscope/hub/git.py
+++ b/modelscope/hub/git.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 import subprocess
 from typing import List
@@ -39,17 +41,28 @@ class GitCommandWrapper(metaclass=Singleton):
             subprocess.CompletedProcess: the command response
         """
         logger.debug(' '.join(args))
+        git_env = os.environ.copy()
+        git_env['GIT_TERMINAL_PROMPT'] = '0'
         response = subprocess.run(
             [self.git_path, *args],
             stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE)  # compatible for python3.6
+            stderr=subprocess.PIPE,
+            env=git_env,
+        )  # compatible for python3.6
         try:
             response.check_returncode()
             return response
         except subprocess.CalledProcessError as error:
-            raise GitError(
-                'stdout: %s, stderr: %s' %
-                (response.stdout.decode('utf8'), error.stderr.decode('utf8')))
+            if response.returncode == 1:
+                logger.info('Nothing to commit.')
+                return response
+            else:
+                logger.error(
+                    'There are error run git command, you may need to login first.'
+                )
+                raise GitError('stdout: %s, stderr: %s' %
+                               (response.stdout.decode('utf8'),
+                                error.stderr.decode('utf8')))
 
     def config_auth_token(self, repo_dir, auth_token):
         url = self.get_repo_remote_url(repo_dir)
diff --git a/modelscope/hub/repository.py b/modelscope/hub/repository.py
index 51ddf954..d92089ed 100644
--- a/modelscope/hub/repository.py
+++ b/modelscope/hub/repository.py
@@ -1,8 +1,11 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 from typing import Optional
 
 from modelscope.hub.errors import GitError, InvalidParameter, NotLoginException
-from modelscope.utils.constant import DEFAULT_MODEL_REVISION
+from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
+                                       DEFAULT_MODEL_REVISION)
 from modelscope.utils.logger import get_logger
 from .api import ModelScopeConfig
 from .git import GitCommandWrapper
@@ -15,14 +18,12 @@ class Repository:
     """A local representation of the model git repository.
     """
 
-    def __init__(
-        self,
-        model_dir: str,
-        clone_from: str,
-        revision: Optional[str] = DEFAULT_MODEL_REVISION,
-        auth_token: Optional[str] = None,
-        git_path: Optional[str] = None,
-    ):
+    def __init__(self,
+                 model_dir: str,
+                 clone_from: str,
+                 revision: Optional[str] = DEFAULT_MODEL_REVISION,
+                 auth_token: Optional[str] = None,
+                 git_path: Optional[str] = None):
         """
         Instantiate a Repository object by cloning the remote ModelScopeHub repo
         Args:
@@ -41,6 +42,11 @@ class Repository:
         self.model_dir = model_dir
         self.model_base_dir = os.path.dirname(model_dir)
         self.model_repo_name = os.path.basename(model_dir)
+
+        if not revision:
+            err_msg = 'a non-default value of revision cannot be empty.'
+            raise InvalidParameter(err_msg)
+
         if auth_token:
             self.auth_token = auth_token
         else:
@@ -86,6 +92,7 @@ class Repository:
              branch: Optional[str] = DEFAULT_MODEL_REVISION,
              force: bool = False):
         """Push local files to remote, this method will do.
+           git pull
            git add
            git commit
            git push
@@ -117,3 +124,118 @@ class Repository:
             url=url,
             local_branch=branch,
             remote_branch=branch)
+
+
+class DatasetRepository:
+    """A local representation of the dataset (metadata) git repository.
+    """
+
+    def __init__(self,
+                 repo_work_dir: str,
+                 dataset_id: str,
+                 revision: Optional[str] = DEFAULT_DATASET_REVISION,
+                 auth_token: Optional[str] = None,
+                 git_path: Optional[str] = None):
+        """
+        Set up a local working directory for a ModelScope dataset repo; cloning is left to `clone()`.
+        Args:
+            repo_work_dir(`str`):
+                The dataset repo root directory, created if it does not exist.
+            dataset_id(`str`):
+                dataset id in ModelScope from which git clone
+            revision(`Optional[str]`):
+                revision of the dataset you want to clone from. Can be any of a branch, tag or commit hash
+            auth_token(`Optional[str]`):
+                token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
+                as the token is already saved when you login the first time, if None, we will use saved token.
+            git_path(`Optional[str]`):
+                The git command line path, if None, we use 'git'
+        Raises:
+            InvalidParameter: if repo_work_dir is empty, not a str or only slashes, or revision is empty.
+        """
+        self.dataset_id = dataset_id
+        if not repo_work_dir or not isinstance(repo_work_dir, str):
+            raise InvalidParameter('repo_work_dir must be provided!')
+        self.repo_work_dir = repo_work_dir.rstrip('/')
+        if not self.repo_work_dir:
+            raise InvalidParameter('repo_work_dir can not be root dir!')
+        self.repo_base_dir = os.path.dirname(self.repo_work_dir)
+        self.repo_name = os.path.basename(self.repo_work_dir)
+
+        if not revision:
+            err_msg = 'a non-default value of revision cannot be empty.'
+            raise InvalidParameter(err_msg)
+        self.revision = revision
+
+        if auth_token:
+            self.auth_token = auth_token
+        else:
+            self.auth_token = ModelScopeConfig.get_token()
+
+        self.git_wrapper = GitCommandWrapper(git_path)
+        os.makedirs(self.repo_work_dir, exist_ok=True)
+        self.repo_url = self._get_repo_url(dataset_id=dataset_id)
+
+    def clone(self) -> str:
+        """Clone the repo if needed; return the work dir, or '' when skipped."""
+        if os.listdir(self.repo_work_dir):
+            remote_url = self._get_remote_url()
+            remote_url = self.git_wrapper.remove_token_from_url(remote_url)
+            # the non-empty work dir already tracks this repo: no need to clone again
+            if remote_url and remote_url == self.repo_url:
+                return ''
+
+        logger.info('Cloning repo from {} '.format(self.repo_url))
+        self.git_wrapper.clone(self.repo_base_dir, self.auth_token,
+                               self.repo_url, self.repo_name, self.revision)
+        return self.repo_work_dir
+
+    def push(self,
+             commit_message: str,
+             branch: Optional[str] = DEFAULT_DATASET_REVISION,
+             force: bool = False):
+        """Push local files to remote. This method runs, in order:
+           git pull
+           git add
+           git commit
+           git push
+        Args:
+            commit_message (str): commit message
+            branch (Optional[str], optional): which branch to push.
+            force (Optional[bool]): forced-push flag. NOTE(review): validated but not forwarded to push.
+        """
+        if not isinstance(commit_message, str):
+            raise InvalidParameter('commit_message must be provided!')
+
+        if not isinstance(force, bool):
+            raise InvalidParameter('force must be bool')
+
+        if not self.auth_token:
+            raise NotLoginException('Must login to push, please login first.')
+
+        self.git_wrapper.config_auth_token(self.repo_work_dir, self.auth_token)
+        self.git_wrapper.add_user_info(self.repo_base_dir, self.repo_name)
+
+        remote_url = self._get_remote_url()
+        remote_url = self.git_wrapper.remove_token_from_url(remote_url)
+
+        self.git_wrapper.pull(self.repo_work_dir)
+        self.git_wrapper.add(self.repo_work_dir, all_files=True)
+        self.git_wrapper.commit(self.repo_work_dir, commit_message)
+        self.git_wrapper.push(
+            repo_dir=self.repo_work_dir,
+            token=self.auth_token,
+            url=remote_url,
+            local_branch=branch,
+            remote_branch=branch)
+
+    def _get_repo_url(self, dataset_id):
+        """Build the remote git url of `dataset_id` under the hub endpoint."""
+        return f'{get_endpoint()}/datasets/{dataset_id}.git'
+
+    def _get_remote_url(self):
+        """Return the work dir's git remote url, or None if git raises GitError."""
+        try:
+            return self.git_wrapper.get_repo_remote_url(self.repo_work_dir)
+        except GitError:
+            return None
diff --git a/modelscope/hub/snapshot_download.py b/modelscope/hub/snapshot_download.py
index c63d8956..cde6ad34 100644
--- a/modelscope/hub/snapshot_download.py
+++ b/modelscope/hub/snapshot_download.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 import tempfile
 from pathlib import Path
diff --git a/modelscope/hub/utils/caching.py b/modelscope/hub/utils/caching.py
index fc30fa27..1acd2e84 100644
--- a/modelscope/hub/utils/caching.py
+++ b/modelscope/hub/utils/caching.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import hashlib
 import os
 import pickle
diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py
index 1a55c9f9..d84b78ea 100644
--- a/modelscope/hub/utils/utils.py
+++ b/modelscope/hub/utils/utils.py
@@ -1,7 +1,11 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import hashlib
 import os
+from typing import Optional
 
-from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN,
+from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT,
+                                      DEFAULT_MODELSCOPE_DOMAIN,
                                       DEFAULT_MODELSCOPE_GROUP,
                                       MODEL_ID_SEPARATOR,
                                       MODELSCOPE_URL_SCHEME)
@@ -22,14 +26,16 @@ def model_id_to_group_owner_name(model_id):
     return group_or_owner, name
 
 
-def get_cache_dir():
+def get_cache_dir(model_id: Optional[str] = None):
     """
     cache dir precedence:
         function parameter > enviroment > ~/.cache/modelscope/hub
     """
     default_cache_dir = get_default_cache_dir()
-    return os.getenv('MODELSCOPE_CACHE', os.path.join(default_cache_dir,
-                                                      'hub'))
+    base_path = os.getenv('MODELSCOPE_CACHE',  # env var wins over the default below
+                          os.path.join(default_cache_dir, 'hub'))
+    return base_path if model_id is None else os.path.join(
+        base_path, model_id + '/')  # per-model sub path, trailing '/' included
 
 
 def get_endpoint():
@@ -38,6 +44,11 @@ def get_endpoint():
     return MODELSCOPE_URL_SCHEME + modelscope_domain
 
 
+def get_dataset_hub_endpoint():
+    # The HUB_DATASET_ENDPOINT env var, when set, replaces the default endpoint.
+    return os.getenv('HUB_DATASET_ENDPOINT', DEFAULT_MODELSCOPE_DATA_ENDPOINT)
+
+
 def compute_hash(file_path):
     BUFFER_SIZE = 1024 * 64  # 64k buffer size
     sha256_hash = hashlib.sha256()
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index d0684ecd..33273502 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -9,8 +9,11 @@ class Models(object):
 
         Model name should only contain model info but not task info.
     """
+    tinynas_detection = 'tinynas-detection'
+
     # vision models
     detection = 'detection'
+    realtime_object_detection = 'realtime-object-detection'
     scrfd = 'scrfd'
     classification_model = 'ClassificationModel'
     nafnet = 'nafnet'
@@ -19,23 +22,54 @@ class Models(object):
     gpen = 'gpen'
     product_retrieval_embedding = 'product-retrieval-embedding'
     body_2d_keypoints = 'body-2d-keypoints'
+    body_3d_keypoints = 'body-3d-keypoints'
     crowd_counting = 'HRNetCrowdCounting'
+    face_2d_keypoints = 'face-2d-keypoints'
+    panoptic_segmentation = 'swinL-panoptic-segmentation'
     image_reid_person = 'passvitb'
     video_summarization = 'pgl-video-summarization'
+    swinL_semantic_segmentation = 'swinL-semantic-segmentation'
+    vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
+    text_driven_segmentation = 'text-driven-segmentation'
+    resnet50_bert = 'resnet50-bert'
+    fer = 'fer'
+    retinaface = 'retinaface'
+    shop_segmentation = 'shop-segmentation'
+    mogface = 'mogface'
+    mtcnn = 'mtcnn'
+    ulfd = 'ulfd'
+    video_inpainting = 'video-inpainting'
+    hand_static = 'hand-static'
+    face_human_hand_detection = 'face-human-hand-detection'
+    face_emotion = 'face-emotion'
+    product_segmentation = 'product-segmentation'
+
+    # EasyCV models
+    yolox = 'YOLOX'
+    segformer = 'Segformer'
 
     # nlp models
     bert = 'bert'
     palm = 'palm-v2'
     structbert = 'structbert'
+    deberta_v2 = 'deberta_v2'
     veco = 'veco'
     translation = 'csanmt-translation'
     space_dst = 'space-dst'
     space_intent = 'space-intent'
     space_modeling = 'space-modeling'
     star = 'star'
+    star3 = 'star3'
     tcrf = 'transformer-crf'
+    transformer_softmax = 'transformer-softmax'
+    lcrf = 'lstm-crf'
+    gcnncrf = 'gcnn-crf'
     bart = 'bart'
     gpt3 = 'gpt3'
+    plug = 'plug'
+    bert_for_ds = 'bert-for-document-segmentation'
+    ponet = 'ponet'
+    T5 = 'T5'
 
     # audio models
     sambert_hifigan = 'sambert-hifigan'
@@ -50,21 +84,33 @@ class Models(object):
     gemm = 'gemm-generative-multi-modal'
     mplug = 'mplug'
     diffusion = 'diffusion-text-to-image-synthesis'
+    multi_stage_diffusion = 'multi-stage-diffusion-text-to-image-synthesis'
+    team = 'team-multi-modal-similarity'
     video_clip = 'video-clip-multi-modal-embedding'
 
 
 class TaskModels(object):
     # nlp task
     text_classification = 'text-classification'
+    token_classification = 'token-classification'
+    information_extraction = 'information-extraction'
+    fill_mask = 'fill-mask'
+    feature_extraction = 'feature-extraction'
 
 
 class Heads(object):
     # nlp heads
+
+    # text cls
     text_classification = 'text-classification'
-    # mlm
+    # fill mask
+    fill_mask = 'fill-mask'
     bert_mlm = 'bert-mlm'
-    # roberta mlm
     roberta_mlm = 'roberta-mlm'
+    # token cls
+    token_classification = 'token-classification'
+    # extraction
+    information_extraction = 'information-extraction'
 
 
 class Pipelines(object):
@@ -86,12 +132,23 @@ class Pipelines(object):
     animal_recognition = 'resnet101-animal-recognition'
     general_recognition = 'resnet101-general-recognition'
     cmdssl_video_embedding = 'cmdssl-r2p1d_video_embedding'
+    hicossl_video_embedding = 'hicossl-s3dg-video_embedding'
     body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image'
+    body_3d_keypoints = 'canonical_body-3d-keypoints_video'
+    hand_2d_keypoints = 'hrnetv2w18_hand-2d-keypoints_image'
     human_detection = 'resnet18-human-detection'
     object_detection = 'vit-object-detection'
+    easycv_detection = 'easycv-detection'
+    easycv_segmentation = 'easycv-segmentation'
+    face_2d_keypoints = 'mobilenet_face-2d-keypoints_alignment'
     salient_detection = 'u2net-salient-detection'
     image_classification = 'image-classification'
     face_detection = 'resnet-face-detection-scrfd10gkps'
+    ulfd_face_detection = 'manual-face-detection-ulfd'
+    facial_expression_recognition = 'vgg19-facial-expression-recognition-fer'
+    retina_face_detection = 'resnet50-face-detection-retinaface'
+    mog_face_detection = 'resnet101-face-detection-cvpr22papermogface'
+    mtcnn_face_detection = 'manual-face-detection-mtcnn'
     live_category = 'live-category'
     general_image_classification = 'vit-base_image-classification_ImageNet-labels'
     daily_image_classification = 'vit-base_image-classification_Dailylife-labels'
@@ -102,6 +159,7 @@ class Pipelines(object):
     image_super_resolution = 'rrdb-image-super-resolution'
     face_image_generation = 'gan-face-image-generation'
     product_retrieval_embedding = 'resnet50-product-retrieval-embedding'
+    realtime_object_detection = 'cspnet_realtime-object-detection_yolox'
     face_recognition = 'ir101-face-recognition-cfglint'
     image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation'
     image2image_translation = 'image-to-image-translation'
@@ -112,20 +170,36 @@ class Pipelines(object):
     image_to_image_generation = 'image-to-image-generation'
     skin_retouching = 'unet-skin-retouching'
     tinynas_classification = 'tinynas-classification'
+    tinynas_detection = 'tinynas-detection'
     crowd_counting = 'hrnet-crowd-counting'
+    action_detection = 'ResNetC3D-action-detection'
     video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking'
+    image_panoptic_segmentation = 'image-panoptic-segmentation'
     video_summarization = 'googlenet_pgl_video_summarization'
+    image_semantic_segmentation = 'image-semantic-segmentation'
     image_reid_person = 'passvitb-image-reid-person'
+    text_driven_segmentation = 'text-driven-segmentation'
+    movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation'
+    shop_segmentation = 'shop-segmentation'
+    video_inpainting = 'video-inpainting'
+    pst_action_recognition = 'patchshift-action-recognition'
+    hand_static = 'hand-static'
+    face_human_hand_detection = 'face-human-hand-detection'
+    face_emotion = 'face-emotion'
+    product_segmentation = 'product-segmentation'
 
     # nlp tasks
     sentence_similarity = 'sentence-similarity'
     word_segmentation = 'word-segmentation'
+    part_of_speech = 'part-of-speech'
     named_entity_recognition = 'named-entity-recognition'
     text_generation = 'text-generation'
+    text2text_generation = 'text2text-generation'
     sentiment_analysis = 'sentiment-analysis'
     sentiment_classification = 'sentiment-classification'
     text_classification = 'text-classification'
     fill_mask = 'fill-mask'
+    fill_mask_ponet = 'fill-mask-ponet'
     csanmt_translation = 'csanmt-translation'
     nli = 'nli'
     dialog_intent_prediction = 'dialog-intent-prediction'
@@ -133,7 +207,15 @@ class Pipelines(object):
     dialog_state_tracking = 'dialog-state-tracking'
     zero_shot_classification = 'zero-shot-classification'
     text_error_correction = 'text-error-correction'
+    plug_generation = 'plug-generation'
+    faq_question_answering = 'faq-question-answering'
     conversational_text_to_sql = 'conversational-text-to-sql'
+    table_question_answering_pipeline = 'table-question-answering-pipeline'
+    sentence_embedding = 'sentence-embedding'
+    passage_ranking = 'passage-ranking'
+    relation_extraction = 'relation-extraction'
+    document_segmentation = 'document-segmentation'
+    feature_extraction = 'feature-extraction'
 
     # audio tasks
     sambert_hifigan_tts = 'sambert-hifigan-tts'
@@ -150,8 +232,10 @@ class Pipelines(object):
     visual_question_answering = 'visual-question-answering'
     visual_grounding = 'visual-grounding'
     visual_entailment = 'visual-entailment'
+    multi_modal_similarity = 'multi-modal-similarity'
     text_to_image_synthesis = 'text-to-image-synthesis'
     video_multi_modal_embedding = 'video-multi-modal-embedding'
+    image_text_retrieval = 'image-text-retrieval'
 
 
 class Trainers(object):
@@ -165,6 +249,7 @@ class Trainers(object):
     """
 
     default = 'trainer'
+    easycv = 'easycv'
 
     # multi-modal trainers
     clip_multi_modal_embedding = 'clip-multi-modal-embedding'
@@ -173,11 +258,18 @@ class Trainers(object):
     image_instance_segmentation = 'image-instance-segmentation'
     image_portrait_enhancement = 'image-portrait-enhancement'
     video_summarization = 'video-summarization'
+    movie_scene_segmentation = 'movie-scene-segmentation'
 
     # nlp trainers
     bert_sentiment_analysis = 'bert-sentiment-analysis'
+    dialog_modeling_trainer = 'dialog-modeling-trainer'
+    dialog_intent_trainer = 'dialog-intent-trainer'
     nlp_base_trainer = 'nlp-base-trainer'
     nlp_veco_trainer = 'nlp-veco-trainer'
+    nlp_passage_ranking_trainer = 'nlp-passage-ranking-trainer'
+
+    # audio trainers
+    speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
 
 
 class Preprocessors(object):
@@ -198,11 +290,14 @@ class Preprocessors(object):
     image_instance_segmentation_preprocessor = 'image-instance-segmentation-preprocessor'
     image_portrait_enhancement_preprocessor = 'image-portrait-enhancement-preprocessor'
     video_summarization_preprocessor = 'video-summarization-preprocessor'
+    movie_scene_segmentation_preprocessor = 'movie-scene-segmentation-preprocessor'
 
     # nlp preprocessor
     sen_sim_tokenizer = 'sen-sim-tokenizer'
+    cross_encoder_tokenizer = 'cross-encoder-tokenizer'
     bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer'
     text_gen_tokenizer = 'text-gen-tokenizer'
+    text2text_gen_preprocessor = 'text2text-gen-preprocessor'
     token_cls_tokenizer = 'token-cls-tokenizer'
     ner_tokenizer = 'ner-tokenizer'
     nli_tokenizer = 'nli-tokenizer'
@@ -213,9 +308,18 @@ class Preprocessors(object):
     sbert_token_cls_tokenizer = 'sbert-token-cls-tokenizer'
     zero_shot_cls_tokenizer = 'zero-shot-cls-tokenizer'
     text_error_correction = 'text-error-correction'
+    sentence_embedding = 'sentence-embedding'
+    passage_ranking = 'passage-ranking'
+    sequence_labeling_tokenizer = 'sequence-labeling-tokenizer'
     word_segment_text_to_label_preprocessor = 'word-segment-text-to-label-preprocessor'
     fill_mask = 'fill-mask'
+    fill_mask_ponet = 'fill-mask-ponet'
+    faq_question_answering_preprocessor = 'faq-question-answering-preprocessor'
     conversational_text_to_sql = 'conversational-text-to-sql'
+    table_question_answering_preprocessor = 'table-question-answering-preprocessor'
+    re_tokenizer = 're-tokenizer'
+    document_segmentation = 'document-segmentation'
+    feature_extraction = 'feature-extraction'
 
     # audio preprocessor
     linear_aec_fbank = 'linear-aec-fbank'
@@ -234,6 +338,7 @@ class Metrics(object):
 
     # accuracy
     accuracy = 'accuracy'
+    audio_noise_metric = 'audio-noise-metric'
 
     # metrics for image denoise task
     image_denoise_metric = 'image-denoise-metric'
@@ -251,6 +356,8 @@ class Metrics(object):
     # metrics for image-portrait-enhancement task
     image_portrait_enhancement_metric = 'image-portrait-enhancement-metric'
     video_summarization_metric = 'video-summarization-metric'
+    # metric for movie-scene-segmentation task
+    movie_scene_segmentation_metric = 'movie-scene-segmentation-metric'
 
 
 class Optimizers(object):
@@ -300,3 +407,13 @@ class LR_Schedulers(object):
     LinearWarmup = 'LinearWarmup'
     ConstantWarmup = 'ConstantWarmup'
     ExponentialWarmup = 'ExponentialWarmup'
+
+
+class Datasets(object):
+    """ Names for different datasets; each value equals its attribute name.
+    """
+    ClsDataset = 'ClsDataset'
+    Face2dKeypointsDataset = 'Face2dKeypointsDataset'
+    SegDataset = 'SegDataset'
+    DetDataset = 'DetDataset'
+    DetImagesMixDataset = 'DetImagesMixDataset'
diff --git a/modelscope/metrics/__init__.py b/modelscope/metrics/__init__.py
index d307f7c9..d3975a2c 100644
--- a/modelscope/metrics/__init__.py
+++ b/modelscope/metrics/__init__.py
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
 from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
+    from .audio_noise_metric import AudioNoiseMetric
     from .base import Metric
     from .builder import METRICS, build_metric, task_default_metrics
     from .image_color_enhance_metric import ImageColorEnhanceMetric
@@ -15,9 +16,11 @@ if TYPE_CHECKING:
     from .text_generation_metric import TextGenerationMetric
     from .token_classification_metric import TokenClassificationMetric
     from .video_summarization_metric import VideoSummarizationMetric
+    from .movie_scene_segmentation_metric import MovieSceneSegmentationMetric
 
 else:
     _import_structure = {
+        'audio_noise_metric': ['AudioNoiseMetric'],
         'base': ['Metric'],
         'builder': ['METRICS', 'build_metric', 'task_default_metrics'],
         'image_color_enhance_metric': ['ImageColorEnhanceMetric'],
@@ -30,6 +33,7 @@ else:
         'text_generation_metric': ['TextGenerationMetric'],
         'token_classification_metric': ['TokenClassificationMetric'],
         'video_summarization_metric': ['VideoSummarizationMetric'],
+        'movie_scene_segmentation_metric': ['MovieSceneSegmentationMetric'],
     }
 
     import sys
diff --git a/modelscope/metrics/audio_noise_metric.py b/modelscope/metrics/audio_noise_metric.py
new file mode 100644
index 00000000..f26db46d
--- /dev/null
+++ b/modelscope/metrics/audio_noise_metric.py
@@ -0,0 +1,40 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Dict
+
+from modelscope.metainfo import Metrics
+from modelscope.metrics.base import Metric
+from modelscope.metrics.builder import METRICS, MetricKeys
+from modelscope.utils.registry import default_group
+
+
+@METRICS.register_module(
+    group_key=default_group, module_name=Metrics.audio_noise_metric)
+class AudioNoiseMetric(Metric):
+    """Metric for the acoustic noise suppression task. Collects the 'loss',
+    'amp_loss', 'phase_loss' and 'sisnr' outputs passed to `add`; `evaluate`
+    returns the mean loss, the mean sisnr and the sum of all four means."""
+
+    def __init__(self):
+        self.loss = []
+        self.amp_loss = []
+        self.phase_loss = []
+        self.sisnr = []
+
+    def add(self, outputs: Dict, inputs: Dict):
+        self.loss.append(outputs['loss'].data.cpu())
+        self.amp_loss.append(outputs['amp_loss'].data.cpu())
+        self.phase_loss.append(outputs['phase_loss'].data.cpu())
+        self.sisnr.append(outputs['sisnr'].data.cpu())
+
+    def evaluate(self):
+        avg_loss = sum(self.loss) / len(self.loss)
+        avg_sisnr = sum(self.sisnr) / len(self.sisnr)
+        avg_amp = sum(self.amp_loss) / len(self.amp_loss)
+        avg_phase = sum(self.phase_loss) / len(self.phase_loss)
+        total_loss = avg_loss + avg_amp + avg_phase + avg_sisnr
+        return {
+            'total_loss': total_loss.item(),
+            'avg_sisnr': avg_sisnr.item(),
+            MetricKeys.AVERAGE_LOSS: avg_loss.item()
+        }
diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py
index c76fe386..9e875cc4 100644
--- a/modelscope/metrics/builder.py
+++ b/modelscope/metrics/builder.py
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Dict, Mapping, Union
 
 from modelscope.metainfo import Metrics
 from modelscope.utils.config import ConfigDict
@@ -15,7 +16,12 @@ class MetricKeys(object):
     RECALL = 'recall'
     PSNR = 'psnr'
     SSIM = 'ssim'
+    AVERAGE_LOSS = 'avg_loss'
     FScore = 'fscore'
+    BLEU_1 = 'bleu-1'
+    BLEU_4 = 'bleu-4'
+    ROUGE_1 = 'rouge-1'
+    ROUGE_L = 'rouge-l'
 
 
 task_default_metrics = {
@@ -30,19 +36,25 @@ task_default_metrics = {
     Tasks.image_portrait_enhancement:
     [Metrics.image_portrait_enhancement_metric],
     Tasks.video_summarization: [Metrics.video_summarization_metric],
+    Tasks.image_captioning: [Metrics.text_gen_metric],
+    Tasks.visual_question_answering: [Metrics.text_gen_metric],
+    Tasks.movie_scene_segmentation: [Metrics.movie_scene_segmentation_metric],
 }
 
 
-def build_metric(metric_name: str,
+def build_metric(metric_cfg: Union[str, Dict],
                  field: str = default_group,
                  default_args: dict = None):
     """ Build metric given metric_name and field.
 
     Args:
-        metric_name (:obj:`str`): The metric name.
+        metric_cfg (str | dict): Metric name, or a config dict with a 'type' key.
         field (str, optional):  The field of this metric, default value: 'default' for all fields.
         default_args (dict, optional): Default initialization arguments.
     """
-    cfg = ConfigDict({'type': metric_name})
+    if isinstance(metric_cfg, Mapping):
+        assert 'type' in metric_cfg
+    else:
+        metric_cfg = ConfigDict({'type': metric_cfg})
     return build_from_cfg(
-        cfg, METRICS, group_key=field, default_args=default_args)
+        metric_cfg, METRICS, group_key=field, default_args=default_args)
diff --git a/modelscope/metrics/image_instance_segmentation_metric.py b/modelscope/metrics/image_instance_segmentation_metric.py
index 7deafbce..86a19d13 100644
--- a/modelscope/metrics/image_instance_segmentation_metric.py
+++ b/modelscope/metrics/image_instance_segmentation_metric.py
@@ -1,3 +1,5 @@
+# Part of the implementation is borrowed and modified from MMDetection, publicly available at
+# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/coco.py
 import os.path as osp
 import tempfile
 from collections import OrderedDict
diff --git a/modelscope/metrics/movie_scene_segmentation_metric.py b/modelscope/metrics/movie_scene_segmentation_metric.py
new file mode 100644
index 00000000..65725b6f
--- /dev/null
+++ b/modelscope/metrics/movie_scene_segmentation_metric.py
@@ -0,0 +1,75 @@
+# The implementation here is modified based on BaSSL,
+# originally Apache 2.0 License and publicly available at https://github.com/kakaobrain/bassl
+from typing import Dict
+
+import numpy as np
+
+from modelscope.metainfo import Metrics
+from modelscope.utils.registry import default_group
+from modelscope.utils.tensor_utils import (torch_nested_detach,
+                                           torch_nested_numpify)
+from .base import Metric
+from .builder import METRICS, MetricKeys
+
+
+@METRICS.register_module(
+    group_key=default_group,
+    module_name=Metrics.movie_scene_segmentation_metric)
+class MovieSceneSegmentationMetric(Metric):
+    """The metric computation class for movie scene segmentation tasks.
+
+    Predictions and labels are accumulated through `add`; `evaluate` then
+    reports precision, recall and F1 (as percentages) of the positive
+    class, i.e. of the entries equal to 1.
+    """
+
+    def __init__(self):
+        self.preds = []
+        self.labels = []
+        # Smoothing term that keeps every denominator in `evaluate` non-zero.
+        self.eps = 1e-5
+
+    def add(self, outputs: Dict, inputs: Dict):
+        """Accumulate predictions and ground-truth labels.
+
+        Args:
+            outputs (Dict): Model outputs; the predictions are read from
+                the 'pred' key.
+            inputs (Dict): Model inputs; the labels are read from the
+                'label' key.
+        """
+        preds = outputs['pred']
+        labels = inputs['label']
+        self.preds.extend(preds)
+        self.labels.extend(labels)
+
+    def evaluate(self):
+        """Compute precision, recall and F1 over everything added so far.
+
+        Returns:
+            dict: Values keyed by `MetricKeys.F1`, `MetricKeys.RECALL` and
+                `MetricKeys.PRECISION`, each scaled to the range [0, 100].
+        """
+        gts = np.array(torch_nested_numpify(torch_nested_detach(self.labels)))
+        prob = np.array(torch_nested_numpify(torch_nested_detach(self.preds)))
+
+        gt_one = gts == 1
+        gt_zero = gts == 0
+        pred_one = prob == 1
+        pred_zero = prob == 0
+
+        tp = (gt_one * pred_one).sum()
+        fp = (gt_zero * pred_one).sum()
+        fn = (gt_one * pred_zero).sum()
+
+        precision = 100.0 * tp / (tp + fp + self.eps)
+        recall = 100.0 * tp / (tp + fn + self.eps)
+        # Without eps the F1 denominator is 0 when there is no true positive
+        # (precision == recall == 0), which would yield NaN.
+        f1 = 2 * precision * recall / (precision + recall + self.eps)
+
+        return {
+            MetricKeys.F1: f1,
+            MetricKeys.RECALL: recall,
+            MetricKeys.PRECISION: precision
+        }
diff --git a/modelscope/metrics/sequence_classification_metric.py b/modelscope/metrics/sequence_classification_metric.py
index 83cb39ca..51a829ef 100644
--- a/modelscope/metrics/sequence_classification_metric.py
+++ b/modelscope/metrics/sequence_classification_metric.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Dict
 
 import numpy as np
@@ -14,9 +16,9 @@ from .builder import METRICS, MetricKeys
 @METRICS.register_module(
     group_key=default_group, module_name=Metrics.seq_cls_metric)
 class SequenceClassificationMetric(Metric):
-    """The metric computation class for sequence classification classes.
+    """The metric computation class for sequence classification tasks.
 
-    This metric class calculates accuracy for the whole input batches.
+    This metric class calculates accuracy of the whole input batches.
     """
 
     def __init__(self, *args, **kwargs):
diff --git a/modelscope/metrics/text_generation_metric.py b/modelscope/metrics/text_generation_metric.py
index 6bdcbc58..90b80425 100644
--- a/modelscope/metrics/text_generation_metric.py
+++ b/modelscope/metrics/text_generation_metric.py
@@ -1,9 +1,14 @@
-from typing import Dict
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Dict, Iterable, List
+
+from nltk.translate.bleu_score import sentence_bleu
+from rouge import Rouge
 
 from modelscope.metainfo import Metrics
+from modelscope.metrics.base import Metric
+from modelscope.metrics.builder import METRICS, MetricKeys
 from modelscope.utils.registry import default_group
-from .base import Metric
-from .builder import METRICS, MetricKeys
 
 
 @METRICS.register_module(
@@ -15,20 +20,49 @@ class TextGenerationMetric(Metric):
     """
 
     def __init__(self):
-        self.preds = []
-        self.tgts = []
-        from rouge_score import rouge_scorer
-        self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
+        self.preds: List[str] = []
+        self.tgts: List[str] = []
+        self.rouge = Rouge()
 
-    def add(self, outputs: Dict, inputs: Dict):
+    @staticmethod
+    def is_chinese_char(char: str):
+        # the length of char must be 1
+        return '\u4e00' <= char <= '\u9fa5'
+
+    # add space for each chinese char
+    def rebuild_str(self, string: str):
+        return ' '.join(''.join([
+            f' {char} ' if self.is_chinese_char(char) else char
+            for char in string
+        ]).split())
+
+    def add(self, outputs: Dict[str, List[str]], inputs: Dict = None):
         ground_truths = outputs['tgts']
         eval_results = outputs['preds']
-        self.preds.extend(eval_results)
-        self.tgts.extend(ground_truths)
+        for truth in ground_truths:
+            self.tgts.append(self.rebuild_str(truth))
+        for result in eval_results:
+            self.preds.append(self.rebuild_str(result))
 
     def evaluate(self):
-        scores = [
-            self.scorer.score(pred, tgt)['rougeL'].fmeasure
-            for pred, tgt in zip(self.preds, self.tgts)
-        ]
-        return {MetricKeys.F1: sum(scores) / len(scores)}
+
+        def mean(iter: Iterable) -> float:
+            return sum(iter) / len(self.preds)
+
+        rouge_scores = self.rouge.get_scores(hyps=self.preds, refs=self.tgts)
+        rouge_1 = mean(map(lambda score: score['rouge-1']['f'], rouge_scores))
+        rouge_l = mean(map(lambda score: score['rouge-l']['f'], rouge_scores))
+        pred_split = tuple(pred.split(' ') for pred in self.preds)
+        tgt_split = tuple(tgt.split(' ') for tgt in self.tgts)
+        bleu_1 = mean(
+            sentence_bleu([tgt], pred, weights=(1, 0, 0, 0))
+            for pred, tgt in zip(pred_split, tgt_split))
+        bleu_4 = mean(
+            sentence_bleu([tgt], pred)
+            for pred, tgt in zip(pred_split, tgt_split))
+        return {
+            MetricKeys.ROUGE_1: rouge_1,
+            MetricKeys.ROUGE_L: rouge_l,
+            MetricKeys.BLEU_1: bleu_1,
+            MetricKeys.BLEU_4: bleu_4
+        }
diff --git a/modelscope/metrics/token_classification_metric.py b/modelscope/metrics/token_classification_metric.py
index 53d13b6a..05b72170 100644
--- a/modelscope/metrics/token_classification_metric.py
+++ b/modelscope/metrics/token_classification_metric.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import importlib
 from typing import Dict, List, Optional, Union
 
diff --git a/modelscope/models/audio/aec/layers/activations.py b/modelscope/models/audio/aec/layers/activations.py
index b0215bcc..f78ad4b5 100644
--- a/modelscope/models/audio/aec/layers/activations.py
+++ b/modelscope/models/audio/aec/layers/activations.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import torch.nn as nn
 
 from .layer_base import LayerBase
diff --git a/modelscope/models/audio/aec/layers/affine_transform.py b/modelscope/models/audio/aec/layers/affine_transform.py
index 33479505..2de8a03f 100644
--- a/modelscope/models/audio/aec/layers/affine_transform.py
+++ b/modelscope/models/audio/aec/layers/affine_transform.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import numpy as np
 import torch as th
 import torch.nn as nn
diff --git a/modelscope/models/audio/aec/layers/deep_fsmn.py b/modelscope/models/audio/aec/layers/deep_fsmn.py
index 72ba07dc..1582b908 100644
--- a/modelscope/models/audio/aec/layers/deep_fsmn.py
+++ b/modelscope/models/audio/aec/layers/deep_fsmn.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import numpy as np
 import torch as th
 import torch.nn as nn
diff --git a/modelscope/models/audio/aec/layers/layer_base.py b/modelscope/models/audio/aec/layers/layer_base.py
index e56c4bc0..7c39e5be 100644
--- a/modelscope/models/audio/aec/layers/layer_base.py
+++ b/modelscope/models/audio/aec/layers/layer_base.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import abc
 import re
 
diff --git a/modelscope/models/audio/aec/layers/uni_deep_fsmn.py b/modelscope/models/audio/aec/layers/uni_deep_fsmn.py
index c22460c4..a276db05 100644
--- a/modelscope/models/audio/aec/layers/uni_deep_fsmn.py
+++ b/modelscope/models/audio/aec/layers/uni_deep_fsmn.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import numpy as np
 import torch as th
 import torch.nn as nn
diff --git a/modelscope/models/audio/aec/network/loss.py b/modelscope/models/audio/aec/network/loss.py
index 743661b3..1f20072a 100644
--- a/modelscope/models/audio/aec/network/loss.py
+++ b/modelscope/models/audio/aec/network/loss.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import torch
 import torch.nn.functional as F
 
diff --git a/modelscope/models/audio/aec/network/modulation_loss.py b/modelscope/models/audio/aec/network/modulation_loss.py
index a45ddead..3017b5c6 100644
--- a/modelscope/models/audio/aec/network/modulation_loss.py
+++ b/modelscope/models/audio/aec/network/modulation_loss.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import math
 
 import torch
diff --git a/modelscope/models/audio/aec/network/se_net.py b/modelscope/models/audio/aec/network/se_net.py
index 837cad3c..40639605 100644
--- a/modelscope/models/audio/aec/network/se_net.py
+++ b/modelscope/models/audio/aec/network/se_net.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/modelscope/models/audio/ans/__init__.py b/modelscope/models/audio/ans/__init__.py
index b602ad01..afcdf314 100644
--- a/modelscope/models/audio/ans/__init__.py
+++ b/modelscope/models/audio/ans/__init__.py
@@ -4,11 +4,11 @@ from typing import TYPE_CHECKING
 from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
-    from .frcrn import FRCRNModel
+    from .frcrn import FRCRNDecorator
 
 else:
     _import_structure = {
-        'frcrn': ['FRCRNModel'],
+        'frcrn': ['FRCRNDecorator'],
     }
 
     import sys
diff --git a/modelscope/models/audio/ans/complex_nn.py b/modelscope/models/audio/ans/complex_nn.py
index 69dec41e..beaa3187 100644
--- a/modelscope/models/audio/ans/complex_nn.py
+++ b/modelscope/models/audio/ans/complex_nn.py
@@ -1,3 +1,10 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+#
+# The implementation of class ComplexConv2d, ComplexConvTranspose2d and
+# ComplexBatchNorm2d here is modified based on Jongho Choi(sweetcocoa@snu.ac.kr
+# / Seoul National Univ., ESTsoft ) and publicly available at
+# https://github.com/sweetcocoa/DeepComplexUNetPyTorch
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/modelscope/models/audio/ans/conv_stft.py b/modelscope/models/audio/ans/conv_stft.py
index a47d7817..4b393a4c 100644
--- a/modelscope/models/audio/ans/conv_stft.py
+++ b/modelscope/models/audio/ans/conv_stft.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import numpy as np
 import torch
 import torch.nn as nn
diff --git a/modelscope/models/audio/ans/frcrn.py b/modelscope/models/audio/ans/frcrn.py
index ba78ab74..b74fc273 100644
--- a/modelscope/models/audio/ans/frcrn.py
+++ b/modelscope/models/audio/ans/frcrn.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os
 from typing import Dict
 
@@ -14,54 +15,10 @@ from .conv_stft import ConviSTFT, ConvSTFT
 from .unet import UNet
 
 
-class FTB(nn.Module):
-
-    def __init__(self, input_dim=257, in_channel=9, r_channel=5):
-
-        super(FTB, self).__init__()
-        self.in_channel = in_channel
-        self.conv1 = nn.Sequential(
-            nn.Conv2d(in_channel, r_channel, kernel_size=[1, 1]),
-            nn.BatchNorm2d(r_channel), nn.ReLU())
-
-        self.conv1d = nn.Sequential(
-            nn.Conv1d(
-                r_channel * input_dim, in_channel, kernel_size=9, padding=4),
-            nn.BatchNorm1d(in_channel), nn.ReLU())
-        self.freq_fc = nn.Linear(input_dim, input_dim, bias=False)
-
-        self.conv2 = nn.Sequential(
-            nn.Conv2d(in_channel * 2, in_channel, kernel_size=[1, 1]),
-            nn.BatchNorm2d(in_channel), nn.ReLU())
-
-    def forward(self, inputs):
-        '''
-        inputs should be [Batch, Ca, Dim, Time]
-        '''
-        # T-F attention
-        conv1_out = self.conv1(inputs)
-        B, C, D, T = conv1_out.size()
-        reshape1_out = torch.reshape(conv1_out, [B, C * D, T])
-        conv1d_out = self.conv1d(reshape1_out)
-        conv1d_out = torch.reshape(conv1d_out, [B, self.in_channel, 1, T])
-
-        # now is also [B,C,D,T]
-        att_out = conv1d_out * inputs
-
-        # tranpose to [B,C,T,D]
-        att_out = torch.transpose(att_out, 2, 3)
-        freqfc_out = self.freq_fc(att_out)
-        att_out = torch.transpose(freqfc_out, 2, 3)
-
-        cat_out = torch.cat([att_out, inputs], 1)
-        outputs = self.conv2(cat_out)
-        return outputs
-
-
 @MODELS.register_module(
     Tasks.acoustic_noise_suppression,
     module_name=Models.speech_frcrn_ans_cirm_16k)
-class FRCRNModel(TorchModel):
+class FRCRNDecorator(TorchModel):
     r""" A decorator of FRCRN for integrating into modelscope framework """
 
     def __init__(self, model_dir: str, *args, **kwargs):
@@ -71,32 +28,42 @@ class FRCRNModel(TorchModel):
             model_dir (str): the model path.
         """
         super().__init__(model_dir, *args, **kwargs)
-        kwargs.pop('device')
         self.model = FRCRN(*args, **kwargs)
         model_bin_file = os.path.join(model_dir,
                                       ModelFile.TORCH_MODEL_BIN_FILE)
         if os.path.exists(model_bin_file):
-            checkpoint = torch.load(model_bin_file)
-            self.model.load_state_dict(checkpoint, strict=False)
+            checkpoint = torch.load(
+                model_bin_file, map_location=torch.device('cpu'))
+            if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
+                # the new trained model by user is based on FRCRNDecorator
+                self.load_state_dict(checkpoint['state_dict'])
+            else:
+                # The released model on Modelscope is based on FRCRN
+                self.model.load_state_dict(checkpoint, strict=False)
 
-    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
-        output = self.model.forward(input)
-        return {
-            'spec_l1': output[0],
-            'wav_l1': output[1],
-            'mask_l1': output[2],
-            'spec_l2': output[3],
-            'wav_l2': output[4],
-            'mask_l2': output[5]
+    def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        result_list = self.model.forward(inputs['noisy'])
+        output = {
+            'spec_l1': result_list[0],
+            'wav_l1': result_list[1],
+            'mask_l1': result_list[2],
+            'spec_l2': result_list[3],
+            'wav_l2': result_list[4],
+            'mask_l2': result_list[5]
         }
-
-    def to(self, *args, **kwargs):
-        self.model = self.model.to(*args, **kwargs)
-        return self
-
-    def eval(self):
-        self.model = self.model.train(False)
-        return self
+        if 'clean' in inputs:
+            mix_result = self.model.loss(
+                inputs['noisy'], inputs['clean'], result_list, mode='Mix')
+            output.update(mix_result)
+            sisnr_result = self.model.loss(
+                inputs['noisy'], inputs['clean'], result_list, mode='SiSNR')
+            output.update(sisnr_result)
+            # logger hooker will use items under 'log_vars'
+            output['log_vars'] = {k: mix_result[k].item() for k in mix_result}
+            output['log_vars'].update(
+                {k: sisnr_result[k].item()
+                 for k in sisnr_result})
+        return output
 
 
 class FRCRN(nn.Module):
@@ -111,7 +78,8 @@ class FRCRN(nn.Module):
                  win_len=400,
                  win_inc=100,
                  fft_len=512,
-                 win_type='hanning'):
+                 win_type='hanning',
+                 **kwargs):
         r"""
         Args:
             complex: Whether to use complex networks.
@@ -237,7 +205,7 @@ class FRCRN(nn.Module):
                 if count != 3:
                     loss = self.loss_1layer(noisy, est_spec, est_wav, labels,
                                             est_mask, mode)
-            return loss
+            return dict(sisnr=loss)
 
         elif mode == 'Mix':
             count = 0
@@ -252,7 +220,7 @@ class FRCRN(nn.Module):
                     amp_loss, phase_loss, SiSNR_loss = self.loss_1layer(
                         noisy, est_spec, est_wav, labels, est_mask, mode)
                     loss = amp_loss + phase_loss + SiSNR_loss
-            return loss, amp_loss, phase_loss
+            return dict(loss=loss, amp_loss=amp_loss, phase_loss=phase_loss)
 
     def loss_1layer(self, noisy, est, est_wav, labels, cmp_mask, mode='Mix'):
         r""" Compute the loss by mode
diff --git a/modelscope/models/audio/ans/se_module_complex.py b/modelscope/models/audio/ans/se_module_complex.py
index f62fe523..b58eb6ba 100644
--- a/modelscope/models/audio/ans/se_module_complex.py
+++ b/modelscope/models/audio/ans/se_module_complex.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import torch
 from torch import nn
 
diff --git a/modelscope/models/audio/ans/unet.py b/modelscope/models/audio/ans/unet.py
index aa5a4254..7b4df1e9 100644
--- a/modelscope/models/audio/ans/unet.py
+++ b/modelscope/models/audio/ans/unet.py
@@ -1,3 +1,10 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+#
+# The implementation here is modified based on
+# Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft )
+# and publicly available at
+# https://github.com/sweetcocoa/DeepComplexUNetPyTorch
+
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/models/audio/kws/farfield/fsmn.py b/modelscope/models/audio/kws/farfield/fsmn.py
index e88d3976..e06d7911 100644
--- a/modelscope/models/audio/kws/farfield/fsmn.py
+++ b/modelscope/models/audio/kws/farfield/fsmn.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import numpy as np
 import torch
 import torch.nn as nn
diff --git a/modelscope/models/audio/kws/farfield/fsmn_sele_v2.py b/modelscope/models/audio/kws/farfield/fsmn_sele_v2.py
index 1884e533..8af16cc9 100644
--- a/modelscope/models/audio/kws/farfield/fsmn_sele_v2.py
+++ b/modelscope/models/audio/kws/farfield/fsmn_sele_v2.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/modelscope/models/audio/kws/farfield/model.py b/modelscope/models/audio/kws/farfield/model.py
index 428ec367..fea82194 100644
--- a/modelscope/models/audio/kws/farfield/model.py
+++ b/modelscope/models/audio/kws/farfield/model.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 from typing import Dict
 
diff --git a/modelscope/models/audio/kws/farfield/model_def.py b/modelscope/models/audio/kws/farfield/model_def.py
index 3f5ba7d7..be9cca2c 100644
--- a/modelscope/models/audio/kws/farfield/model_def.py
+++ b/modelscope/models/audio/kws/farfield/model_def.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import math
 import struct
 from enum import Enum
diff --git a/modelscope/models/audio/tts/models/__init__.py b/modelscope/models/audio/tts/models/__init__.py
old mode 100755
new mode 100644
index c260d4fe..e69de29b
--- a/modelscope/models/audio/tts/models/__init__.py
+++ b/modelscope/models/audio/tts/models/__init__.py
@@ -1,9 +0,0 @@
-from .robutrans import RobuTrans
-from .vocoder_models import Generator
-
-
-def create_am_model(name, hparams):
-    if name == 'robutrans':
-        return RobuTrans(hparams)
-    else:
-        raise Exception('Unknown model: ' + name)
diff --git a/modelscope/models/audio/tts/models/am_models.py b/modelscope/models/audio/tts/models/am_models.py
deleted file mode 100755
index cd43ff12..00000000
--- a/modelscope/models/audio/tts/models/am_models.py
+++ /dev/null
@@ -1,460 +0,0 @@
-import tensorflow as tf
-
-
-def encoder_prenet(inputs,
-                   n_conv_layers,
-                   filters,
-                   kernel_size,
-                   dense_units,
-                   is_training,
-                   mask=None,
-                   scope='encoder_prenet'):
-    x = inputs
-    with tf.variable_scope(scope):
-        for i in range(n_conv_layers):
-            x = conv1d(
-                x,
-                filters,
-                kernel_size,
-                is_training,
-                activation=tf.nn.relu,
-                dropout=True,
-                mask=mask,
-                scope='conv1d_{}'.format(i))
-        x = tf.layers.dense(
-            x, units=dense_units, activation=None, name='dense')
-    return x
-
-
-def decoder_prenet(inputs,
-                   prenet_units,
-                   dense_units,
-                   is_training,
-                   scope='decoder_prenet'):
-    x = inputs
-    with tf.variable_scope(scope):
-        for i, units in enumerate(prenet_units):
-            x = tf.layers.dense(
-                x,
-                units=units,
-                activation=tf.nn.relu,
-                name='dense_{}'.format(i))
-            x = tf.layers.dropout(
-                x, rate=0.5, training=is_training, name='dropout_{}'.format(i))
-        x = tf.layers.dense(
-            x, units=dense_units, activation=None, name='dense')
-    return x
-
-
-def encoder(inputs,
-            input_lengths,
-            n_conv_layers,
-            filters,
-            kernel_size,
-            lstm_units,
-            is_training,
-            embedded_inputs_speaker,
-            mask=None,
-            scope='encoder'):
-    with tf.variable_scope(scope):
-        x = conv_and_lstm(
-            inputs,
-            input_lengths,
-            n_conv_layers,
-            filters,
-            kernel_size,
-            lstm_units,
-            is_training,
-            embedded_inputs_speaker,
-            mask=mask)
-    return x
-
-
-def prenet(inputs, prenet_units, is_training, scope='prenet'):
-    x = inputs
-    with tf.variable_scope(scope):
-        for i, units in enumerate(prenet_units):
-            x = tf.layers.dense(
-                x,
-                units=units,
-                activation=tf.nn.relu,
-                name='dense_{}'.format(i))
-            x = tf.layers.dropout(
-                x, rate=0.5, training=is_training, name='dropout_{}'.format(i))
-    return x
-
-
-def postnet_residual_ulstm(inputs,
-                           n_conv_layers,
-                           filters,
-                           kernel_size,
-                           lstm_units,
-                           output_units,
-                           is_training,
-                           scope='postnet_residual_ulstm'):
-    with tf.variable_scope(scope):
-        x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size,
-                           lstm_units, is_training)
-        x = conv1d(
-            x,
-            output_units,
-            kernel_size,
-            is_training,
-            activation=None,
-            dropout=False,
-            scope='conv1d_{}'.format(n_conv_layers - 1))
-    return x
-
-
-def postnet_residual_lstm(inputs,
-                          n_conv_layers,
-                          filters,
-                          kernel_size,
-                          lstm_units,
-                          output_units,
-                          is_training,
-                          scope='postnet_residual_lstm'):
-    with tf.variable_scope(scope):
-        x = conv_and_lstm(inputs, None, n_conv_layers, filters, kernel_size,
-                          lstm_units, is_training)
-        x = conv1d(
-            x,
-            output_units,
-            kernel_size,
-            is_training,
-            activation=None,
-            dropout=False,
-            scope='conv1d_{}'.format(n_conv_layers - 1))
-    return x
-
-
-def postnet_linear_ulstm(inputs,
-                         n_conv_layers,
-                         filters,
-                         kernel_size,
-                         lstm_units,
-                         output_units,
-                         is_training,
-                         scope='postnet_linear'):
-    with tf.variable_scope(scope):
-        x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size,
-                           lstm_units, is_training)
-        x = tf.layers.dense(x, units=output_units)
-    return x
-
-
-def postnet_linear_lstm(inputs,
-                        n_conv_layers,
-                        filters,
-                        kernel_size,
-                        lstm_units,
-                        output_units,
-                        output_lengths,
-                        is_training,
-                        embedded_inputs_speaker2,
-                        mask=None,
-                        scope='postnet_linear'):
-    with tf.variable_scope(scope):
-        x = conv_and_lstm_dec(
-            inputs,
-            output_lengths,
-            n_conv_layers,
-            filters,
-            kernel_size,
-            lstm_units,
-            is_training,
-            embedded_inputs_speaker2,
-            mask=mask)
-        x = tf.layers.dense(x, units=output_units)
-    return x
-
-
-def postnet_linear(inputs,
-                   n_conv_layers,
-                   filters,
-                   kernel_size,
-                   lstm_units,
-                   output_units,
-                   output_lengths,
-                   is_training,
-                   embedded_inputs_speaker2,
-                   mask=None,
-                   scope='postnet_linear'):
-    with tf.variable_scope(scope):
-        x = conv_dec(
-            inputs,
-            output_lengths,
-            n_conv_layers,
-            filters,
-            kernel_size,
-            lstm_units,
-            is_training,
-            embedded_inputs_speaker2,
-            mask=mask)
-    return x
-
-
-def conv_and_lstm(inputs,
-                  sequence_lengths,
-                  n_conv_layers,
-                  filters,
-                  kernel_size,
-                  lstm_units,
-                  is_training,
-                  embedded_inputs_speaker,
-                  mask=None,
-                  scope='conv_and_lstm'):
-    from tensorflow.contrib.rnn import LSTMBlockCell
-    x = inputs
-    with tf.variable_scope(scope):
-        for i in range(n_conv_layers):
-            x = conv1d(
-                x,
-                filters,
-                kernel_size,
-                is_training,
-                activation=tf.nn.relu,
-                dropout=True,
-                mask=mask,
-                scope='conv1d_{}'.format(i))
-
-        x = tf.concat([x, embedded_inputs_speaker], axis=2)
-
-        outputs, states = tf.nn.bidirectional_dynamic_rnn(
-            LSTMBlockCell(lstm_units),
-            LSTMBlockCell(lstm_units),
-            x,
-            sequence_length=sequence_lengths,
-            dtype=tf.float32)
-        x = tf.concat(outputs, axis=-1)
-
-    return x
-
-
-def conv_and_lstm_dec(inputs,
-                      sequence_lengths,
-                      n_conv_layers,
-                      filters,
-                      kernel_size,
-                      lstm_units,
-                      is_training,
-                      embedded_inputs_speaker2,
-                      mask=None,
-                      scope='conv_and_lstm'):
-    x = inputs
-    from tensorflow.contrib.rnn import LSTMBlockCell
-    with tf.variable_scope(scope):
-        for i in range(n_conv_layers):
-            x = conv1d(
-                x,
-                filters,
-                kernel_size,
-                is_training,
-                activation=tf.nn.relu,
-                dropout=True,
-                mask=mask,
-                scope='conv1d_{}'.format(i))
-
-        x = tf.concat([x, embedded_inputs_speaker2], axis=2)
-
-        outputs, states = tf.nn.bidirectional_dynamic_rnn(
-            LSTMBlockCell(lstm_units),
-            LSTMBlockCell(lstm_units),
-            x,
-            sequence_length=sequence_lengths,
-            dtype=tf.float32)
-        x = tf.concat(outputs, axis=-1)
-    return x
-
-
-def conv_dec(inputs,
-             sequence_lengths,
-             n_conv_layers,
-             filters,
-             kernel_size,
-             lstm_units,
-             is_training,
-             embedded_inputs_speaker2,
-             mask=None,
-             scope='conv_and_lstm'):
-    x = inputs
-    with tf.variable_scope(scope):
-        for i in range(n_conv_layers):
-            x = conv1d(
-                x,
-                filters,
-                kernel_size,
-                is_training,
-                activation=tf.nn.relu,
-                dropout=True,
-                mask=mask,
-                scope='conv1d_{}'.format(i))
-        x = tf.concat([x, embedded_inputs_speaker2], axis=2)
-    return x
-
-
-def conv_and_ulstm(inputs,
-                   sequence_lengths,
-                   n_conv_layers,
-                   filters,
-                   kernel_size,
-                   lstm_units,
-                   is_training,
-                   scope='conv_and_ulstm'):
-    x = inputs
-    with tf.variable_scope(scope):
-        for i in range(n_conv_layers):
-            x = conv1d(
-                x,
-                filters,
-                kernel_size,
-                is_training,
-                activation=tf.nn.relu,
-                dropout=True,
-                scope='conv1d_{}'.format(i))
-
-        outputs, states = tf.nn.dynamic_rnn(
-            LSTMBlockCell(lstm_units),
-            x,
-            sequence_length=sequence_lengths,
-            dtype=tf.float32)
-
-    return outputs
-
-
-def conv1d(inputs,
-           filters,
-           kernel_size,
-           is_training,
-           activation=None,
-           dropout=False,
-           mask=None,
-           scope='conv1d'):
-    with tf.variable_scope(scope):
-        if mask is not None:
-            inputs = inputs * tf.expand_dims(mask, -1)
-        x = tf.layers.conv1d(
-            inputs, filters=filters, kernel_size=kernel_size, padding='same')
-        if mask is not None:
-            x = x * tf.expand_dims(mask, -1)
-
-        x = tf.layers.batch_normalization(x, training=is_training)
-        if activation is not None:
-            x = activation(x)
-        if dropout:
-            x = tf.layers.dropout(x, rate=0.5, training=is_training)
-    return x
-
-
-def conv1d_dp(inputs,
-              filters,
-              kernel_size,
-              is_training,
-              activation=None,
-              dropout=False,
-              dropoutrate=0.5,
-              mask=None,
-              scope='conv1d'):
-    with tf.variable_scope(scope):
-        if mask is not None:
-            inputs = inputs * tf.expand_dims(mask, -1)
-        x = tf.layers.conv1d(
-            inputs, filters=filters, kernel_size=kernel_size, padding='same')
-        if mask is not None:
-            x = x * tf.expand_dims(mask, -1)
-
-        x = tf.contrib.layers.layer_norm(x)
-        if activation is not None:
-            x = activation(x)
-        if dropout:
-            x = tf.layers.dropout(x, rate=dropoutrate, training=is_training)
-    return x
-
-
-def duration_predictor(inputs,
-                       n_conv_layers,
-                       filters,
-                       kernel_size,
-                       lstm_units,
-                       input_lengths,
-                       is_training,
-                       embedded_inputs_speaker,
-                       mask=None,
-                       scope='duration_predictor'):
-    with tf.variable_scope(scope):
-        x = inputs
-        for i in range(n_conv_layers):
-            x = conv1d_dp(
-                x,
-                filters,
-                kernel_size,
-                is_training,
-                activation=tf.nn.relu,
-                dropout=True,
-                dropoutrate=0.1,
-                mask=mask,
-                scope='conv1d_{}'.format(i))
-
-        x = tf.concat([x, embedded_inputs_speaker], axis=2)
-
-        outputs, states = tf.nn.bidirectional_dynamic_rnn(
-            LSTMBlockCell(lstm_units),
-            LSTMBlockCell(lstm_units),
-            x,
-            sequence_length=input_lengths,
-            dtype=tf.float32)
-        x = tf.concat(outputs, axis=-1)
-
-        x = tf.layers.dense(x, units=1)
-        x = tf.nn.relu(x)
-    return x
-
-
-def duration_predictor2(inputs,
-                        n_conv_layers,
-                        filters,
-                        kernel_size,
-                        input_lengths,
-                        is_training,
-                        mask=None,
-                        scope='duration_predictor'):
-    with tf.variable_scope(scope):
-        x = inputs
-        for i in range(n_conv_layers):
-            x = conv1d_dp(
-                x,
-                filters,
-                kernel_size,
-                is_training,
-                activation=tf.nn.relu,
-                dropout=True,
-                dropoutrate=0.1,
-                mask=mask,
-                scope='conv1d_{}'.format(i))
-
-        x = tf.layers.dense(x, units=1)
-        x = tf.nn.relu(x)
-    return x
-
-
-def conv_prenet(inputs,
-                n_conv_layers,
-                filters,
-                kernel_size,
-                is_training,
-                mask=None,
-                scope='conv_prenet'):
-    x = inputs
-    with tf.variable_scope(scope):
-        for i in range(n_conv_layers):
-            x = conv1d(
-                x,
-                filters,
-                kernel_size,
-                is_training,
-                activation=tf.nn.relu,
-                dropout=True,
-                mask=mask,
-                scope='conv1d_{}'.format(i))
-
-    return x
diff --git a/modelscope/models/audio/tts/models/compat.py b/modelscope/models/audio/tts/models/compat.py
deleted file mode 100755
index bb810841..00000000
--- a/modelscope/models/audio/tts/models/compat.py
+++ /dev/null
@@ -1,82 +0,0 @@
-"""Functions for compatibility with different TensorFlow versions."""
-
-import tensorflow as tf
-
-
-def is_tf2():
-    """Returns ``True`` if running TensorFlow 2.0."""
-    return tf.__version__.startswith('2')
-
-
-def tf_supports(symbol):
-    """Returns ``True`` if TensorFlow defines :obj:`symbol`."""
-    return _string_to_tf_symbol(symbol) is not None
-
-
-def tf_any(*symbols):
-    """Returns the first supported symbol."""
-    for symbol in symbols:
-        module = _string_to_tf_symbol(symbol)
-        if module is not None:
-            return module
-    return None
-
-
-def tf_compat(v2=None, v1=None):  # pylint: disable=invalid-name
-    """Returns the compatible symbol based on the current TensorFlow version.
-
-    Args:
-      v2: The candidate v2 symbol name.
-      v1: The candidate v1 symbol name.
-
-    Returns:
-      A TensorFlow symbol.
-
-    Raises:
-      ValueError: if no symbol can be found.
-    """
-    candidates = []
-    if v2 is not None:
-        candidates.append(v2)
-    if v1 is not None:
-        candidates.append(v1)
-        candidates.append('compat.v1.%s' % v1)
-    symbol = tf_any(*candidates)
-    if symbol is None:
-        raise ValueError('Failure to resolve the TensorFlow symbol')
-    return symbol
-
-
-def name_from_variable_scope(name=''):
-    """Creates a name prefixed by the current variable scope."""
-    var_scope = tf_compat(v1='get_variable_scope')().name
-    compat_name = ''
-    if name:
-        compat_name = '%s/' % name
-    if var_scope:
-        compat_name = '%s/%s' % (var_scope, compat_name)
-    return compat_name
-
-
-def reuse():
-    """Returns ``True`` if the current variable scope is marked for reuse."""
-    return tf_compat(v1='get_variable_scope')().reuse
-
-
-def _string_to_tf_symbol(symbol):
-    modules = symbol.split('.')
-    namespace = tf
-    for module in modules:
-        namespace = getattr(namespace, module, None)
-        if namespace is None:
-            return None
-    return namespace
-
-
-# pylint: disable=invalid-name
-gfile_copy = tf_compat(v2='io.gfile.copy', v1='gfile.Copy')
-gfile_exists = tf_compat(v2='io.gfile.exists', v1='gfile.Exists')
-gfile_open = tf_compat(v2='io.gfile.GFile', v1='gfile.GFile')
-is_tensor = tf_compat(v2='is_tensor', v1='contrib.framework.is_tensor')
-logging = tf_compat(v1='logging')
-nest = tf_compat(v2='nest', v1='contrib.framework.nest')
diff --git a/modelscope/models/audio/tts/text/__init__.py b/modelscope/models/audio/tts/models/datasets/__init__.py
similarity index 100%
rename from modelscope/models/audio/tts/text/__init__.py
rename to modelscope/models/audio/tts/models/datasets/__init__.py
diff --git a/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py b/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py
new file mode 100644
index 00000000..cc47d0c4
--- /dev/null
+++ b/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py
@@ -0,0 +1,277 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+
+import json
+import numpy as np
+import torch
+from torch.utils.data import Dataset
+from tqdm import tqdm
+
+from modelscope.utils.logger import get_logger
+from .units import KanTtsLinguisticUnit
+
+logger = get_logger()
+
+
+class KanTtsText2MelDataset(Dataset):
+    """Text-to-mel training dataset driven by a ``|``-separated metadata file.
+
+    Every metadata line is split on ``|`` and read as: utterance id, mel
+    file, length, linguistic symbol sequence, duration file, pitch contour
+    file and energy contour file. File names are resolved relative to the
+    directory of the metadata file and loaded with ``np.load``.
+
+    Args:
+        metadata_filename: Path of the metadata file (read as UTF-8).
+        config_filename: Path of the JSON config; the ``audio``, ``am`` and
+            optional ``adv`` sections are read here.
+        cache: If ``True``, load every sample once in the constructor and
+            serve ``__getitem__`` from memory.
+    """
+
+    def __init__(self, metadata_filename, config_filename, cache=False):
+        super(KanTtsText2MelDataset, self).__init__()
+
+        self.cache = cache
+
+        # Read the config as UTF-8, like the metadata below, so parsing does
+        # not depend on the platform's default encoding.
+        with open(config_filename, encoding='utf-8') as f:
+            self._config = json.load(f)
+
+        # Load metadata:
+        self._datadir = os.path.dirname(metadata_filename)
+        with open(metadata_filename, encoding='utf-8') as f:
+            self._metadata = [line.strip().split('|') for line in f]
+            self._length_lst = [int(x[2]) for x in self._metadata]
+            hours = sum(
+                self._length_lst) * self._config['audio']['frame_shift_ms'] / (
+                    3600 * 1000)
+
+            logger.info('Loaded metadata for %d examples (%.2f hours)' %
+                        (len(self._metadata), hours))
+            logger.info('Minimum length: %d, Maximum length: %d' %
+                        (min(self._length_lst), max(self._length_lst)))
+
+        self.ling_unit = KanTtsLinguisticUnit(config_filename)
+        self.pad_executor = KanTtsText2MelPad()
+
+        self.r = self._config['am']['outputs_per_step']
+        self.num_mels = self._config['am']['num_mels']
+
+        # Optional random feature window, only present with an 'adv' section.
+        if 'adv' in self._config:
+            self.feat_window = self._config['adv']['random_window']
+        else:
+            self.feat_window = None
+        logger.info(self.feat_window)
+
+        self.data_cache = [
+            self.cache_load(i) for i in tqdm(range(self.__len__()))
+        ] if self.cache else []
+
+    def get_frames_lst(self):
+        """Return the per-sample lengths parsed from the metadata file."""
+        return self._length_lst
+
+    def __getitem__(self, index):
+        """Return sample ``index``, from memory when caching is enabled."""
+        if self.cache:
+            sample = self.data_cache[index]
+            return sample
+
+        return self.cache_load(index)
+
+    def cache_load(self, index):
+        """Load sample ``index`` from disk and return it as a dict."""
+        sample = {}
+
+        meta = self._metadata[index]
+
+        sample['utt_id'] = meta[0]
+
+        sample['mel_target'] = np.load(os.path.join(
+            self._datadir, meta[1]))[:, :self.num_mels]
+        sample['output_length'] = len(sample['mel_target'])
+
+        lfeat_symbol = meta[3]
+        sample['ling'] = self.ling_unit.encode_symbol_sequence(lfeat_symbol)
+
+        sample['duration'] = np.load(os.path.join(self._datadir, meta[4]))
+
+        sample['pitch_contour'] = np.load(os.path.join(self._datadir, meta[5]))
+
+        sample['energy_contour'] = np.load(
+            os.path.join(self._datadir, meta[6]))
+
+        return sample
+
+    def __len__(self):
+        return len(self._metadata)
+
+    def collate_fn(self, batch):
+        """Pad a list of samples from ``cache_load`` into one batch dict.
+
+        Linguistic, pitch and energy inputs are padded to the longest symbol
+        sequence in the batch; mel targets are padded to the longest output
+        length (at least ``feat_window`` when it is set), rounded up to a
+        multiple of ``self.r``.
+        """
+        data_dict = {}
+
+        max_input_length = max((len(x['ling'][0]) for x in batch))
+
+        # pure linguistic info: sy|tone|syllable_flag|word_segment
+
+        # sy
+        lfeat_type = self.ling_unit._lfeat_type_list[0]
+        inputs_sy = self.pad_executor._prepare_scalar_inputs(
+            [x['ling'][0] for x in batch], max_input_length,
+            self.ling_unit._sub_unit_pad[lfeat_type]).long()
+        # tone
+        lfeat_type = self.ling_unit._lfeat_type_list[1]
+        inputs_tone = self.pad_executor._prepare_scalar_inputs(
+            [x['ling'][1] for x in batch], max_input_length,
+            self.ling_unit._sub_unit_pad[lfeat_type]).long()
+
+        # syllable_flag
+        lfeat_type = self.ling_unit._lfeat_type_list[2]
+        inputs_syllable_flag = self.pad_executor._prepare_scalar_inputs(
+            [x['ling'][2] for x in batch], max_input_length,
+            self.ling_unit._sub_unit_pad[lfeat_type]).long()
+
+        # word_segment
+        lfeat_type = self.ling_unit._lfeat_type_list[3]
+        inputs_ws = self.pad_executor._prepare_scalar_inputs(
+            [x['ling'][3] for x in batch], max_input_length,
+            self.ling_unit._sub_unit_pad[lfeat_type]).long()
+
+        # emotion category
+        lfeat_type = self.ling_unit._lfeat_type_list[4]
+        data_dict['input_emotions'] = self.pad_executor._prepare_scalar_inputs(
+            [x['ling'][4] for x in batch], max_input_length,
+            self.ling_unit._sub_unit_pad[lfeat_type]).long()
+
+        # speaker category
+        lfeat_type = self.ling_unit._lfeat_type_list[5]
+        data_dict['input_speakers'] = self.pad_executor._prepare_scalar_inputs(
+            [x['ling'][5] for x in batch], max_input_length,
+            self.ling_unit._sub_unit_pad[lfeat_type]).long()
+
+        data_dict['input_lings'] = torch.stack(
+            [inputs_sy, inputs_tone, inputs_syllable_flag, inputs_ws], dim=2)
+
+        # The symbol sequence ends with one '~', which is excluded from the
+        # valid length.
+        data_dict['valid_input_lengths'] = torch.as_tensor(
+            [len(x['ling'][0]) - 1 for x in batch], dtype=torch.long)
+
+        data_dict['valid_output_lengths'] = torch.as_tensor(
+            [x['output_length'] for x in batch], dtype=torch.long)
+        max_output_length = torch.max(data_dict['valid_output_lengths']).item()
+        max_output_round_length = self.pad_executor._round_up(
+            max_output_length, self.r)
+
+        if self.feat_window is not None:
+            active_feat_len = np.minimum(max_output_round_length,
+                                         self.feat_window)
+            if active_feat_len < self.feat_window:
+                max_output_round_length = self.pad_executor._round_up(
+                    self.feat_window, self.r)
+                active_feat_len = self.feat_window
+
+            max_offsets = [x['output_length'] - active_feat_len for x in batch]
+            feat_offsets = [
+                np.random.randint(0, np.maximum(1, offset))
+                for offset in max_offsets
+            ]
+            feat_offsets = torch.from_numpy(
+                np.asarray(feat_offsets, dtype=np.int32)).long()
+            data_dict['feat_offsets'] = feat_offsets
+
+        data_dict['mel_targets'] = self.pad_executor._prepare_targets(
+            [x['mel_target'] for x in batch], max_output_round_length, 0.0)
+        data_dict['durations'] = self.pad_executor._prepare_durations(
+            [x['duration'] for x in batch], max_input_length,
+            max_output_round_length)
+
+        data_dict['pitch_contours'] = self.pad_executor._prepare_scalar_inputs(
+            [x['pitch_contour'] for x in batch], max_input_length,
+            0.0).float()
+        data_dict[
+            'energy_contours'] = self.pad_executor._prepare_scalar_inputs(
+                [x['energy_contour'] for x in batch], max_input_length,
+                0.0).float()
+
+        data_dict['utt_ids'] = [x['utt_id'] for x in batch]
+
+        return data_dict
+
+
+class KanTtsText2MelPad(object):
+    """Helpers that pad lists of numpy arrays into batched tensors."""
+
+    def __init__(self):
+        super(KanTtsText2MelPad, self).__init__()
+
+    def _pad1D(self, x, length, pad):
+        """Right-pad the 1-D array ``x`` with ``pad`` up to ``length``."""
+        return np.pad(
+            x, (0, length - x.shape[0]), mode='constant', constant_values=pad)
+
+    def _pad2D(self, x, length, pad):
+        """Pad the first axis of the 2-D array ``x`` with ``pad`` rows."""
+        return np.pad(
+            x, [(0, length - x.shape[0]), (0, 0)],
+            mode='constant',
+            constant_values=pad)
+
+    def _pad_durations(self, duration, max_in_len, max_out_len):
+        """Pad ``duration`` towards the batch input length.
+
+        When the durations sum to fewer than ``max_out_len`` frames, the
+        missing frames are appended as one extra entry before zero-filling,
+        so the padded durations sum to ``max_out_len``. Otherwise only
+        zeros are appended.
+        """
+        framenum = np.sum(duration)
+        symbolnum = duration.shape[0]
+        if framenum < max_out_len:
+            padframenum = max_out_len - framenum
+            duration = np.insert(
+                duration, symbolnum, values=padframenum, axis=0)
+            duration = np.insert(
+                duration,
+                symbolnum + 1,
+                values=[0] * (max_in_len - symbolnum - 1),
+                axis=0)
+        else:
+            if symbolnum < max_in_len:
+                duration = np.insert(
+                    duration,
+                    symbolnum,
+                    values=[0] * (max_in_len - symbolnum),
+                    axis=0)
+        return duration
+
+    def _round_up(self, x, multiple):
+        """Round ``x`` up to the nearest multiple of ``multiple``."""
+        remainder = x % multiple
+        return x if remainder == 0 else x + multiple - remainder
+
+    def _prepare_scalar_inputs(self, inputs, max_len, pad):
+        """Pad 1-D arrays to ``max_len`` and stack them into a tensor."""
+        return torch.from_numpy(
+            np.stack([self._pad1D(x, max_len, pad) for x in inputs]))
+
+    def _prepare_targets(self, targets, max_len, pad):
+        """Pad 2-D arrays to ``max_len`` rows and stack as a float tensor."""
+        return torch.from_numpy(
+            np.stack([self._pad2D(t, max_len, pad) for t in targets])).float()
+
+    def _prepare_durations(self, durations, max_in_len, max_out_len):
+        """Pad duration arrays and stack them into a long tensor."""
+        return torch.from_numpy(
+            np.stack([
+                self._pad_durations(t, max_in_len, max_out_len)
+                for t in durations
+            ])).long()
diff --git a/modelscope/models/audio/tts/models/datasets/samplers.py b/modelscope/models/audio/tts/models/datasets/samplers.py
new file mode 100644
index 00000000..0657fa8a
--- /dev/null
+++ b/modelscope/models/audio/tts/models/datasets/samplers.py
@@ -0,0 +1,148 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import math
+import random
+
+import torch
+from torch import distributed as dist
+from torch.utils.data import Sampler
+
+
+def _len_sorted_indices(indices, length_lst, group_size):
+    """Order ``indices`` by descending length inside consecutive groups.
+
+    Args:
+        indices: Sample indices in the order they should be grouped.
+        length_lst: Sequence mapping a sample index to its length.
+        group_size: Number of consecutive indices per group; the last
+            group may be shorter.
+
+    Returns:
+        A new list holding every entry of ``indices`` exactly once, with
+        each group sorted from the longest to the shortest sample.
+    """
+    ordered = []
+    for start in range(0, len(indices), group_size):
+        group = indices[start:start + group_size]
+        # list.sort is stable, so samples of equal length keep their
+        # incoming relative order.
+        group.sort(key=lambda idx: length_lst[idx], reverse=True)
+        ordered.extend(group)
+    return ordered
+
+
+class LenSortGroupPoolSampler(Sampler):
+    """Shuffling sampler that sorts each group of samples by length.
+
+    The indices ``0 .. len(length_lst) - 1`` are shuffled, cut into
+    consecutive groups of ``group_size`` and every group is sorted by
+    descending length, so neighbouring samples have similar lengths.
+
+    Args:
+        data_source: Dataset to sample from; its ``len()`` is reported by
+            ``__len__``.
+        length_lst: Length of every sample, indexed by sample index.
+        group_size: Number of samples per length-sorted group.
+    """
+
+    def __init__(self, data_source, length_lst, group_size):
+        super(LenSortGroupPoolSampler, self).__init__(data_source)
+
+        self.data_source = data_source
+        self.length_lst = length_lst
+        self.group_size = group_size
+
+        self.num = len(self.length_lst)
+        self.buckets = self.num // group_size
+
+    def __iter__(self):
+        shuffled = torch.randperm(self.num).tolist()
+        # The helper also covers the trailing partial group, so every index
+        # is yielded exactly once and the count matches ``self.num``.
+        return iter(
+            _len_sorted_indices(shuffled, self.length_lst, self.group_size))
+
+    def __len__(self):
+        return len(self.data_source)
+
+
+class DistributedLenSortGroupPoolSampler(Sampler):
+    """Distributed counterpart of ``LenSortGroupPoolSampler``.
+
+    The (optionally shuffled) index list is padded to a multiple of
+    ``num_replicas``, each rank takes a strided slice of it, and that slice
+    is sorted by descending length inside groups of ``group_size``.
+
+    Args:
+        dataset: Dataset to sample from; only its ``len()`` is used.
+        length_lst: Length of every sample, indexed by sample index.
+        group_size: Number of samples per length-sorted group.
+        num_replicas: Number of processes; defaults to
+            ``dist.get_world_size()``.
+        rank: Rank of this process; defaults to ``dist.get_rank()``.
+        shuffle: If ``True``, shuffle with a generator seeded by the epoch
+            given to ``set_epoch``; otherwise keep the natural order.
+    """
+
+    def __init__(self,
+                 dataset,
+                 length_lst,
+                 group_size,
+                 num_replicas=None,
+                 rank=None,
+                 shuffle=True):
+        super(DistributedLenSortGroupPoolSampler, self).__init__(dataset)
+
+        if num_replicas is None:
+            if not dist.is_available():
+                raise RuntimeError(
+                    'modelscope error: Requires distributed package to be available'
+                )
+            num_replicas = dist.get_world_size()
+        if rank is None:
+            if not dist.is_available():
+                raise RuntimeError(
+                    'modelscope error: Requires distributed package to be available'
+                )
+            rank = dist.get_rank()
+        self.dataset = dataset
+        self.length_lst = length_lst
+        self.group_size = group_size
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.epoch = 0
+        self.num_samples = int(
+            math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
+        self.total_size = self.num_samples * self.num_replicas
+        self.buckets = self.num_samples // group_size
+        self.shuffle = shuffle
+
+    def __iter__(self):
+        # deterministically shuffle based on epoch
+        g = torch.Generator()
+        g.manual_seed(self.epoch)
+        if self.shuffle:
+            indices = torch.randperm(len(self.dataset), generator=g).tolist()
+        else:
+            indices = list(range(len(self.dataset)))
+
+        # add extra samples to make it evenly divisible
+        indices += indices[:(self.total_size - len(indices))]
+        assert len(indices) == self.total_size
+
+        # subsample
+        indices = indices[self.rank:self.total_size:self.num_replicas]
+        assert len(indices) == self.num_samples
+
+        # Group by similar length; the helper emits each index exactly once,
+        # so the iterator yields ``self.num_samples`` items as ``__len__``
+        # reports.
+        return iter(
+            _len_sorted_indices(indices, self.length_lst, self.group_size))
+
+    def __len__(self):
+        return self.num_samples
+
+    def set_epoch(self, epoch):
+        """Set the epoch used to seed the shuffle in ``__iter__``."""
+        self.epoch = epoch
diff --git a/modelscope/models/audio/tts/models/datasets/units/__init__.py b/modelscope/models/audio/tts/models/datasets/units/__init__.py
new file mode 100644
index 00000000..4d03df04
--- /dev/null
+++ b/modelscope/models/audio/tts/models/datasets/units/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from .ling_unit import *  # noqa F403
diff --git a/modelscope/models/audio/tts/models/datasets/units/cleaners.py b/modelscope/models/audio/tts/models/datasets/units/cleaners.py
new file mode 100644
index 00000000..07d4fbdb
--- /dev/null
+++ b/modelscope/models/audio/tts/models/datasets/units/cleaners.py
@@ -0,0 +1,88 @@
+# from https://github.com/keithito/tacotron
+# Cleaners are transformations that run over the input text at both training and eval time.
+#
+# Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
+# hyperparameter. Some cleaners are English-specific. You'll typically want to use:
+#   1. "english_cleaners" for English text
+#   2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
+#      the Unidecode library (https://pypi.python.org/pypi/Unidecode)
+#   3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
+#      the symbols in symbols.py to match your data).
+
+import re
+
+from unidecode import unidecode
+
+from .numbers import normalize_numbers
+
+# Matches any run of one or more whitespace characters:
+_whitespace_re = re.compile(r'\s+')
+
+# (case-insensitive regex for '<abbreviation>.', replacement) pairs:
+_abbreviations = [
+    (re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
+    for x in [('mrs', 'misess'),
+              ('mr', 'mister'),
+              ('dr', 'doctor'),
+              ('st', 'saint'),
+              ('co', 'company'),
+              ('jr', 'junior'),
+              ('maj', 'major'),
+              ('gen', 'general'),
+              ('drs', 'doctors'),
+              ('rev', 'reverend'),
+              ('lt', 'lieutenant'),
+              ('hon', 'honorable'),
+              ('sgt', 'sergeant'),
+              ('capt', 'captain'),
+              ('esq', 'esquire'),
+              ('ltd', 'limited'),
+              ('col', 'colonel'),
+              ('ft', 'fort'), ]]  # yapf:disable
+
+
+def expand_abbreviations(text):
+    for pattern, expansion in _abbreviations:  # applied in list order
+        text = pattern.sub(expansion, text)
+    return text
+
+
+def expand_numbers(text):
+    return normalize_numbers(text)  # delegates to .numbers.normalize_numbers
+
+
+def lowercase(text):
+    return text.lower()  # relies on text providing .lower()
+
+
+def collapse_whitespace(text):
+    return _whitespace_re.sub(' ', text)  # each whitespace run -> one space
+
+
+def convert_to_ascii(text):
+    return unidecode(text)  # transliteration is delegated to Unidecode
+
+
+def basic_cleaners(text):
+    '''Lowercase the text and collapse whitespace runs; no transliteration.'''
+    # Steps, in order: lowercase -> collapse_whitespace.
+    lowered = lowercase(text)
+    return collapse_whitespace(lowered)
+
+
+def transliteration_cleaners(text):
+    '''Transliterate to ASCII, lowercase, then collapse whitespace runs.'''
+    # Steps, in order: convert_to_ascii -> lowercase -> collapse_whitespace.
+    ascii_text = convert_to_ascii(text)
+    lowered = lowercase(ascii_text)
+    return collapse_whitespace(lowered)
+
+
+def english_cleaners(text):
+    '''English pipeline with number and abbreviation expansion.'''
+    # Each stage feeds the next; whitespace is collapsed last.
+    stages = (convert_to_ascii, lowercase, expand_numbers,
+              expand_abbreviations, collapse_whitespace)
+    for stage in stages:
+        text = stage(text)
+    return text
diff --git a/modelscope/models/audio/tts/models/datasets/units/ling_unit.py b/modelscope/models/audio/tts/models/datasets/units/ling_unit.py
new file mode 100644
index 00000000..3c211cc7
--- /dev/null
+++ b/modelscope/models/audio/tts/models/datasets/units/ling_unit.py
@@ -0,0 +1,395 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import abc
+import codecs
+import os
+import re
+import shutil
+
+import json
+import numpy as np
+
+from . import cleaners as cleaners
+
+# Regular expression matching text enclosed in curly braces:
+_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
+
+
+def _clean_text(text, cleaner_names):
+    for name in cleaner_names:
+        cleaner = getattr(cleaners, name, None)  # None if name is unknown
+        if not cleaner:
+            raise Exception(
+                'modelscope error: configuration cleaner unknown: %s' % name)
+        text = cleaner(text)
+    return text
+
+
+class LinguisticBaseUnit(abc.ABC):
+
+    def set_config_params(self, config_params):
+        self.config_params = config_params  # stored as-is, no validation
+
+    def save(self, config, config_name, path):
+        destination = os.path.join(path, config_name)
+        if config != destination:  # skip when source already is the target
+            os.makedirs(path, exist_ok=True)
+            shutil.copyfile(config, destination)
+
+
+class KanTtsLinguisticUnit(LinguisticBaseUnit):
+
+    def __init__(self, config, path, has_mask=True):
+        super().__init__()
+
+        # Caller-supplied settings.
+        self._unit_config = config
+        self._path = path
+        self._has_mask = has_mask
+        # Reserved symbols: padding, end-of-sequence and mask.
+        self._pad, self._eos, self._mask = '_', '~', '@[MASK]'
+
+        # Comma-separated config strings -> lists.
+        raw_cleaners = config['cleaners']
+        self._cleaner_names = [c.strip() for c in raw_cleaners.split(',')]
+        raw_lfeat_types = config['lfeat_type_list']
+        self._lfeat_type_list = raw_lfeat_types.strip().split(',')
+
+        # Populate the symbol tables.
+        self.build()
+
+    def get_unit_size(self):
+        """Return the vocabulary size of each active sub-unit, keyed by name."""
+        sizes = {
+            'sy': len(self.sy),
+            'tone': len(self.tone),
+            'syllable_flag': len(self.syllable_flag),
+            'word_segment': len(self.word_segment),
+        }
+        if 'emo_category' in self._lfeat_type_list:
+            sizes['emotion'] = len(self.emo_category)
+        if 'speaker_category' in self._lfeat_type_list:
+            sizes['speaker'] = len(self.speaker)
+        return sizes
+
+    def build(self):
+        """Load the configured symbol files (closing each) and build id tables."""
+        self._sub_unit_dim = {}
+        self._sub_unit_pad = {}
+        # sy sub-unit
+        _characters = ''
+
+        _ch_symbols = []
+
+        sy_path = os.path.join(self._path, self._unit_config['sy'])
+        with codecs.open(sy_path, 'r') as f:
+            for line in f:
+                line = line.strip('\r\n')
+                _ch_symbols.append(line)
+
+        _arpabet = ['@' + s for s in _ch_symbols]
+
+        # Export all symbols:
+        self.sy = list(_characters) + _arpabet + [self._pad, self._eos]
+        if self._has_mask:
+            self.sy.append(self._mask)
+        self._sy_to_id = {s: i for i, s in enumerate(self.sy)}
+        self._id_to_sy = {i: s for i, s in enumerate(self.sy)}
+        self._sub_unit_dim['sy'] = len(self.sy)
+        self._sub_unit_pad['sy'] = self._sy_to_id['_']
+
+        # tone sub-unit
+        _characters = ''
+
+        _ch_tones = []
+
+        tone_path = os.path.join(self._path, self._unit_config['tone'])
+        with codecs.open(tone_path, 'r') as f:
+            for line in f:
+                line = line.strip('\r\n')
+                _ch_tones.append(line)
+
+        # Export all tones:
+        self.tone = list(_characters) + _ch_tones + [self._pad, self._eos]
+        if self._has_mask:
+            self.tone.append(self._mask)
+        self._tone_to_id = {s: i for i, s in enumerate(self.tone)}
+        self._id_to_tone = {i: s for i, s in enumerate(self.tone)}
+        self._sub_unit_dim['tone'] = len(self.tone)
+        self._sub_unit_pad['tone'] = self._tone_to_id['_']
+
+        # syllable flag sub-unit
+        _characters = ''
+
+        _ch_syllable_flags = []
+
+        sy_flag_path = os.path.join(self._path,
+                                    self._unit_config['syllable_flag'])
+        with codecs.open(sy_flag_path, 'r') as f:
+            for line in f:
+                line = line.strip('\r\n')
+                _ch_syllable_flags.append(line)
+
+        # Export all syllable_flags:
+        self.syllable_flag = list(_characters) + _ch_syllable_flags + [
+            self._pad, self._eos
+        ]
+        if self._has_mask:
+            self.syllable_flag.append(self._mask)
+        self._syllable_flag_to_id = {
+            s: i
+            for i, s in enumerate(self.syllable_flag)
+        }
+        self._id_to_syllable_flag = {
+            i: s
+            for i, s in enumerate(self.syllable_flag)
+        }
+        self._sub_unit_dim['syllable_flag'] = len(self.syllable_flag)
+        self._sub_unit_pad['syllable_flag'] = self._syllable_flag_to_id['_']
+
+        # word segment sub-unit
+        _characters = ''
+
+        _ch_word_segments = []
+
+        ws_path = os.path.join(self._path, self._unit_config['word_segment'])
+        with codecs.open(ws_path, 'r') as f:
+            for line in f:
+                line = line.strip('\r\n')
+                _ch_word_segments.append(line)
+
+        # Export all word segments:
+        self.word_segment = list(_characters) + _ch_word_segments + [
+            self._pad, self._eos
+        ]
+        if self._has_mask:
+            self.word_segment.append(self._mask)
+        self._word_segment_to_id = {
+            s: i
+            for i, s in enumerate(self.word_segment)
+        }
+        self._id_to_word_segment = {
+            i: s
+            for i, s in enumerate(self.word_segment)
+        }
+        self._sub_unit_dim['word_segment'] = len(self.word_segment)
+        self._sub_unit_pad['word_segment'] = self._word_segment_to_id['_']
+
+        if 'emo_category' in self._lfeat_type_list:
+            # emotion category sub-unit
+            _characters = ''
+
+            _ch_emo_types = []
+
+            emo_path = os.path.join(self._path,
+                                    self._unit_config['emo_category'])
+            with codecs.open(emo_path, 'r') as f:
+                for line in f:
+                    line = line.strip('\r\n')
+                    _ch_emo_types.append(line)
+
+            self.emo_category = list(_characters) + _ch_emo_types + [
+                self._pad, self._eos
+            ]
+            if self._has_mask:
+                self.emo_category.append(self._mask)
+            self._emo_category_to_id = {
+                s: i
+                for i, s in enumerate(self.emo_category)
+            }
+            self._id_to_emo_category = {
+                i: s
+                for i, s in enumerate(self.emo_category)
+            }
+            self._sub_unit_dim['emo_category'] = len(self.emo_category)
+            self._sub_unit_pad['emo_category'] = self._emo_category_to_id['_']
+
+        if 'speaker_category' in self._lfeat_type_list:
+            # speaker category sub-unit
+            _characters = ''
+
+            _ch_speakers = []
+
+            speaker_path = os.path.join(self._path,
+                                        self._unit_config['speaker_category'])
+            with codecs.open(speaker_path, 'r') as f:
+                for line in f:
+                    line = line.strip('\r\n')
+                    _ch_speakers.append(line)
+
+            # Export all speakers:
+            self.speaker = list(_characters) + _ch_speakers + [
+                self._pad, self._eos
+            ]
+            if self._has_mask:
+                self.speaker.append(self._mask)
+            self._speaker_to_id = {s: i for i, s in enumerate(self.speaker)}
+            self._id_to_speaker = {i: s for i, s in enumerate(self.speaker)}
+            self._sub_unit_dim['speaker_category'] = len(self._speaker_to_id)
+            self._sub_unit_pad['speaker_category'] = self._speaker_to_id['_']
+
+    def encode_symbol_sequence(self, lfeat_symbol):
+        """Encode '{a$b$...}' tokens into one int32 array per lfeat type.
+
+        The k-th '$'-separated field of every token is gathered and encoded
+        with the k-th entry of the configured lfeat type list.
+        """
+        num_types = int(len(self._lfeat_type_list))
+        tokens = lfeat_symbol.strip().split(' ')
+
+        # Regroup the fields: one space-terminated string per lfeat type.
+        per_type = [''] * num_types
+        for token in tokens:
+            fields = token.strip('{').strip('}').split('$')
+            for slot in range(num_types):
+                per_type[slot] = per_type[slot] + fields[slot] + ' '
+
+        input_and_label_data = []
+        for slot, lfeat_type in enumerate(self._lfeat_type_list):
+            sequence = self.encode_sub_unit(per_type[slot].strip(),
+                                            lfeat_type)
+            sequence_array = np.asarray(sequence, dtype=np.int32)
+            input_and_label_data.append(sequence_array)
+
+        return input_and_label_data
+
+    def decode_symbol_sequence(self, sequence):
+        """Decode ``sequence[i]`` with the decoder of the i-th lfeat type.
+
+        Returns 'type:symbol' strings; ids presumably scalars (TODO confirm).
+        """
+        decoders = {
+            'sy': self.decode_sy,
+            'tone': self.decode_tone,
+            'syllable_flag': self.decode_syllable_flag,
+            'word_segment': self.decode_word_segment,
+            'emo_category': self.decode_emo_category,
+            'speaker_category': self.decode_speaker_category,
+        }
+        result = []
+        for i, lfeat_type in enumerate(self._lfeat_type_list):
+            sequence_item = sequence[i].tolist()
+            if lfeat_type not in decoders:
+                raise Exception(
+                    'modelscope error: configuration lfeat type(%s) unknown.'
+                    % lfeat_type)
+            decoded = decoders[lfeat_type](sequence_item)
+            result.append('%s:%s' % (lfeat_type, decoded))
+
+        return result
+
+    def encode_sub_unit(self, this_lfeat_symbol, lfeat_type):
+        """Encode the space-separated symbols of one lfeat type into ids.
+
+        'sy' symbols are wrapped in curly braces and sent through
+        encode_text; every other known type has a dedicated encoder.
+        Raises Exception for an lfeat type that is not recognised.
+        """
+        if lfeat_type == 'sy':
+            pieces = this_lfeat_symbol.strip().split(' ')
+            # 'a b' becomes '{a} {b} ' (trailing space included).
+            braced = ''.join('{' + piece + '}' + ' ' for piece in pieces)
+            return self.encode_text(braced, self._cleaner_names)
+
+        encoders = {
+            'tone': self.encode_tone,
+            'syllable_flag': self.encode_syllable_flag,
+            'word_segment': self.encode_word_segment,
+            'emo_category': self.encode_emo_category,
+            'speaker_category': self.encode_speaker_category,
+        }
+        if lfeat_type not in encoders:
+            raise Exception(
+                'modelscope error: configuration lfeat type(%s) unknown.'
+                % lfeat_type)
+
+        # The remaining encoders receive the symbol string untouched.
+        sequence = encoders[lfeat_type](this_lfeat_symbol)
+        return sequence
+
+    def encode_text(self, text, cleaner_names):
+        """Encode text to 'sy' ids; '{...}' spans are taken as '@' symbols."""
+        ids = []
+        rest = text
+        # Peel off one '{...}' span per pass until none is left.
+        while rest:
+            match = _curly_re.match(rest)
+            if match is None:
+                ids.extend(self.encode_sy(_clean_text(rest, cleaner_names)))
+                break
+            before, braced, rest = match.groups()
+            ids.extend(self.encode_sy(_clean_text(before, cleaner_names)))
+            ids.extend(self.encode_arpanet(braced))
+
+        ids.append(self._sy_to_id['~'])  # EOS
+        return ids
+
+    def encode_sy(self, sy):  # ids of the symbols should_keep_sy() accepts
+        return [self._sy_to_id[s] for s in sy if self.should_keep_sy(s)]
+
+    def decode_sy(self, id):
+        """Look up the symbol for ``id``, dropping a leading '@' marker."""
+        symbol = self._id_to_sy[id]
+        has_marker = len(symbol) > 1 and symbol[0] == '@'
+        return symbol[1:] if has_marker else symbol
+
+    def should_keep_sy(self, s):
+        return s in self._sy_to_id and s not in ('_', '~')
+
+    def encode_arpanet(self, text):  # '@'-prefix each token, then encode_sy
+        return self.encode_sy(['@' + s for s in text.split()])
+
+    def encode_tone(self, tone):
+        """Map space-separated tone symbols to ids, then append the '~' id."""
+        table = self._tone_to_id
+        symbols = tone.strip().split(' ')
+        sequence = [table[symbol] for symbol in symbols]
+        sequence.append(table['~'])
+        return sequence
+
+    def decode_tone(self, id):
+        return self._id_to_tone[id]  # KeyError if id is not in the table
+
+    def encode_syllable_flag(self, syllable_flag):
+        """Map space-separated syllable flags to ids, then append '~'."""
+        table = self._syllable_flag_to_id
+        symbols = syllable_flag.strip().split(' ')
+        sequence = [table[symbol] for symbol in symbols]
+        sequence.append(table['~'])
+        return sequence
+
+    def decode_syllable_flag(self, id):
+        return self._id_to_syllable_flag[id]  # KeyError if id is unknown
+
+    def encode_word_segment(self, word_segment):
+        """Map space-separated word-segment tags to ids, then append '~'."""
+        table = self._word_segment_to_id
+        symbols = word_segment.strip().split(' ')
+        sequence = [table[symbol] for symbol in symbols]
+        sequence.append(table['~'])
+        return sequence
+
+    def decode_word_segment(self, id):
+        return self._id_to_word_segment[id]  # KeyError if id is unknown
+
+    def encode_emo_category(self, emo_type):
+        """Map space-separated emotion categories to ids, then append '~'."""
+        table = self._emo_category_to_id
+        symbols = emo_type.strip().split(' ')
+        sequence = [table[symbol] for symbol in symbols]
+        sequence.append(table['~'])
+        return sequence
+
+    def decode_emo_category(self, id):
+        return self._id_to_emo_category[id]  # KeyError if id is unknown
+
+    def encode_speaker_category(self, speaker):
+        """Map space-separated speaker names to ids, then append '~'."""
+        table = self._speaker_to_id
+        symbols = speaker.strip().split(' ')
+        sequence = [table[symbol] for symbol in symbols]
+        sequence.append(table['~'])
+        return sequence
+
+    def decode_speaker_category(self, id):
+        return self._id_to_speaker[id]  # KeyError if id is unknown
diff --git a/modelscope/models/audio/tts/text/numbers.py b/modelscope/models/audio/tts/models/datasets/units/numbers.py
old mode 100755
new mode 100644
similarity index 94%
rename from modelscope/models/audio/tts/text/numbers.py
rename to modelscope/models/audio/tts/models/datasets/units/numbers.py
index d9453fee..d8835059
--- a/modelscope/models/audio/tts/text/numbers.py
+++ b/modelscope/models/audio/tts/models/datasets/units/numbers.py
@@ -1,3 +1,6 @@
+# The implementation is adopted from tacotron,
+# made publicly available under the MIT License at https://github.com/keithito/tacotron
+
 import re
 
 import inflect
diff --git a/modelscope/models/audio/tts/models/fsmn.py b/modelscope/models/audio/tts/models/fsmn.py
deleted file mode 100755
index 875c27f0..00000000
--- a/modelscope/models/audio/tts/models/fsmn.py
+++ /dev/null
@@ -1,273 +0,0 @@
-import tensorflow as tf
-
-
-def build_sequence_mask(sequence_length,
-                        maximum_length=None,
-                        dtype=tf.float32):
-    """Builds the dot product mask.
-
-    Args:
-      sequence_length: The sequence length.
-      maximum_length: Optional size of the returned time dimension. Otherwise
-        it is the maximum of :obj:`sequence_length`.
-      dtype: The type of the mask tensor.
-
-    Returns:
-      A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape
-      ``[batch_size, max_length]``.
-    """
-    mask = tf.sequence_mask(
-        sequence_length, maxlen=maximum_length, dtype=dtype)
-
-    return mask
-
-
-def norm(inputs):
-    """Layer normalizes :obj:`inputs`."""
-    return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1)
-
-
-def pad_in_time(x, padding_shape):
-    """Helper function to pad a tensor in the time dimension and retain the static depth dimension.
-
-       Agrs:
-        x: [Batch, Time, Frequency]
-        padding_length: padding size of constant value (0) before the time dimension
-
-      return:
-        padded x
-    """
-
-    depth = x.get_shape().as_list()[-1]
-    x = tf.pad(x, [[0, 0], padding_shape, [0, 0]])
-    x.set_shape((None, None, depth))
-
-    return x
-
-
-def pad_in_time_right(x, padding_length):
-    """Helper function to pad a tensor in the time dimension and retain the static depth dimension.
-
-       Agrs:
-        x: [Batch, Time, Frequency]
-        padding_length: padding size of constant value (0) before the time dimension
-
-      return:
-        padded x
-    """
-    depth = x.get_shape().as_list()[-1]
-    x = tf.pad(x, [[0, 0], [0, padding_length], [0, 0]])
-    x.set_shape((None, None, depth))
-
-    return x
-
-
-def feed_forward(x, ffn_dim, memory_units, mode, dropout=0.0):
-    """Implements the Transformer's "Feed Forward" layer.
-
-    .. math::
-
-        ffn(x) = max(0, x*W_1 + b_1)*W_2
-
-    Args:
-      x: The input.
-      ffn_dim: The number of units of the nonlinear transformation.
-      memory_units: the number of units of linear transformation
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      dropout: The probability to drop units from the inner transformation.
-
-    Returns:
-      The transformed input.
-    """
-    inner = tf.layers.conv1d(x, ffn_dim, 1, activation=tf.nn.relu)
-    inner = tf.layers.dropout(
-        inner, rate=dropout, training=mode == tf.estimator.ModeKeys.TRAIN)
-    outer = tf.layers.conv1d(inner, memory_units, 1, use_bias=False)
-
-    return outer
-
-
-def drop_and_add(inputs, outputs, mode, dropout=0.0):
-    """Drops units in the outputs and adds the previous values.
-
-    Args:
-      inputs: The input of the previous layer.
-      outputs: The output of the previous layer.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      dropout: The probability to drop units in :obj:`outputs`.
-
-    Returns:
-      The residual and normalized output.
-    """
-    outputs = tf.layers.dropout(outputs, rate=dropout, training=mode)
-
-    input_dim = inputs.get_shape().as_list()[-1]
-    output_dim = outputs.get_shape().as_list()[-1]
-
-    if input_dim == output_dim:
-        outputs += inputs
-
-    return outputs
-
-
-def MemoryBlock(
-    inputs,
-    filter_size,
-    mode,
-    mask=None,
-    dropout=0.0,
-):
-    """
-    Define the bidirectional memory block in FSMN
-
-    Agrs:
-      inputs: The output of the previous layer. [Batch, Time, Frequency]
-      filter_size: memory block filter size
-      mode: Training or Evaluation
-      mask: A ``tf.Tensor`` applied to the memory block output
-
-    return:
-      output: 3-D tensor ([Batch, Time, Frequency])
-    """
-    static_shape = inputs.get_shape().as_list()
-    depth = static_shape[-1]
-    inputs = tf.expand_dims(inputs, axis=1)  # [Batch, 1, Time, Frequency]
-    depthwise_filter = tf.get_variable(
-        'depth_conv_w',
-        shape=[1, filter_size, depth, 1],
-        initializer=tf.glorot_uniform_initializer(),
-        dtype=tf.float32)
-    memory = tf.nn.depthwise_conv2d(
-        input=inputs,
-        filter=depthwise_filter,
-        strides=[1, 1, 1, 1],
-        padding='SAME',
-        rate=[1, 1],
-        data_format='NHWC')
-    memory = memory + inputs
-    output = tf.layers.dropout(memory, rate=dropout, training=mode)
-    output = tf.reshape(
-        output,
-        [tf.shape(output)[0], tf.shape(output)[2], depth])
-    if mask is not None:
-        output = output * tf.expand_dims(mask, -1)
-
-    return output
-
-
-def MemoryBlockV2(
-    inputs,
-    filter_size,
-    mode,
-    shift=0,
-    mask=None,
-    dropout=0.0,
-):
-    """
-    Define the bidirectional memory block in FSMN
-
-    Agrs:
-      inputs: The output of the previous layer. [Batch, Time, Frequency]
-      filter_size: memory block filter size
-      mode: Training or Evaluation
-      shift: left padding, to control delay
-      mask: A ``tf.Tensor`` applied to the memory block output
-
-    return:
-      output: 3-D tensor ([Batch, Time, Frequency])
-    """
-    if mask is not None:
-        inputs = inputs * tf.expand_dims(mask, -1)
-
-    static_shape = inputs.get_shape().as_list()
-    depth = static_shape[-1]
-    # padding
-    left_padding = int(round((filter_size - 1) / 2))
-    right_padding = int((filter_size - 1) / 2)
-    if shift > 0:
-        left_padding = left_padding + shift
-        right_padding = right_padding - shift
-    pad_inputs = pad_in_time(inputs, [left_padding, right_padding])
-    pad_inputs = tf.expand_dims(
-        pad_inputs, axis=1)  # [Batch, 1, Time, Frequency]
-    depthwise_filter = tf.get_variable(
-        'depth_conv_w',
-        shape=[1, filter_size, depth, 1],
-        initializer=tf.glorot_uniform_initializer(),
-        dtype=tf.float32)
-    memory = tf.nn.depthwise_conv2d(
-        input=pad_inputs,
-        filter=depthwise_filter,
-        strides=[1, 1, 1, 1],
-        padding='VALID',
-        rate=[1, 1],
-        data_format='NHWC')
-    memory = tf.reshape(
-        memory,
-        [tf.shape(memory)[0], tf.shape(memory)[2], depth])
-    memory = memory + inputs
-    output = tf.layers.dropout(memory, rate=dropout, training=mode)
-    if mask is not None:
-        output = output * tf.expand_dims(mask, -1)
-
-    return output
-
-
-def UniMemoryBlock(
-    inputs,
-    filter_size,
-    mode,
-    cache=None,
-    mask=None,
-    dropout=0.0,
-):
-    """
-    Define the unidirectional memory block in FSMN
-
-    Agrs:
-      inputs: The output of the previous layer. [Batch, Time, Frequency]
-      filter_size: memory block filter size
-      cache: for streaming inference
-      mode: Training or Evaluation
-      mask: A ``tf.Tensor`` applied to the memory block output
-      dropout: dorpout factor
-    return:
-      output: 3-D tensor ([Batch, Time, Frequency])
-    """
-    if cache is not None:
-        static_shape = cache['queries'].get_shape().as_list()
-        depth = static_shape[-1]
-        queries = tf.slice(cache['queries'], [0, 1, 0], [
-            tf.shape(cache['queries'])[0],
-            tf.shape(cache['queries'])[1] - 1, depth
-        ])
-        queries = tf.concat([queries, inputs], axis=1)
-        cache['queries'] = queries
-    else:
-        padding_length = filter_size - 1
-        queries = pad_in_time(inputs, [padding_length, 0])
-
-    queries = tf.expand_dims(queries, axis=1)  # [Batch, 1, Time, Frequency]
-    static_shape = queries.get_shape().as_list()
-    depth = static_shape[-1]
-    depthwise_filter = tf.get_variable(
-        'depth_conv_w',
-        shape=[1, filter_size, depth, 1],
-        initializer=tf.glorot_uniform_initializer(),
-        dtype=tf.float32)
-    memory = tf.nn.depthwise_conv2d(
-        input=queries,
-        filter=depthwise_filter,
-        strides=[1, 1, 1, 1],
-        padding='VALID',
-        rate=[1, 1],
-        data_format='NHWC')
-    memory = tf.reshape(
-        memory,
-        [tf.shape(memory)[0], tf.shape(memory)[2], depth])
-    memory = memory + inputs
-    output = tf.layers.dropout(memory, rate=dropout, training=mode)
-    if mask is not None:
-        output = output * tf.expand_dims(mask, -1)
-
-    return output
diff --git a/modelscope/models/audio/tts/models/fsmn_encoder.py b/modelscope/models/audio/tts/models/fsmn_encoder.py
deleted file mode 100755
index 2c650624..00000000
--- a/modelscope/models/audio/tts/models/fsmn_encoder.py
+++ /dev/null
@@ -1,178 +0,0 @@
-import tensorflow as tf
-
-from . import fsmn
-
-
-class FsmnEncoder():
-    """Encoder using Fsmn
-    """
-
-    def __init__(self,
-                 filter_size,
-                 fsmn_num_layers,
-                 dnn_num_layers,
-                 num_memory_units=512,
-                 ffn_inner_dim=2048,
-                 dropout=0.0,
-                 position_encoder=None):
-        """Initializes the parameters of the encoder.
-
-        Args:
-          filter_size: the total order of memory block
-          fsmn_num_layers: The number of fsmn layers.
-          dnn_num_layers: The number of dnn layers
-          num_units: The number of memory units.
-          ffn_inner_dim: The number of units of the inner linear transformation
-            in the feed forward layer.
-          dropout: The probability to drop units from the outputs.
-          position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
-            apply on inputs or ``None``.
-        """
-        super(FsmnEncoder, self).__init__()
-        self.filter_size = filter_size
-        self.fsmn_num_layers = fsmn_num_layers
-        self.dnn_num_layers = dnn_num_layers
-        self.num_memory_units = num_memory_units
-        self.ffn_inner_dim = ffn_inner_dim
-        self.dropout = dropout
-        self.position_encoder = position_encoder
-
-    def encode(self, inputs, sequence_length=None, mode=True):
-        if self.position_encoder is not None:
-            inputs = self.position_encoder(inputs)
-
-        inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
-
-        mask = fsmn.build_sequence_mask(
-            sequence_length, maximum_length=tf.shape(inputs)[1])
-
-        state = ()
-
-        for layer in range(self.fsmn_num_layers):
-            with tf.variable_scope('fsmn_layer_{}'.format(layer)):
-                with tf.variable_scope('ffn'):
-                    context = fsmn.feed_forward(
-                        inputs,
-                        self.ffn_inner_dim,
-                        self.num_memory_units,
-                        mode,
-                        dropout=self.dropout)
-
-                with tf.variable_scope('memory'):
-                    memory = fsmn.MemoryBlock(
-                        context,
-                        self.filter_size,
-                        mode,
-                        mask=mask,
-                        dropout=self.dropout)
-
-                    memory = fsmn.drop_and_add(
-                        inputs, memory, mode, dropout=self.dropout)
-
-                inputs = memory
-                state += (tf.reduce_mean(inputs, axis=1), )
-
-        for layer in range(self.dnn_num_layers):
-            with tf.variable_scope('dnn_layer_{}'.format(layer)):
-                transformed = fsmn.feed_forward(
-                    inputs,
-                    self.ffn_inner_dim,
-                    self.num_memory_units,
-                    mode,
-                    dropout=self.dropout)
-
-                inputs = transformed
-                state += (tf.reduce_mean(inputs, axis=1), )
-
-        outputs = inputs
-        return (outputs, state, sequence_length)
-
-
-class FsmnEncoderV2():
-    """Encoder using Fsmn
-    """
-
-    def __init__(self,
-                 filter_size,
-                 fsmn_num_layers,
-                 dnn_num_layers,
-                 num_memory_units=512,
-                 ffn_inner_dim=2048,
-                 dropout=0.0,
-                 shift=0,
-                 position_encoder=None):
-        """Initializes the parameters of the encoder.
-
-        Args:
-          filter_size: the total order of memory block
-          fsmn_num_layers: The number of fsmn layers.
-          dnn_num_layers: The number of dnn layers
-          num_units: The number of memory units.
-          ffn_inner_dim: The number of units of the inner linear transformation
-            in the feed forward layer.
-          dropout: The probability to drop units from the outputs.
-          shift: left padding, to control delay
-          position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
-            apply on inputs or ``None``.
-        """
-        super(FsmnEncoderV2, self).__init__()
-        self.filter_size = filter_size
-        self.fsmn_num_layers = fsmn_num_layers
-        self.dnn_num_layers = dnn_num_layers
-        self.num_memory_units = num_memory_units
-        self.ffn_inner_dim = ffn_inner_dim
-        self.dropout = dropout
-        self.shift = shift
-        if not isinstance(shift, list):
-            self.shift = [shift for _ in range(self.fsmn_num_layers)]
-        self.position_encoder = position_encoder
-
-    def encode(self, inputs, sequence_length=None, mode=True):
-        if self.position_encoder is not None:
-            inputs = self.position_encoder(inputs)
-
-        inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
-
-        mask = fsmn.build_sequence_mask(
-            sequence_length, maximum_length=tf.shape(inputs)[1])
-
-        state = ()
-        for layer in range(self.fsmn_num_layers):
-            with tf.variable_scope('fsmn_layer_{}'.format(layer)):
-                with tf.variable_scope('ffn'):
-                    context = fsmn.feed_forward(
-                        inputs,
-                        self.ffn_inner_dim,
-                        self.num_memory_units,
-                        mode,
-                        dropout=self.dropout)
-
-                with tf.variable_scope('memory'):
-                    memory = fsmn.MemoryBlockV2(
-                        context,
-                        self.filter_size,
-                        mode,
-                        shift=self.shift[layer],
-                        mask=mask,
-                        dropout=self.dropout)
-
-                    memory = fsmn.drop_and_add(
-                        inputs, memory, mode, dropout=self.dropout)
-
-                inputs = memory
-                state += (tf.reduce_mean(inputs, axis=1), )
-
-        for layer in range(self.dnn_num_layers):
-            with tf.variable_scope('dnn_layer_{}'.format(layer)):
-                transformed = fsmn.feed_forward(
-                    inputs,
-                    self.ffn_inner_dim,
-                    self.num_memory_units,
-                    mode,
-                    dropout=self.dropout)
-
-                inputs = transformed
-                state += (tf.reduce_mean(inputs, axis=1), )
-
-        outputs = inputs
-        return (outputs, state, sequence_length)
diff --git a/modelscope/models/audio/tts/models/helpers.py b/modelscope/models/audio/tts/models/helpers.py
deleted file mode 100755
index 371000a4..00000000
--- a/modelscope/models/audio/tts/models/helpers.py
+++ /dev/null
@@ -1,159 +0,0 @@
-import numpy as np
-import tensorflow as tf
-
-
-class VarTestHelper(tf.contrib.seq2seq.Helper):
-
-    def __init__(self, batch_size, inputs, dim):
-        with tf.name_scope('VarTestHelper'):
-            self._batch_size = batch_size
-            self._inputs = inputs
-            self._dim = dim
-
-            num_steps = tf.shape(self._inputs)[1]
-            self._lengths = tf.tile([num_steps], [self._batch_size])
-
-            self._inputs = tf.roll(inputs, shift=-1, axis=1)
-            self._init_inputs = inputs[:, 0, :]
-
-    @property
-    def batch_size(self):
-        return self._batch_size
-
-    @property
-    def sample_ids_shape(self):
-        return tf.TensorShape([])
-
-    @property
-    def sample_ids_dtype(self):
-        return np.int32
-
-    def initialize(self, name=None):
-        return (tf.tile([False], [self._batch_size]),
-                _go_frames(self._batch_size, self._dim, self._init_inputs))
-
-    def sample(self, time, outputs, state, name=None):
-        return tf.tile([0], [self._batch_size])  # Return all 0; we ignore them
-
-    def next_inputs(self, time, outputs, state, sample_ids, name=None):
-        with tf.name_scope('VarTestHelper'):
-            finished = (time + 1 >= self._lengths)
-            next_inputs = tf.concat([outputs, self._inputs[:, time, :]],
-                                    axis=-1)
-            return (finished, next_inputs, state)
-
-
-class VarTrainingHelper(tf.contrib.seq2seq.Helper):
-
-    def __init__(self, targets, inputs, dim):
-        with tf.name_scope('VarTrainingHelper'):
-            self._targets = targets  # [N, T_in, 1]
-            self._batch_size = tf.shape(inputs)[0]  # N
-            self._inputs = inputs
-            self._dim = dim
-
-            num_steps = tf.shape(self._targets)[1]
-            self._lengths = tf.tile([num_steps], [self._batch_size])
-
-            self._inputs = tf.roll(inputs, shift=-1, axis=1)
-            self._init_inputs = inputs[:, 0, :]
-
-    @property
-    def batch_size(self):
-        return self._batch_size
-
-    @property
-    def sample_ids_shape(self):
-        return tf.TensorShape([])
-
-    @property
-    def sample_ids_dtype(self):
-        return np.int32
-
-    def initialize(self, name=None):
-        return (tf.tile([False], [self._batch_size]),
-                _go_frames(self._batch_size, self._dim, self._init_inputs))
-
-    def sample(self, time, outputs, state, name=None):
-        return tf.tile([0], [self._batch_size])  # Return all 0; we ignore them
-
-    def next_inputs(self, time, outputs, state, sample_ids, name=None):
-        with tf.name_scope(name or 'VarTrainingHelper'):
-            finished = (time + 1 >= self._lengths)
-            next_inputs = tf.concat(
-                [self._targets[:, time, :], self._inputs[:, time, :]], axis=-1)
-            return (finished, next_inputs, state)
-
-
-class VarTrainingSSHelper(tf.contrib.seq2seq.Helper):
-
-    def __init__(self, targets, inputs, dim, global_step, schedule_begin,
-                 alpha, decay_steps):
-        with tf.name_scope('VarTrainingSSHelper'):
-            self._targets = targets  # [N, T_in, 1]
-            self._batch_size = tf.shape(inputs)[0]  # N
-            self._inputs = inputs
-            self._dim = dim
-
-            num_steps = tf.shape(self._targets)[1]
-            self._lengths = tf.tile([num_steps], [self._batch_size])
-
-            self._inputs = tf.roll(inputs, shift=-1, axis=1)
-            self._init_inputs = inputs[:, 0, :]
-
-            # for schedule sampling
-            self._global_step = global_step
-            self._schedule_begin = schedule_begin
-            self._alpha = alpha
-            self._decay_steps = decay_steps
-
-    @property
-    def batch_size(self):
-        return self._batch_size
-
-    @property
-    def sample_ids_shape(self):
-        return tf.TensorShape([])
-
-    @property
-    def sample_ids_dtype(self):
-        return np.int32
-
-    def initialize(self, name=None):
-        self._ratio = _tf_decay(self._global_step, self._schedule_begin,
-                                self._alpha, self._decay_steps)
-        return (tf.tile([False], [self._batch_size]),
-                _go_frames(self._batch_size, self._dim, self._init_inputs))
-
-    def sample(self, time, outputs, state, name=None):
-        return tf.tile([0], [self._batch_size])  # Return all 0; we ignore them
-
-    def next_inputs(self, time, outputs, state, sample_ids, name=None):
-        with tf.name_scope(name or 'VarTrainingHelper'):
-            finished = (time + 1 >= self._lengths)
-            next_inputs_tmp = tf.cond(
-                tf.less(
-                    tf.random_uniform([], minval=0, maxval=1,
-                                      dtype=tf.float32), self._ratio),
-                lambda: self._targets[:, time, :], lambda: outputs)
-            next_inputs = tf.concat(
-                [next_inputs_tmp, self._inputs[:, time, :]], axis=-1)
-            return (finished, next_inputs, state)
-
-
-def _go_frames(batch_size, dim, init_inputs):
-    '''Returns all-zero <GO> frames for a given batch size and output dimension'''
-    return tf.concat([tf.tile([[0.0]], [batch_size, dim]), init_inputs],
-                     axis=-1)
-
-
-def _tf_decay(global_step, schedule_begin, alpha, decay_steps):
-    tfr = tf.train.exponential_decay(
-        1.0,
-        global_step=global_step - schedule_begin,
-        decay_steps=decay_steps,
-        decay_rate=alpha,
-        name='tfr_decay')
-    final_tfr = tf.cond(
-        tf.less(global_step, schedule_begin), lambda: 1.0, lambda: tfr)
-    return final_tfr
diff --git a/modelscope/models/audio/tts/models/models/__init__.py b/modelscope/models/audio/tts/models/models/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/audio/tts/models/models/hifigan/__init__.py b/modelscope/models/audio/tts/models/models/hifigan/__init__.py
new file mode 100644
index 00000000..ae9d10ea
--- /dev/null
+++ b/modelscope/models/audio/tts/models/models/hifigan/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from .hifigan import *  # noqa F403
diff --git a/modelscope/models/audio/tts/models/models/hifigan/hifigan.py b/modelscope/models/audio/tts/models/models/hifigan/hifigan.py
new file mode 100755
index 00000000..0f950539
--- /dev/null
+++ b/modelscope/models/audio/tts/models/models/hifigan/hifigan.py
@@ -0,0 +1,238 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Part of the implementation is borrowed from https://github.com/jik876/hifi-gan
+
+from distutils.version import LooseVersion
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
+from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
+
+from modelscope.models.audio.tts.models.utils import get_padding, init_weights
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7')
+
+
+def stft(x, fft_size, hop_size, win_length, window):
+    """Perform STFT and convert to magnitude spectrogram.
+
+    Args:
+        x (Tensor): Input signal tensor (B, T).
+        fft_size (int): FFT size.
+        hop_size (int): Hop size.
+        win_length (int): Window length.
+        window (Tensor): Window tensor forwarded to ``torch.stft``.
+
+    Returns:
+        Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
+
+    """
+    if is_pytorch_17plus:
+        x_stft = torch.stft(
+            x, fft_size, hop_size, win_length, window, return_complex=False)
+    else:
+        x_stft = torch.stft(x, fft_size, hop_size, win_length, window)
+    real = x_stft[..., 0]
+    imag = x_stft[..., 1]
+
+    # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
+    return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1)
+
+
+LRELU_SLOPE = 0.1
+
+
+def get_padding_casual(kernel_size, dilation=1):
+    return int(kernel_size * dilation - dilation)
+
+
+class Conv1dCasual(torch.nn.Module):
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 padding_mode='zeros'):
+        super(Conv1dCasual, self).__init__()
+        self.pad = padding  # applied on the left of the last dim in forward()
+        self.conv1d = weight_norm(
+            Conv1d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                padding=0,  # the wrapped conv itself never pads
+                dilation=dilation,
+                groups=groups,
+                bias=bias,
+                padding_mode=padding_mode))
+        self.conv1d.apply(init_weights)
+
+    def forward(self, x):  # x: (batch, dim, time)
+        # F.pad pairs start at the last dim: left-pad it by self.pad zeros.
+        x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant')
+        x = self.conv1d(x)
+        return x
+
+    def remove_weight_norm(self):
+        remove_weight_norm(self.conv1d)
+
+
+class ConvTranspose1dCausal(torch.nn.Module):
+    """CausalConvTranspose1d module with customized initialization."""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding=0):
+        """Initialize the module; ``padding`` is accepted but not used."""
+        super(ConvTranspose1dCausal, self).__init__()
+        self.deconv = weight_norm(
+            ConvTranspose1d(in_channels, out_channels, kernel_size, stride))
+        self.stride = stride
+        self.deconv.apply(init_weights)
+        self.pad = kernel_size - stride
+
+    def forward(self, x):
+        """Calculate forward propagation.
+        Args:
+            x (Tensor): Input tensor (B, in_channels, T_in).
+        Returns:
+            Tensor: Output tensor (B, out_channels, T_in * stride).
+        """
+        # Keep T_in * stride frames; `[:-self.pad]` is empty when pad == 0.
+        return self.deconv(x)[:, :, :x.size(-1) * self.stride]
+
+    def remove_weight_norm(self):
+        remove_weight_norm(self.deconv)
+
+
+class ResBlock1(torch.nn.Module):
+
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super(ResBlock1, self).__init__()
+        self.h = h
+        self.convs1 = nn.ModuleList([
+            Conv1dCasual(
+                channels,
+                channels,
+                kernel_size,
+                1,
+                dilation=dilation[i],
+                padding=get_padding_casual(kernel_size, dilation[i]))
+            for i in range(len(dilation))
+        ])
+
+        self.convs2 = nn.ModuleList([
+            Conv1dCasual(
+                channels,
+                channels,
+                kernel_size,
+                1,
+                dilation=1,
+                padding=get_padding_casual(kernel_size, 1))
+            for i in range(len(dilation))
+        ])
+
+    def forward(self, x):
+        for c1, c2 in zip(self.convs1, self.convs2):
+            xt = F.leaky_relu(x, LRELU_SLOPE)
+            xt = c1(xt)
+            xt = F.leaky_relu(xt, LRELU_SLOPE)
+            xt = c2(xt)
+            x = xt + x
+        return x
+
+    def remove_weight_norm(self):
+        for layer in self.convs1:
+            layer.remove_weight_norm()
+        for layer in self.convs2:
+            layer.remove_weight_norm()
+
+
+class Generator(torch.nn.Module):
+    """Causal generator: 80-channel input -> 1-channel tanh output."""
+
+    def __init__(self, h):
+        super(Generator, self).__init__()
+        self.h = h
+        self.num_kernels = len(h.resblock_kernel_sizes)
+        self.num_upsamples = len(h.upsample_rates)
+        logger.info('num_kernels={}, num_upsamples={}'.format(
+            self.num_kernels, self.num_upsamples))
+        self.conv_pre = Conv1dCasual(
+            80, h.upsample_initial_channel, 7, 1, padding=7 - 1)
+        # ResBlock2 is not defined in this module, so fail with a clear error.
+        if str(h.resblock) != '1':
+            raise ValueError('Only resblock type "1" is implemented.')
+        resblock = ResBlock1
+
+        self.ups = nn.ModuleList()
+        self.repeat_ups = nn.ModuleList()
+        for i, (u, k) in enumerate(
+                zip(h.upsample_rates, h.upsample_kernel_sizes)):
+            upsample = nn.Sequential(
+                nn.Upsample(mode='nearest', scale_factor=u),
+                nn.LeakyReLU(LRELU_SLOPE),
+                Conv1dCasual(
+                    h.upsample_initial_channel // (2**i),
+                    h.upsample_initial_channel // (2**(i + 1)),
+                    kernel_size=7,
+                    stride=1,
+                    padding=7 - 1))
+            self.repeat_ups.append(upsample)
+            self.ups.append(
+                ConvTranspose1dCausal(
+                    h.upsample_initial_channel // (2**i),
+                    h.upsample_initial_channel // (2**(i + 1)),
+                    k,
+                    u,
+                    padding=(k - u) // 2))
+
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = h.upsample_initial_channel // (2**(i + 1))
+            for j, (k, d) in enumerate(
+                    zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
+                self.resblocks.append(resblock(h, ch, k, d))
+
+        self.conv_post = Conv1dCasual(ch, 1, 7, 1, padding=7 - 1)
+
+    def forward(self, x):
+        """Upsample x (B, 80, T) stage by stage and return the tanh output."""
+        x = self.conv_pre(x)
+        for i in range(self.num_upsamples):
+            x = torch.sin(x) + x
+            # transposed-convolution branch
+            x1 = self.ups[i](F.leaky_relu(x, LRELU_SLOPE))
+            # nearest-neighbour repeat branch
+            x2 = self.repeat_ups[i](x)
+            x = x1 + x2
+            first = i * self.num_kernels
+            xs = self.resblocks[first](x)
+            for j in range(1, self.num_kernels):
+                xs = xs + self.resblocks[first + j](x)
+            x = xs / self.num_kernels
+        x = F.leaky_relu(x)
+        return torch.tanh(self.conv_post(x))
+
+    def remove_weight_norm(self):
+        logger.info('Removing weight norm...')
+        for layer in self.ups:
+            layer.remove_weight_norm()
+        for layer in self.repeat_ups:
+            layer[-1].remove_weight_norm()
+        for layer in self.resblocks:
+            layer.remove_weight_norm()
+        self.conv_pre.remove_weight_norm()
+        self.conv_post.remove_weight_norm()
diff --git a/modelscope/models/audio/tts/models/models/sambert/__init__.py b/modelscope/models/audio/tts/models/models/sambert/__init__.py
new file mode 100644
index 00000000..f0bf5290
--- /dev/null
+++ b/modelscope/models/audio/tts/models/models/sambert/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from .kantts_sambert import *  # noqa F403
diff --git a/modelscope/models/audio/tts/models/models/sambert/adaptors.py b/modelscope/models/audio/tts/models/models/sambert/adaptors.py
new file mode 100644
index 00000000..c171a1db
--- /dev/null
+++ b/modelscope/models/audio/tts/models/models/sambert/adaptors.py
@@ -0,0 +1,131 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .base import Prenet
+from .fsmn import FsmnEncoderV2
+
+
+class LengthRegulator(nn.Module):
+
+    def __init__(self, r=1):
+        super(LengthRegulator, self).__init__()
+        # forward() zero-pads the expanded length up to a multiple of r.
+        self.r = r
+
+    def forward(self, inputs, durations, masks=None):
+        reps = (durations + 0.5).long()  # add 0.5, then truncate to integer
+        output_lens = reps.sum(dim=1)
+        max_len = output_lens.max()
+        reps_cumsum = torch.cumsum(
+            F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[:, None, :]
+        range_ = torch.arange(max_len).to(inputs.device)[None, :, None]
+        mult = ((reps_cumsum[:, :, :-1] <= range_)
+                & (reps_cumsum[:, :, 1:] > range_))  # yapf:disable
+        mult = mult.float()
+        out = torch.matmul(mult, inputs)
+        # out[b, t] now holds the input frame whose repeat span contains t.
+        if masks is not None:
+            out = out.masked_fill(masks.unsqueeze(-1), 0.0)
+        # Right-pad the time axis with zeros up to a multiple of self.r.
+        seq_len = out.size(1)
+        padding = self.r - int(seq_len) % self.r
+        if (padding < self.r):
+            out = F.pad(
+                out.transpose(1, 2), (0, padding, 0, 0, 0, 0), value=0.0)
+            out = out.transpose(1, 2)
+
+        return out, output_lens
+
+
+class VarRnnARPredictor(nn.Module):
+
+    def __init__(self, cond_units, prenet_units, rnn_units):
+        super(VarRnnARPredictor, self).__init__()
+
+        self.prenet = Prenet(1, prenet_units)
+        self.lstm = nn.LSTM(
+            prenet_units[-1] + cond_units,
+            rnn_units,
+            num_layers=2,
+            batch_first=True,
+            bidirectional=False)
+        self.fc = nn.Linear(rnn_units, 1)
+
+    def forward(self, inputs, cond, h=None, masks=None):
+        x = torch.cat([self.prenet(inputs), cond], dim=-1)
+        # The input can also be a packed variable length sequence,
+        # here we just omit it for simplicity due to the mask and uni-directional lstm.
+        x, h_new = self.lstm(x, h)
+
+        x = self.fc(x).squeeze(-1)
+        x = F.relu(x)
+
+        if masks is not None:
+            x = x.masked_fill(masks, 0.0)
+
+        return x, h_new
+
+    def infer(self, cond, masks=None):
+        batch_size, length = cond.size(0), cond.size(1)
+
+        output = []
+        x = torch.zeros((batch_size, 1)).to(cond.device)
+        h = None
+
+        for i in range(length):
+            x, h = self.forward(x.unsqueeze(1), cond[:, i:i + 1, :], h=h)
+            output.append(x)
+
+        output = torch.cat(output, dim=-1)
+
+        if masks is not None:
+            output = output.masked_fill(masks, 0.0)
+
+        return output
+
+
+class VarFsmnRnnNARPredictor(nn.Module):
+
+    def __init__(self, in_dim, filter_size, fsmn_num_layers, num_memory_units,
+                 ffn_inner_dim, dropout, shift, lstm_units):
+        super(VarFsmnRnnNARPredictor, self).__init__()
+
+        self.fsmn = FsmnEncoderV2(filter_size, fsmn_num_layers, in_dim,
+                                  num_memory_units, ffn_inner_dim, dropout,
+                                  shift)
+        self.blstm = nn.LSTM(
+            num_memory_units,
+            lstm_units,
+            num_layers=1,
+            batch_first=True,
+            bidirectional=True)
+        self.fc = nn.Linear(2 * lstm_units, 1)
+
+    def forward(self, inputs, masks=None):
+        input_lengths = None
+        if masks is not None:
+            input_lengths = torch.sum((~masks).float(), dim=1).long()
+
+        x = self.fsmn(inputs, masks)
+
+        if input_lengths is not None:
+            x = nn.utils.rnn.pack_padded_sequence(
+                x,
+                input_lengths.tolist(),
+                batch_first=True,
+                enforce_sorted=False)
+            x, _ = self.blstm(x)
+            x, _ = nn.utils.rnn.pad_packed_sequence(
+                x, batch_first=True, total_length=inputs.size(1))
+        else:
+            x, _ = self.blstm(x)
+
+        x = self.fc(x).squeeze(-1)
+
+        if masks is not None:
+            x = x.masked_fill(masks, 0.0)
+
+        return x
diff --git a/modelscope/models/audio/tts/models/models/sambert/base.py b/modelscope/models/audio/tts/models/models/sambert/base.py
new file mode 100644
index 00000000..873aecbf
--- /dev/null
+++ b/modelscope/models/audio/tts/models/models/sambert/base.py
@@ -0,0 +1,369 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class ScaledDotProductAttention(nn.Module):
+    """ Scaled Dot-Product Attention """
+
+    def __init__(self, temperature, dropatt=0.0):
+        super().__init__()
+        self.temperature = temperature
+        self.softmax = nn.Softmax(dim=2)
+        self.dropatt = nn.Dropout(dropatt)
+
+    def forward(self, q, k, v, mask=None):
+
+        attn = torch.bmm(q, k.transpose(1, 2))
+        attn = attn / self.temperature
+
+        if mask is not None:
+            attn = attn.masked_fill(mask, -np.inf)
+
+        attn = self.softmax(attn)
+        attn = self.dropatt(attn)
+        output = torch.bmm(attn, v)
+
+        return output, attn
+
+
+class Prenet(nn.Module):
+    """Stack of Linear -> ReLU -> Dropout(0.5), plus optional projection."""
+
+    def __init__(self, in_units, prenet_units, out_units=0):
+        super(Prenet, self).__init__()
+        # list() lets callers pass prenet_units as a tuple as well as a list.
+        layer_dims = [in_units] + list(prenet_units)
+        self.fcs = nn.ModuleList()
+        for in_dim, out_dim in zip(layer_dims[:-1], layer_dims[1:]):
+            self.fcs.append(nn.Linear(in_dim, out_dim))
+            self.fcs.append(nn.ReLU())
+            self.fcs.append(nn.Dropout(0.5))
+        # A final projection is only appended when out_units is non-zero.
+        if out_units:
+            self.fcs.append(nn.Linear(prenet_units[-1], out_units))
+
+    def forward(self, input):
+        for layer in self.fcs:
+            input = layer(input)
+        return input
+
+
+class MultiHeadSelfAttention(nn.Module):
+    """ Multi-Head SelfAttention module """
+
+    def __init__(self, n_head, d_in, d_model, d_head, dropout, dropatt=0.0):
+        super().__init__()
+
+        self.n_head = n_head
+        self.d_head = d_head
+        self.d_in = d_in
+        self.d_model = d_model
+
+        self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
+        self.w_qkv = nn.Linear(d_in, 3 * n_head * d_head)
+
+        self.attention = ScaledDotProductAttention(
+            temperature=np.power(d_head, 0.5), dropatt=dropatt)
+
+        self.fc = nn.Linear(n_head * d_head, d_model)
+
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, input, mask=None):
+        d_head, n_head = self.d_head, self.n_head
+
+        sz_b, len_in, _ = input.size()
+
+        residual = input
+
+        x = self.layer_norm(input)
+        qkv = self.w_qkv(x)
+        q, k, v = qkv.chunk(3, -1)
+
+        q = q.view(sz_b, len_in, n_head, d_head)
+        k = k.view(sz_b, len_in, n_head, d_head)
+        v = v.view(sz_b, len_in, n_head, d_head)
+
+        q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_in,
+                                                    d_head)  # (n*b) x l x d
+        k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_in,
+                                                    d_head)  # (n*b) x l x d
+        v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_in,
+                                                    d_head)  # (n*b) x l x d
+
+        if mask is not None:
+            mask = mask.repeat(n_head, 1, 1)  # (n*b) x .. x ..
+        output, attn = self.attention(q, k, v, mask=mask)
+
+        output = output.view(n_head, sz_b, len_in, d_head)
+        output = (output.permute(1, 2, 0,
+                                 3).contiguous().view(sz_b, len_in,
+                                                      -1))  # b x l x (n*d)
+
+        output = self.dropout(self.fc(output))
+        if (output.size(-1) == residual.size(-1)):
+            output = output + residual
+
+        return output, attn
+
+
+class PositionwiseConvFeedForward(nn.Module):
+    """ A two-feed-forward-layer module """
+
+    def __init__(self,
+                 d_in,
+                 d_hid,
+                 kernel_size=(3, 1),
+                 dropout_inner=0.1,
+                 dropout=0.1):
+        super().__init__()
+        # Use Conv1D
+        # position-wise
+        self.w_1 = nn.Conv1d(
+            d_in,
+            d_hid,
+            kernel_size=kernel_size[0],
+            padding=(kernel_size[0] - 1) // 2,
+        )
+        # position-wise
+        self.w_2 = nn.Conv1d(
+            d_hid,
+            d_in,
+            kernel_size=kernel_size[1],
+            padding=(kernel_size[1] - 1) // 2,
+        )
+
+        self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
+        self.dropout_inner = nn.Dropout(dropout_inner)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x, mask=None):
+        residual = x
+        x = self.layer_norm(x)
+
+        output = x.transpose(1, 2)
+        output = F.relu(self.w_1(output))
+        if mask is not None:
+            output = output.masked_fill(mask.unsqueeze(1), 0)
+        output = self.dropout_inner(output)
+        output = self.w_2(output)
+        output = output.transpose(1, 2)
+        output = self.dropout(output)
+
+        output = output + residual
+
+        return output
+
+
+class FFTBlock(nn.Module):
+    """FFT Block"""
+
+    def __init__(self,
+                 d_in,
+                 d_model,
+                 n_head,
+                 d_head,
+                 d_inner,
+                 kernel_size,
+                 dropout,
+                 dropout_attn=0.0,
+                 dropout_relu=0.0):
+        super(FFTBlock, self).__init__()
+        self.slf_attn = MultiHeadSelfAttention(
+            n_head,
+            d_in,
+            d_model,
+            d_head,
+            dropout=dropout,
+            dropatt=dropout_attn)
+        self.pos_ffn = PositionwiseConvFeedForward(
+            d_model,
+            d_inner,
+            kernel_size,
+            dropout_inner=dropout_relu,
+            dropout=dropout)
+
+    def forward(self, input, mask=None, slf_attn_mask=None):
+        output, slf_attn = self.slf_attn(input, mask=slf_attn_mask)
+        if mask is not None:
+            output = output.masked_fill(mask.unsqueeze(-1), 0)
+
+        output = self.pos_ffn(output, mask=mask)
+        if mask is not None:
+            output = output.masked_fill(mask.unsqueeze(-1), 0)
+
+        return output, slf_attn
+
+
+class MultiHeadPNCAAttention(nn.Module):
+    """ Multi-Head Attention PNCA module with cached key/value state """
+
+    def __init__(self, n_head, d_model, d_mem, d_head, dropout, dropatt=0.0):
+        super().__init__()
+
+        self.n_head = n_head
+        self.d_head = d_head
+        self.d_model = d_model
+        self.d_mem = d_mem
+
+        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
+
+        self.w_x_qkv = nn.Linear(d_model, 3 * n_head * d_head)
+        self.fc_x = nn.Linear(n_head * d_head, d_model)
+
+        self.w_h_kv = nn.Linear(d_mem, 2 * n_head * d_head)
+        self.fc_h = nn.Linear(n_head * d_head, d_model)
+
+        self.attention = ScaledDotProductAttention(
+            temperature=np.power(d_head, 0.5), dropatt=dropatt)
+
+        self.dropout = nn.Dropout(dropout)
+        # Create the caches here so forward() works before reset_state().
+        self.reset_state()
+
+    def update_x_state(self, x):
+        """Project x to q/k/v and append k, v to the cached x_k / x_v."""
+        d_head, n_head = self.d_head, self.n_head
+
+        sz_b, len_x, _ = x.size()
+
+        x_qkv = self.w_x_qkv(x)
+        x_q, x_k, x_v = x_qkv.chunk(3, -1)
+
+        x_q = x_q.view(sz_b, len_x, n_head, d_head)
+        x_k = x_k.view(sz_b, len_x, n_head, d_head)
+        x_v = x_v.view(sz_b, len_x, n_head, d_head)
+
+        x_q = x_q.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head)
+        x_k = x_k.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head)
+        x_v = x_v.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head)
+
+        if (self.x_state_size):
+            self.x_k = torch.cat([self.x_k, x_k], dim=1)
+            self.x_v = torch.cat([self.x_v, x_v], dim=1)
+        else:
+            self.x_k = x_k
+            self.x_v = x_v
+
+        self.x_state_size += len_x
+
+        return x_q, x_k, x_v
+
+    def update_h_state(self, h):
+        """Cache projected k/v of h; skipped when the cached length matches."""
+        if (self.h_state_size == h.size(1)):
+            return None, None
+
+        d_head, n_head = self.d_head, self.n_head
+
+        sz_b, len_h, _ = h.size()
+
+        h_kv = self.w_h_kv(h)
+        h_k, h_v = h_kv.chunk(2, -1)
+
+        h_k = h_k.view(sz_b, len_h, n_head, d_head)
+        h_v = h_v.view(sz_b, len_h, n_head, d_head)
+
+        self.h_k = h_k.permute(2, 0, 1, 3).contiguous().view(-1, len_h, d_head)
+        self.h_v = h_v.permute(2, 0, 1, 3).contiguous().view(-1, len_h, d_head)
+
+        self.h_state_size = len_h  # the cache is replaced, not extended
+
+        return h_k, h_v
+
+    def reset_state(self):
+        """Drop the cached keys/values of both x and h."""
+        self.h_k = None
+        self.h_v = None
+        self.h_state_size = 0
+        self.x_k = None
+        self.x_v = None
+        self.x_state_size = 0
+
+    def forward(self, x, h, mask_x=None, mask_h=None):
+        """Return (output, attn_x, attn_h) for input x and memory h."""
+        residual = x
+        self.update_h_state(h)
+        x_q, x_k, x_v = self.update_x_state(self.layer_norm(x))
+
+        d_head, n_head = self.d_head, self.n_head
+        sz_b, len_in, _ = x.size()
+
+        # X: attend over the cached keys/values of x
+        if mask_x is not None:
+            mask_x = mask_x.repeat(n_head, 1, 1)  # (n*b) x .. x ..
+        output_x, attn_x = self.attention(x_q, self.x_k, self.x_v, mask=mask_x)
+
+        output_x = output_x.view(n_head, sz_b, len_in, d_head)
+        output_x = output_x.permute(1, 2, 0, 3).contiguous()
+        output_x = self.fc_x(output_x.view(sz_b, len_in, -1))  # b x l x d_model
+
+        # H: attend over the cached keys/values of h
+        if mask_h is not None:
+            mask_h = mask_h.repeat(n_head, 1, 1)
+        output_h, attn_h = self.attention(x_q, self.h_k, self.h_v, mask=mask_h)
+
+        output_h = output_h.view(n_head, sz_b, len_in, d_head)
+        output_h = output_h.permute(1, 2, 0, 3).contiguous()
+        output_h = self.fc_h(output_h.view(sz_b, len_in, -1))  # b x l x d_model
+
+        output = output_x + output_h
+
+        output = self.dropout(output)
+
+        output = output + residual
+
+        return output, attn_x, attn_h
+
+
+class PNCABlock(nn.Module):
+    """PNCA Block"""
+
+    def __init__(self,
+                 d_model,
+                 d_mem,
+                 n_head,
+                 d_head,
+                 d_inner,
+                 kernel_size,
+                 dropout,
+                 dropout_attn=0.0,
+                 dropout_relu=0.0):
+        super(PNCABlock, self).__init__()
+        self.pnca_attn = MultiHeadPNCAAttention(
+            n_head,
+            d_model,
+            d_mem,
+            d_head,
+            dropout=dropout,
+            dropatt=dropout_attn)
+        self.pos_ffn = PositionwiseConvFeedForward(
+            d_model,
+            d_inner,
+            kernel_size,
+            dropout_inner=dropout_relu,
+            dropout=dropout)
+
+    def forward(self,
+                input,
+                memory,
+                mask=None,
+                pnca_x_attn_mask=None,
+                pnca_h_attn_mask=None):
+        output, pnca_attn_x, pnca_attn_h = self.pnca_attn(
+            input, memory, pnca_x_attn_mask, pnca_h_attn_mask)
+        if mask is not None:
+            output = output.masked_fill(mask.unsqueeze(-1), 0)
+
+        output = self.pos_ffn(output, mask=mask)
+        if mask is not None:
+            output = output.masked_fill(mask.unsqueeze(-1), 0)
+
+        return output, pnca_attn_x, pnca_attn_h
+
+    def reset_state(self):
+        self.pnca_attn.reset_state()
diff --git a/modelscope/models/audio/tts/models/models/sambert/fsmn.py b/modelscope/models/audio/tts/models/models/sambert/fsmn.py
new file mode 100644
index 00000000..c070ef35
--- /dev/null
+++ b/modelscope/models/audio/tts/models/models/sambert/fsmn.py
@@ -0,0 +1,126 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+"""
+FSMN Pytorch Version
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class FeedForwardNet(nn.Module):
+    """Two stacked ``nn.Conv1d`` layers with ReLU and dropout in between.
+
+    Args:
+        d_in, d_hid, d_out: input, hidden and output channel counts.
+        kernel_size: pair of kernel widths for the first and second conv;
+            each conv is padded with ``(k - 1) // 2`` on both sides.
+        dropout: dropout rate applied after the ReLU.
+    """
+
+    # The default is a tuple, not a list, so it is not a shared mutable.
+    def __init__(self, d_in, d_hid, d_out, kernel_size=(1, 1), dropout=0.1):
+        super().__init__()
+
+        k_first, k_second = kernel_size[0], kernel_size[1]
+        self.w_1 = nn.Conv1d(
+            d_in, d_hid, kernel_size=k_first, padding=(k_first - 1) // 2)
+        # The second convolution has no bias term.
+        self.w_2 = nn.Conv1d(
+            d_hid, d_out, kernel_size=k_second, padding=(k_second - 1) // 2,
+            bias=False)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        """Apply the block to ``x``; dims 1 and 2 are swapped around the convs.
+
+        NOTE(review): presumably ``x`` is (batch, time, d_in) -- confirm.
+        """
+        hidden = F.relu(self.w_1(x.transpose(1, 2)))
+        hidden = self.dropout(hidden)
+
+        return self.w_2(hidden).transpose(1, 2)
+
+
+class MemoryBlockV2(nn.Module):
+    """Depthwise ``nn.Conv1d`` over dim 1 with a residual connection.
+
+    The input is zero-padded so the output keeps the input length; a
+    positive ``shift`` moves that many padding frames from right to left.
+    """
+
+    def __init__(self, d, filter_size, shift, dropout=0.0):
+        super(MemoryBlockV2, self).__init__()
+        # Integer split of the filter_size - 1 padding frames. The previous
+        # round((filter_size - 1) / 2) rounds half to even, so sizes such as
+        # 2 or 6 were padded one frame short and the residual add failed.
+        left_padding = filter_size // 2
+        right_padding = (filter_size - 1) // 2
+        if shift > 0:
+            left_padding += shift
+            right_padding -= shift
+        self.lp, self.rp = left_padding, right_padding
+        self.conv_dw = nn.Conv1d(d, d, filter_size, 1, 0, groups=d, bias=False)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, input, mask=None):
+        """Return ``dropout(conv(input) + input)``, zeroed where ``mask``."""
+        if mask is not None:
+            input = input.masked_fill(mask.unsqueeze(-1), 0)
+        x = F.pad(input, (0, 0, self.lp, self.rp, 0, 0), value=0.0)
+        output = self.conv_dw(x.transpose(1, 2)).transpose(1, 2) + input
+        output = self.dropout(output)
+        if mask is not None:
+            output = output.masked_fill(mask.unsqueeze(-1), 0)
+        return output
+
+
+class FsmnEncoderV2(nn.Module):
+    """Stack of ``FeedForwardNet`` + ``MemoryBlockV2`` pairs.
+
+    Args:
+        filter_size: filter size given to every memory block.
+        fsmn_num_layers: number of (feed-forward, memory block) pairs.
+        input_dim: input width of the first feed-forward net.
+        num_memory_units: output width of every feed-forward net and the
+            channel count of every memory block.
+        ffn_inner_dim: hidden width of every feed-forward net.
+        dropout: rate used inside the sub-modules and in ``forward``.
+        shift: one shift per layer as a list, or a single value that is
+            repeated for all layers.
+    """
+
+    def __init__(self, filter_size, fsmn_num_layers, input_dim,
+                 num_memory_units, ffn_inner_dim, dropout=0.0, shift=0):
+        super(FsmnEncoderV2, self).__init__()
+
+        self.filter_size = filter_size
+        self.fsmn_num_layers = fsmn_num_layers
+        self.num_memory_units = num_memory_units
+        self.ffn_inner_dim = ffn_inner_dim
+        self.dropout = dropout
+        self.shift = (shift if isinstance(shift, list) else
+                      [shift] * self.fsmn_num_layers)
+
+        # Only the first feed-forward net sees input_dim; later ones consume
+        # the previous layer's num_memory_units-wide output.
+        ffn_in_dims = [input_dim] + [num_memory_units] * (fsmn_num_layers - 1)
+        self.ffn_lst = nn.ModuleList([
+            FeedForwardNet(
+                d_in, ffn_inner_dim, num_memory_units, dropout=dropout)
+            for d_in in ffn_in_dims
+        ])
+        self.memory_block_lst = nn.ModuleList([
+            MemoryBlockV2(num_memory_units, filter_size, self.shift[idx],
+                          dropout) for idx in range(fsmn_num_layers)
+        ])
+
+    def forward(self, input, mask=None):
+        """Run all layers; a layer adds a residual when widths match."""
+        hidden = F.dropout(input, self.dropout, self.training)
+        for ffn, memory_block in zip(self.ffn_lst, self.memory_block_lst):
+            layer_out = memory_block(ffn(hidden), mask)
+            layer_out = F.dropout(layer_out, self.dropout, self.training)
+            if layer_out.size(-1) == hidden.size(-1):
+                layer_out += hidden
+            hidden = layer_out
+        return hidden
diff --git a/modelscope/models/audio/tts/models/models/sambert/kantts_sambert.py b/modelscope/models/audio/tts/models/models/sambert/kantts_sambert.py
new file mode 100644
index 00000000..3837a2e8
--- /dev/null
+++ b/modelscope/models/audio/tts/models/models/sambert/kantts_sambert.py
@@ -0,0 +1,718 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from modelscope.models.audio.tts.models.utils import get_mask_from_lengths
+from .adaptors import (LengthRegulator, VarFsmnRnnNARPredictor,
+                       VarRnnARPredictor)
+from .base import FFTBlock, PNCABlock, Prenet
+from .fsmn import FsmnEncoderV2
+from .positions import DurSinusoidalPositionEncoder, SinusoidalPositionEncoder
+
+
+class SelfAttentionEncoder(nn.Module):
+    """Stack of ``FFTBlock`` layers with position encoding and a LayerNorm.
+
+    The first block is built with ``d_in`` as its first argument and the
+    other ``n_layer - 1`` with ``d_model``. ``position_encoder`` must be a
+    ``SinusoidalPositionEncoder``; ``forward`` rejects anything else.
+    """
+
+    def __init__(self, n_layer, d_in, d_model, n_head, d_head, d_inner,
+                 dropout, dropout_att, dropout_relu, position_encoder):
+        super(SelfAttentionEncoder, self).__init__()
+        self.d_in = d_in
+        self.d_model = d_model
+        self.dropout = dropout
+        d_in_lst = [d_in] + [d_model] * (n_layer - 1)
+        self.fft = nn.ModuleList([
+            FFTBlock(d, d_model, n_head, d_head, d_inner, (3, 1), dropout,
+                     dropout_att, dropout_relu) for d in d_in_lst
+        ])
+        self.ln = nn.LayerNorm(d_model, eps=1e-6)
+        self.position_enc = position_encoder
+
+    def forward(self, input, mask=None, return_attns=False):
+        """Return ``(enc_output, attns)``; attns is empty unless requested."""
+        if not isinstance(self.position_enc, SinusoidalPositionEncoder):
+            raise NotImplementedError('modelscope error: position_enc invalid')
+        # Out-of-place scaling: the previous ``input *= ...`` overwrote the
+        # caller's tensor and fails on a leaf tensor that requires grad.
+        input = self.position_enc(input * self.d_model**0.5)
+        input = F.dropout(input, p=self.dropout, training=self.training)
+
+        slf_attn_mask = None
+        if mask is not None:
+            slf_attn_mask = mask.unsqueeze(1).expand(-1, input.size(1), -1)
+
+        enc_output, enc_slf_attn_list = input, []
+        for layer in self.fft:
+            enc_output, enc_slf_attn = layer(
+                enc_output, mask=mask, slf_attn_mask=slf_attn_mask)
+            if return_attns:
+                enc_slf_attn_list.append(enc_slf_attn)
+
+        return self.ln(enc_output), enc_slf_attn_list
+
+
+class HybridAttentionDecoder(nn.Module):
+    """Prenet, input projection, a stack of ``PNCABlock`` and an output head.
+
+    ``forward`` processes a whole sequence at once; ``infer`` processes the
+    single frame at index ``step``. Both use the banded boolean masks built
+    by ``get_pnca_attn_mask`` (True marks a position outside the band).
+    """
+
+    def __init__(self, d_in, prenet_units, n_layer, d_model, d_mem, n_head,
+                 d_head, d_inner, dropout, dropout_att, dropout_relu, d_out):
+        super(HybridAttentionDecoder, self).__init__()
+
+        self.d_model = d_model
+        self.dropout = dropout
+        self.prenet = Prenet(d_in, prenet_units, d_model)
+        # The projection consumes the memory frame concatenated with the
+        # prenet output, hence d_model + d_mem input features.
+        self.dec_in_proj = nn.Linear(d_model + d_mem, d_model)
+        self.pnca = nn.ModuleList([
+            PNCABlock(d_model, d_mem, n_head, d_head, d_inner, (1, 1), dropout,
+                      dropout_att, dropout_relu) for _ in range(n_layer)
+        ])
+        self.ln = nn.LayerNorm(d_model, eps=1e-6)
+        self.dec_out_proj = nn.Linear(d_model, d_out)
+
+    def reset_state(self):
+        """Call ``reset_state`` on every PNCA layer."""
+        for layer in self.pnca:
+            layer.reset_state()
+
+    def get_pnca_attn_mask(self, device, max_len, x_band_width, h_band_width,
+                           mask=None):
+        """Build the banded masks for the x and h attention branches.
+
+        With row index ``k`` and column index ``j`` the x mask is False for
+        ``max(k - x_band_width, 0) <= j <= k`` and the h mask is False for
+        ``k <= j <= k + h_band_width``; everything else is True.
+
+        Returns:
+            ``(pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask)``. The
+            first is ``mask`` expanded to ``max_len`` rows, or None.
+        """
+        pnca_attn_mask = None
+        if mask is not None:
+            pnca_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1)
+
+        # Create the index vector on the target device directly.
+        range_ = torch.arange(max_len, device=device)
+        x_start = torch.clamp_min(range_ - x_band_width, 0)[None, None, :]
+        x_end = (range_ + 1)[None, None, :]
+        h_start = range_[None, None, :]
+        h_end = torch.clamp_max(range_ + h_band_width + 1,
+                                max_len + 1)[None, None, :]
+
+        pnca_x_attn_mask = ~((x_start <= range_[None, :, None])
+                             & (x_end > range_[None, :, None])).transpose(1, 2)  # yapf:disable
+        pnca_h_attn_mask = ~((h_start <= range_[None, :, None])
+                             & (h_end > range_[None, :, None])).transpose(1, 2)  # yapf:disable
+
+        if pnca_attn_mask is not None:
+            # Block the masked columns, then clear the rows of masked
+            # positions so those rows end up all False.
+            pnca_x_attn_mask = (pnca_x_attn_mask | pnca_attn_mask)
+            pnca_h_attn_mask = (pnca_h_attn_mask | pnca_attn_mask)
+            pnca_x_attn_mask = pnca_x_attn_mask.masked_fill(
+                pnca_attn_mask.transpose(1, 2), False)
+            pnca_h_attn_mask = pnca_h_attn_mask.masked_fill(
+                pnca_attn_mask.transpose(1, 2), False)
+
+        return pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask
+
+    def forward(self, input, memory, x_band_width, h_band_width, mask=None,
+                return_attns=False):
+        """Decode a full sequence; ``reset_state`` must be called first.
+
+        Returns:
+            ``(dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list)``;
+            the two lists are only filled when ``return_attns`` is true.
+        """
+        input = self.prenet(input)
+        input = torch.cat([memory, input], dim=-1)
+        input = self.dec_in_proj(input)
+        if mask is not None:
+            input = input.masked_fill(mask.unsqueeze(-1), 0)
+
+        input *= self.d_model**0.5
+        input = F.dropout(input, p=self.dropout, training=self.training)
+
+        _, pnca_x_attn_mask, pnca_h_attn_mask = self.get_pnca_attn_mask(
+            input.device, input.size(1), x_band_width, h_band_width, mask)
+
+        dec_pnca_attn_x_list, dec_pnca_attn_h_list = [], []
+        dec_output = input
+        for layer in self.pnca:
+            dec_output, dec_pnca_attn_x, dec_pnca_attn_h = layer(
+                dec_output, memory, mask=mask,
+                pnca_x_attn_mask=pnca_x_attn_mask,
+                pnca_h_attn_mask=pnca_h_attn_mask)
+            if return_attns:
+                dec_pnca_attn_x_list.append(dec_pnca_attn_x)
+                dec_pnca_attn_h_list.append(dec_pnca_attn_h)
+
+        dec_output = self.dec_out_proj(self.ln(dec_output))
+        return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list
+
+    def infer(self, step, input, memory, x_band_width, h_band_width,
+              mask=None, return_attns=False):
+        """Decode the frame at ``step``; call ``reset_state`` before step 0.
+
+        Returns the same 3-tuple as ``forward`` for this single frame.
+        """
+        max_len = memory.size(1)
+        input = self.prenet(input)
+        input = torch.cat([memory[:, step:step + 1, :], input], dim=-1)
+        input = self.dec_in_proj(input)
+        input *= self.d_model**0.5
+        input = F.dropout(input, p=self.dropout, training=self.training)
+
+        _, pnca_x_attn_mask, pnca_h_attn_mask = self.get_pnca_attn_mask(
+            input.device, max_len, x_band_width, h_band_width, mask)
+
+        # The per-step slices are the same for every layer, so they are
+        # taken once here instead of once per loop iteration.
+        mask_step = mask[:, step:step + 1] if mask is not None else None
+        x_mask_step = pnca_x_attn_mask[:, step:step + 1, :(step + 1)]
+        h_mask_step = pnca_h_attn_mask[:, step:step + 1, :]
+
+        dec_pnca_attn_x_list, dec_pnca_attn_h_list = [], []
+        dec_output = input
+        for layer in self.pnca:
+            dec_output, dec_pnca_attn_x, dec_pnca_attn_h = layer(
+                dec_output, memory, mask=mask_step,
+                pnca_x_attn_mask=x_mask_step, pnca_h_attn_mask=h_mask_step)
+            if return_attns:
+                dec_pnca_attn_x_list.append(dec_pnca_attn_x)
+                dec_pnca_attn_h_list.append(dec_pnca_attn_h)
+
+        dec_output = self.dec_out_proj(self.ln(dec_output))
+        return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list
+
+
+class TextFftEncoder(nn.Module):
+    """Embeds four linguistic feature streams and encodes their sum.
+
+    Args:
+        config: nested dict; all hyper-parameters are read from
+            ``config['am']``.
+        ling_unit_size: dict with the vocabulary sizes under the keys
+            ``'sy'``, ``'tone'``, ``'syllable_flag'`` and ``'word_segment'``.
+    """
+
+    def __init__(self, config, ling_unit_size):
+        super(TextFftEncoder, self).__init__()
+
+        vocab_sy = ling_unit_size['sy']
+        vocab_tone = ling_unit_size['tone']
+        vocab_syllable_flag = ling_unit_size['syllable_flag']
+        vocab_ws = ling_unit_size['word_segment']
+
+        am = config['am']
+        max_len = am['max_len']
+        d_emb = am['embedding_dim']
+        n_layers = am['encoder_num_layers']
+        n_heads = am['encoder_num_heads']
+        d_model = am['encoder_num_units']
+        d_inner = am['encoder_ffn_inner_dim']
+        dropout = am['encoder_dropout']
+        dropout_attn = am['encoder_attention_dropout']
+        dropout_relu = am['encoder_relu_dropout']
+        d_proj = am['encoder_projection_units']
+
+        self.d_model = d_model
+
+        # One lookup table per stream, all with the same embedding width.
+        self.sy_emb = nn.Embedding(vocab_sy, d_emb)
+        self.tone_emb = nn.Embedding(vocab_tone, d_emb)
+        self.syllable_flag_emb = nn.Embedding(vocab_syllable_flag, d_emb)
+        self.ws_emb = nn.Embedding(vocab_ws, d_emb)
+
+        # Each attention head gets an equal share of the model width.
+        self.ling_enc = SelfAttentionEncoder(
+            n_layers, d_emb, d_model, n_heads, d_model // n_heads, d_inner,
+            dropout, dropout_attn, dropout_relu,
+            SinusoidalPositionEncoder(max_len, d_emb))
+
+        self.ling_proj = nn.Linear(d_model, d_proj, bias=False)
+
+    def forward(self, inputs_ling, masks=None, return_attns=False):
+        """Encode ``inputs_ling``; returns ``(enc_output, enc_slf_attn_list)``.
+
+        The last axis of ``inputs_ling`` is indexed 0..3 for, in order, the
+        sy, tone, syllable-flag and word-segment ids.
+        """
+        ling_embedding = self.sy_emb(inputs_ling[:, :, 0])
+        for idx, table in enumerate(
+                (self.tone_emb, self.syllable_flag_emb, self.ws_emb), 1):
+            ling_embedding = ling_embedding + table(inputs_ling[:, :, idx])
+
+        enc_output, enc_slf_attn_list = self.ling_enc(ling_embedding, masks,
+                                                      return_attns)
+
+        return self.ling_proj(enc_output), enc_slf_attn_list
+
+
+class VarianceAdaptor(nn.Module):
+    """Predicts pitch, energy and duration and length-regulates the inputs.
+
+    All sizes are read from ``config['am']``. When a ``*_targets`` argument
+    of ``forward`` is given it is used in place of the matching prediction
+    for the embedding / length-regulation steps; the predictions themselves
+    are still returned.
+    """
+
+    def __init__(self, config):
+        super(VarianceAdaptor, self).__init__()
+
+        am = config['am']
+        d_text = am['encoder_projection_units']
+        # The predictors are sized for text, emotion and speaker features
+        # concatenated along the last axis (see ``forward``).
+        input_dim = d_text + am['emotion_units'] + am['speaker_units']
+
+        # Pitch and energy predictors share one set of hyper-parameters.
+        fsmn_predictor_args = (
+            input_dim,
+            am['predictor_filter_size'],
+            am['predictor_fsmn_num_layers'],
+            am['predictor_num_memory_units'],
+            am['predictor_ffn_inner_dim'],
+            am['predictor_dropout'],
+            am['predictor_shift'],
+            am['predictor_lstm_units'],
+        )
+        self.pitch_predictor = VarFsmnRnnNARPredictor(*fsmn_predictor_args)
+        self.energy_predictor = VarFsmnRnnNARPredictor(*fsmn_predictor_args)
+
+        self.duration_predictor = VarRnnARPredictor(
+            input_dim, am['dur_pred_prenet_units'], am['dur_pred_lstm_units'])
+
+        # Both of the following are configured with outputs_per_step.
+        self.length_regulator = LengthRegulator(am['outputs_per_step'])
+        self.dur_position_encoder = DurSinusoidalPositionEncoder(
+            d_text, am['outputs_per_step'])
+
+        # Scalar pitch / energy tracks are lifted to the text width so they
+        # can be added onto the text embedding.
+        self.pitch_emb = nn.Conv1d(1, d_text, kernel_size=9, padding=4)
+        self.energy_emb = nn.Conv1d(1, d_text, kernel_size=9, padding=4)
+
+    def forward(self,
+                inputs_text_embedding,
+                inputs_emo_embedding,
+                inputs_spk_embedding,
+                masks=None,
+                output_masks=None,
+                duration_targets=None,
+                pitch_targets=None,
+                energy_targets=None):
+        """Predict the variances and expand the inputs by duration.
+
+        Args:
+            inputs_text_embedding, inputs_emo_embedding,
+            inputs_spk_embedding: feature tensors that are concatenated
+                along the last axis.
+            masks: passed to the three predictors.
+            output_masks: passed to the length regulator and the duration
+                position encoder.
+            duration_targets, pitch_targets, energy_targets: optional
+                ground-truth values; each replaces its prediction in the
+                later steps when given.
+
+        Returns:
+            Tuple ``(LR_text_outputs, LR_emo_outputs, LR_spk_outputs,
+            LR_length_rounded, log_duration_predictions, pitch_predictions,
+            energy_predictions)``.
+        """
+        batch_size = inputs_text_embedding.size(0)
+        spk_emo = [inputs_spk_embedding, inputs_emo_embedding]
+
+        variance_predictor_inputs = torch.cat(
+            [inputs_text_embedding] + spk_emo, dim=-1)
+
+        pitch_predictions = self.pitch_predictor(variance_predictor_inputs,
+                                                 masks)
+        energy_predictions = self.energy_predictor(variance_predictor_inputs,
+                                                   masks)
+
+        # Embed the targets when they are given, otherwise the predictions.
+        pitch_values = pitch_targets
+        if pitch_values is None:
+            pitch_values = pitch_predictions
+        pitch_embeddings = self.pitch_emb(
+            pitch_values.unsqueeze(1)).transpose(1, 2)
+
+        energy_values = energy_targets
+        if energy_values is None:
+            energy_values = energy_predictions
+        energy_embeddings = self.energy_emb(
+            energy_values.unsqueeze(1)).transpose(1, 2)
+
+        inputs_text_embedding_aug = (
+            inputs_text_embedding + pitch_embeddings + energy_embeddings)
+        duration_predictor_cond = torch.cat(
+            [inputs_text_embedding_aug] + spk_emo, dim=-1)
+
+        if duration_targets is None:
+            log_duration_predictions = self.duration_predictor.infer(
+                duration_predictor_cond, masks=masks)
+            # Predictions live in the log(1 + d) domain; map them back.
+            durations = torch.exp(log_duration_predictions) - 1
+        else:
+            # Shift the targets right by one step behind an all-zero frame
+            # and feed them to the predictor in the log(1 + d) domain.
+            go_frame = torch.zeros(batch_size, 1).to(
+                inputs_text_embedding.device)
+            shifted_targets = torch.cat(
+                [go_frame, duration_targets[:, :-1].float()], dim=-1)
+            log_duration_predictions, _ = self.duration_predictor(
+                torch.log(shifted_targets + 1).unsqueeze(-1),
+                duration_predictor_cond,
+                masks=masks)
+            durations = duration_targets
+
+        # The same durations expand the text, emotion and speaker streams.
+        LR_text_outputs, LR_length_rounded = self.length_regulator(
+            inputs_text_embedding_aug, durations, masks=output_masks)
+        LR_position_embeddings = self.dur_position_encoder(
+            durations, masks=output_masks)
+        LR_emo_outputs, _ = self.length_regulator(
+            inputs_emo_embedding, durations, masks=output_masks)
+        LR_spk_outputs, _ = self.length_regulator(
+            inputs_spk_embedding, durations, masks=output_masks)
+
+        # Position embeddings are added onto the regulated text stream only.
+        LR_text_outputs = LR_text_outputs + LR_position_embeddings
+
+        return (LR_text_outputs, LR_emo_outputs, LR_spk_outputs,
+                LR_length_rounded, log_duration_predictions, pitch_predictions,
+                energy_predictions)
+
+
+class MelPNCADecoder(nn.Module):
+    """Wraps ``HybridAttentionDecoder`` for mel frames.
+
+    ``forward`` runs the decoder over the whole sequence when ``target`` is
+    given, and otherwise feeds each step's output back as the next input.
+    """
+
+    def __init__(self, config):
+        super(MelPNCADecoder, self).__init__()
+
+        am = config['am']
+        n_layers = am['decoder_num_layers']
+        n_heads = am['decoder_num_heads']
+        d_model = am['decoder_num_units']
+        outputs_per_step = am['outputs_per_step']
+        d_mel = am['num_mels']
+        # Memory width as computed here: projection units times
+        # outputs_per_step, plus the emotion and speaker units.
+        d_mem = (am['encoder_projection_units'] * outputs_per_step
+                 + am['emotion_units'] + am['speaker_units'])
+
+        self.d_mel = d_mel
+        self.r = outputs_per_step
+        self.nb_layers = n_layers
+
+        # Each decoder step emits outputs_per_step mel frames at once.
+        self.mel_dec = HybridAttentionDecoder(
+            d_mel, am['decoder_prenet_units'], n_layers, d_model, d_mem,
+            n_heads, d_model // n_heads, am['decoder_ffn_inner_dim'],
+            am['decoder_dropout'], am['decoder_attention_dropout'],
+            am['decoder_relu_dropout'], d_mel * outputs_per_step)
+
+    def forward(self,
+                memory,
+                x_band_width,
+                h_band_width,
+                target=None,
+                mask=None,
+                return_attns=False):
+        """Decode ``memory`` into mel outputs.
+
+        Returns:
+            ``(dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list)``. In
+            the step-by-step branch each list holds one tensor per layer,
+            concatenated over the steps along dim 1.
+        """
+        batch_size = memory.size(0)
+        go_frame = torch.zeros((batch_size, 1, self.d_mel)).to(memory.device)
+        self.mel_dec.reset_state()
+
+        if target is not None:
+            # Inputs are every r-th target frame, shifted right by one step
+            # behind the all-zero go frame.
+            input = torch.cat([go_frame, target[:, self.r - 1::self.r, :]],
+                              dim=1)[:, :-1, :]
+            return self.mel_dec(
+                input, memory, x_band_width, h_band_width, mask=mask,
+                return_attns=return_attns)
+
+        n_steps = memory.size(1)
+        step_outputs = []
+        attn_x_per_layer = [[] for _ in range(self.nb_layers)]
+        attn_h_per_layer = [[] for _ in range(self.nb_layers)]
+        input = go_frame
+        for step in range(n_steps):
+            step_output, step_attn_x, step_attn_h = self.mel_dec.infer(
+                step, input, memory, x_band_width, h_band_width, mask=mask,
+                return_attns=return_attns)
+            # The last d_mel values of this step feed the next step.
+            input = step_output[:, :, -self.d_mel:]
+            step_outputs.append(step_output)
+
+            for layer_id, (attn_x, attn_h) in enumerate(
+                    zip(step_attn_x, step_attn_h)):
+                missing = n_steps - attn_x.size(-1)
+                if missing > 0:
+                    # Zero-pad the x attention up to the memory length.
+                    zeros = torch.zeros(
+                        (attn_x.size(0), 1, missing)).to(attn_x)
+                    attn_x = torch.cat([attn_x, zeros], dim=-1)
+                attn_x_per_layer[layer_id].append(attn_x)
+                attn_h_per_layer[layer_id].append(attn_h)
+
+        dec_output = torch.cat(step_outputs, dim=1)
+        dec_pnca_attn_x_list = [
+            torch.cat(per_step, dim=1) for per_step in attn_x_per_layer]
+        dec_pnca_attn_h_list = [
+            torch.cat(per_step, dim=1) for per_step in attn_h_per_layer]
+        return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list
+
+
+class PostNet(nn.Module):
+    """FSMN encoder, single-layer LSTM and a linear layer back to num_mels.
+
+    All hyper-parameters are the ``postnet_*`` entries (plus ``num_mels``)
+    of ``config['am']``; each is also kept as an attribute.
+    """
+
+    def __init__(self, config):
+        super(PostNet, self).__init__()
+        am = config['am']
+        self.filter_size = am['postnet_filter_size']
+        self.fsmn_num_layers = am['postnet_fsmn_num_layers']
+        self.num_memory_units = am['postnet_num_memory_units']
+        self.ffn_inner_dim = am['postnet_ffn_inner_dim']
+        self.dropout = am['postnet_dropout']
+        self.shift = am['postnet_shift']
+        self.lstm_units = am['postnet_lstm_units']
+        self.num_mels = am['num_mels']
+
+        self.fsmn = FsmnEncoderV2(self.filter_size, self.fsmn_num_layers,
+                                  self.num_mels, self.num_memory_units,
+                                  self.ffn_inner_dim, self.dropout, self.shift)
+        self.lstm = nn.LSTM(self.num_memory_units, self.lstm_units,
+                            num_layers=1, batch_first=True)
+        self.fc = nn.Linear(self.lstm_units, self.num_mels)
+
+    def forward(self, x, mask=None):
+        """Return ``fc(lstm(fsmn(x, mask)))``."""
+        # The sequence is fed to the LSTM unpacked; packing is skipped for
+        # simplicity since the LSTM is uni-directional and masks are applied.
+        hidden, _ = self.lstm(self.fsmn(x, mask))
+        return self.fc(hidden)
+
+
+def mel_recon_loss_fn(output_lengths, mel_targets, dec_outputs,
+                      postnet_outputs=None):
+    """Masked mean absolute error of the decoder and post-net outputs.
+
+    Returns:
+        ``(mel_loss_, mel_loss)``: the loss of ``dec_outputs`` and the loss
+        of ``postnet_outputs`` (the float ``0.0`` when the latter is None).
+    """
+    # The helper's mask is inverted; the sum of the inverted mask is the
+    # frame count used for normalisation.
+    valid_mask = ~get_mask_from_lengths(
+        output_lengths, max_len=mel_targets.size(1))
+    frame_weights = valid_mask.unsqueeze(-1)
+    normalizer = valid_mask.sum() * mel_targets.size(-1)
+    abs_error = nn.L1Loss(reduction='none')
+
+    def masked_mean(outputs):
+        weighted = abs_error(mel_targets, outputs) * frame_weights
+        return torch.sum(weighted) / normalizer
+
+    mel_loss_ = masked_mean(dec_outputs)
+    mel_loss = 0.0 if postnet_outputs is None else masked_mean(postnet_outputs)
+
+    return mel_loss_, mel_loss
+
+
+def prosody_recon_loss_fn(input_lengths, duration_targets, pitch_targets,
+                          energy_targets, log_duration_predictions,
+                          pitch_predictions, energy_predictions):
+    """Return masked MAE losses ``(dur_loss, pitch_loss, energy_loss)``.
+
+    Durations are compared in the log domain: ``log(duration_targets + 1)``
+    against ``log_duration_predictions``.
+    """
+    valid_mask = ~get_mask_from_lengths(
+        input_lengths, max_len=duration_targets.size(1))
+    normalizer = valid_mask.sum()
+    abs_error = nn.L1Loss(reduction='none')
+
+    def masked_mean(targets, predictions):
+        weighted = abs_error(targets, predictions) * valid_mask
+        return torch.sum(weighted) / normalizer
+
+    log_duration_targets = torch.log(duration_targets.float() + 1)
+    dur_loss = masked_mean(log_duration_targets, log_duration_predictions)
+    pitch_loss = masked_mean(pitch_targets, pitch_predictions)
+    energy_loss = masked_mean(energy_targets, energy_predictions)
+    return dur_loss, pitch_loss, energy_loss
+
+
+class KanTtsSAMBERT(nn.Module):
+    """Text encoder, variance adaptor, mel decoder and post-net in one model.
+
+    Args:
+        config: nested dict; sub-modules read their settings from
+            ``config['am']``.
+        ling_unit_size: dict of vocabulary sizes; the ``'speaker'`` and
+            ``'emotion'`` entries size the two embeddings created here.
+    """
+
+    def __init__(self, config, ling_unit_size):
+        super(KanTtsSAMBERT, self).__init__()
+
+        am = config['am']
+        self.text_encoder = TextFftEncoder(config, ling_unit_size)
+        self.spk_tokenizer = nn.Embedding(ling_unit_size['speaker'],
+                                          am['speaker_units'])
+        self.emo_tokenizer = nn.Embedding(ling_unit_size['emotion'],
+                                          am['emotion_units'])
+        self.variance_adaptor = VarianceAdaptor(config)
+        self.mel_decoder = MelPNCADecoder(config)
+        self.mel_postnet = PostNet(config)
+
+    def get_lfr_mask_from_lengths(self, lengths, max_len):
+        """Mask for ``lengths`` and ``max_len`` divided by ``r``.
+
+        Each length is divided by ``self.mel_decoder.r`` rounding up;
+        ``max_len`` is divided rounding down.
+        """
+        r = self.mel_decoder.r
+        padded_lr_lengths = torch.zeros_like(lengths)
+        for i in range(lengths.size(0)):
+            # Ceiling division: a partly filled group still counts as one.
+            padded_lr_lengths[i] = -(-int(lengths[i].item()) // r)
+
+        return get_mask_from_lengths(padded_lr_lengths, max_len=max_len // r)
+
+    def forward(self, inputs_ling, inputs_emotion, inputs_speaker,
+                input_lengths, output_lengths=None, mel_targets=None,
+                duration_targets=None, pitch_targets=None,
+                energy_targets=None):
+        """Run the full model and return a dict of outputs.
+
+        ``output_lengths`` selects how the output masks are built: from
+        ``output_lengths`` / ``mel_targets`` when given, otherwise from the
+        rounded lengths reported by the variance adaptor. The band widths
+        come from ``duration_targets`` when given, else from the predicted
+        durations.
+
+        Returns:
+            dict with the keys ``x_band_width``, ``h_band_width``,
+            ``enc_slf_attn_lst``, ``pnca_x_attn_lst``, ``pnca_h_attn_lst``,
+            ``dec_outputs``, ``postnet_outputs``, ``LR_length_rounded``,
+            ``log_duration_predictions``, ``pitch_predictions``,
+            ``energy_predictions``, ``LR_text_outputs``, ``LR_emo_outputs``
+            and ``LR_spk_outputs``.
+        """
+        batch_size = inputs_ling.size(0)
+        r = self.mel_decoder.r
+
+        input_masks = get_mask_from_lengths(
+            input_lengths, max_len=inputs_ling.size(1))
+        text_hid, enc_sla_attn_lst = self.text_encoder(
+            inputs_ling, input_masks, return_attns=True)
+        emo_hid = self.emo_tokenizer(inputs_emotion)
+        spk_hid = self.spk_tokenizer(inputs_speaker)
+
+        output_masks = None
+        if output_lengths is not None:
+            output_masks = get_mask_from_lengths(
+                output_lengths, max_len=mel_targets.size(1))
+
+        (LR_text_outputs, LR_emo_outputs, LR_spk_outputs, LR_length_rounded,
+         log_duration_predictions, pitch_predictions,
+         energy_predictions) = self.variance_adaptor(
+             text_hid, emo_hid, spk_hid, masks=input_masks,
+             output_masks=output_masks, duration_targets=duration_targets,
+             pitch_targets=pitch_targets, energy_targets=energy_targets)
+
+        lfr_masks = None
+        regulated_len = LR_text_outputs.size(1)
+        if output_lengths is not None:
+            lfr_masks = self.get_lfr_mask_from_lengths(
+                output_lengths, max_len=regulated_len)
+        else:
+            output_masks = get_mask_from_lengths(
+                LR_length_rounded, max_len=regulated_len)
+
+        # LFR: fold every r consecutive frames into one wider frame. For the
+        # emotion and speaker streams only the first frame's share is kept.
+        def fold_frames(frames, width):
+            return frames.contiguous().view(batch_size, -1, r * width)
+
+        d_emo, d_spk = emo_hid.shape[-1], spk_hid.shape[-1]
+        memory = torch.cat([
+            fold_frames(LR_text_outputs, text_hid.shape[-1]),
+            fold_frames(LR_spk_outputs, d_spk)[:, :, :d_spk],
+            fold_frames(LR_emo_outputs, d_emo)[:, :, :d_emo],
+        ], dim=-1)
+
+        if duration_targets is not None:
+            peak = duration_targets.float().masked_fill(input_masks, 0).max()
+        else:
+            peak = (torch.exp(log_duration_predictions) - 1).max()
+        # Longest duration expressed in decoder steps, rounded to nearest.
+        x_band_width = int(peak / r + 0.5)
+        h_band_width = x_band_width
+
+        dec_outputs, pnca_x_attn_lst, pnca_h_attn_lst = self.mel_decoder(
+            memory, x_band_width, h_band_width, target=mel_targets,
+            mask=lfr_masks, return_attns=True)
+
+        # De-LFR: unfold back to rows of d_mel values.
+        dec_outputs = dec_outputs.contiguous().view(batch_size, -1,
+                                                    self.mel_decoder.d_mel)
+        if output_masks is not None:
+            dec_outputs = dec_outputs.masked_fill(
+                output_masks.unsqueeze(-1), 0)
+
+        # The post-net output is added onto the decoder output.
+        postnet_outputs = self.mel_postnet(dec_outputs,
+                                           output_masks) + dec_outputs
+        if output_masks is not None:
+            postnet_outputs = postnet_outputs.masked_fill(
+                output_masks.unsqueeze(-1), 0)
+
+        return {
+            'x_band_width': x_band_width,
+            'h_band_width': h_band_width,
+            'enc_slf_attn_lst': enc_sla_attn_lst,
+            'pnca_x_attn_lst': pnca_x_attn_lst,
+            'pnca_h_attn_lst': pnca_h_attn_lst,
+            'dec_outputs': dec_outputs,
+            'postnet_outputs': postnet_outputs,
+            'LR_length_rounded': LR_length_rounded,
+            'log_duration_predictions': log_duration_predictions,
+            'pitch_predictions': pitch_predictions,
+            'energy_predictions': energy_predictions,
+            'LR_text_outputs': LR_text_outputs,
+            'LR_emo_outputs': LR_emo_outputs,
+            'LR_spk_outputs': LR_spk_outputs,
+        }
diff --git a/modelscope/models/audio/tts/models/models/sambert/positions.py b/modelscope/models/audio/tts/models/models/sambert/positions.py
new file mode 100644
index 00000000..9d1e375d
--- /dev/null
+++ b/modelscope/models/audio/tts/models/models/sambert/positions.py
@@ -0,0 +1,172 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class SinusoidalPositionEncoder(nn.Module):
+    """Adds a fixed sinusoidal position encoding to its input.
+
+    The encoding table is held in ``position_enc``, a parameter with
+    ``requires_grad=False`` and shape ``(1, max_len, depth)``. When an input
+    longer than ``max_len`` is seen, the table is rebuilt for the new length.
+
+    Args:
+        max_len: number of positions to pre-compute.
+        depth: width of the encoding; ``forward`` adds the table to its input,
+            so this is expected to match the last dimension of that input.
+    """
+
+    def __init__(self, max_len, depth):
+        super(SinusoidalPositionEncoder, self).__init__()
+
+        self.max_len = max_len
+        self.depth = depth
+        self.position_enc = nn.Parameter(
+            self.get_sinusoid_encoding_table(max_len, depth).unsqueeze(0),
+            requires_grad=False)
+
+    def forward(self, input):
+        """Return ``input`` with the position encoding added.
+
+        Args:
+            input: 3-D tensor; dimension 0 is treated as the batch and
+                dimension 1 as the sequence length.
+
+        Returns:
+            The sum of ``input`` and the first ``input.size(1)`` rows of the
+            encoding table, expanded over the batch dimension.
+        """
+        bz_in, len_in, _ = input.size()
+        if len_in > self.max_len:
+            # Input is longer than the cached table: rebuild it in place.
+            self.max_len = len_in
+            self.position_enc.data = self.get_sinusoid_encoding_table(
+                self.max_len, self.depth).unsqueeze(0).to(input.device)
+
+        output = input + self.position_enc[:, :len_in, :].expand(bz_in, -1, -1)
+
+        return output
+
+    @staticmethod
+    def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
+        """Build the sinusoid position encoding table.
+
+        Row ``i`` encodes position ``i + 1``. Columns ``[0, d_hid // 2)`` hold
+        ``sin(position / timescale_j)`` and the remaining columns hold the
+        matching cosines, with
+        ``timescale_j = 10000 ** (j / max(d_hid / 2 - 1, 1))``.
+
+        Args:
+            n_position: number of rows (positions) to generate.
+            d_hid: number of columns; expected to be even.
+            padding_idx: optional row index whose encoding is zeroed.
+
+        Returns:
+            A ``torch.FloatTensor`` of shape ``(n_position, d_hid)``.
+        """
+        half = d_hid // 2
+        # Clamp the denominator so that d_hid == 2 (a single timescale) does
+        # not divide by zero; for d_hid >= 4 the value is unchanged.
+        denominator = max(d_hid / 2 - 1, 1)
+        timescales = np.power(10000.0, np.arange(half) / float(denominator))
+
+        # Positions are 1-based; broadcasting builds the whole table at once
+        # instead of evaluating np.power once per (position, column) pair.
+        positions = np.arange(1, n_position + 1, dtype=np.float64)
+        scaled_time_table = positions[:, None] / timescales[None, :]
+
+        sinusoid_table = np.zeros((n_position, d_hid))
+        sinusoid_table[:, :half] = np.sin(scaled_time_table)
+        sinusoid_table[:, half:] = np.cos(scaled_time_table)
+
+        if padding_idx is not None:
+            # zero vector for padding dimension
+            sinusoid_table[padding_idx] = 0.0
+
+        return torch.FloatTensor(sinusoid_table)
+
+
+class DurSinusoidalPositionEncoder(nn.Module):
+    """Sinusoidal encoding of each frame's position inside its token.
+
+    Every token is repeated according to its rounded duration; each
+    resulting frame is encoded by its 1-based index within that token.
+
+    Args:
+        depth: size of the last dimension of the returned encoding.
+        outputs_per_step: the frame axis of the result is zero-padded up to
+            a multiple of this value.
+    """
+
+    def __init__(self, depth, outputs_per_step):
+        super(DurSinusoidalPositionEncoder, self).__init__()
+
+        self.depth = depth
+        self.outputs_per_step = outputs_per_step
+
+        # Despite the name, these are the timescales themselves: ``forward``
+        # divides positions by them. The attribute name is kept as-is because
+        # it is an nn.Parameter and so is part of the module's state dict.
+        inv_timescales = [
+            np.power(10000, 2 * (hid_idx // 2) / depth)
+            for hid_idx in range(depth)
+        ]
+        self.inv_timescales = nn.Parameter(
+            torch.FloatTensor(inv_timescales), requires_grad=False)
+
+    def forward(self, durations, masks=None):
+        """Compute the in-token position encoding for every output frame.
+
+        Args:
+            durations: 2-D tensor of per-token durations, indexed as
+                (batch, token); converted with ``(durations + 0.5).long()``.
+            masks: optional mask passed to ``masked_fill``; its size along
+                dim 1 must equal the longest expanded sequence. Positions
+                where it is true get position 0.
+
+        Returns:
+            Tensor of shape ``(batch, frames, depth)`` where ``frames`` is
+            the longest expanded length rounded up to a multiple of
+            ``outputs_per_step``. Even channels hold sines and odd channels
+            cosines.
+        """
+        reps = (durations + 0.5).long()
+        output_lens = reps.sum(dim=1)
+        max_len = output_lens.max()
+        # Frame index at which each token starts/ends: [B, 1, tokens + 1].
+        reps_cumsum = torch.cumsum(
+            F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[:, None, :]
+        # Create the frame index directly on the device of ``durations``.
+        range_ = torch.arange(
+            int(max_len), device=durations.device)[None, :, None]
+        # mult[b, t, i] is set when frame t falls inside token i.
+        mult = ((reps_cumsum[:, :, :-1] <= range_)
+                & (reps_cumsum[:, :, 1:] > range_))  # yapf:disable
+        mult = mult.float()
+        # Start frame of the token that owns each frame.
+        offsets = torch.matmul(mult,
+                               reps_cumsum[:,
+                                           0, :-1].unsqueeze(-1)).squeeze(-1)
+        dur_pos = range_[:, :, 0] - offsets + 1
+
+        if masks is not None:
+            assert masks.size(1) == dur_pos.size(1)
+            dur_pos = dur_pos.masked_fill(masks, 0.0)
+
+        # Zero-pad the frame axis up to a multiple of outputs_per_step.
+        seq_len = dur_pos.size(1)
+        padding = self.outputs_per_step - int(seq_len) % self.outputs_per_step
+        if (padding < self.outputs_per_step):
+            dur_pos = F.pad(dur_pos, (0, padding, 0, 0), value=0.0)
+
+        position_embedding = dur_pos[:, :, None] / self.inv_timescales[None,
+                                                                       None, :]
+        position_embedding[:, :, 0::2] = torch.sin(position_embedding[:, :,
+                                                                      0::2])
+        position_embedding[:, :, 1::2] = torch.cos(position_embedding[:, :,
+                                                                      1::2])
+
+        return position_embedding
diff --git a/modelscope/models/audio/tts/models/position.py b/modelscope/models/audio/tts/models/position.py
deleted file mode 100755
index bca658dd..00000000
--- a/modelscope/models/audio/tts/models/position.py
+++ /dev/null
@@ -1,174 +0,0 @@
-"""Define position encoder classes."""
-
-import abc
-import math
-
-import tensorflow as tf
-
-from .reducer import SumReducer
-
-
-class PositionEncoder(tf.keras.layers.Layer):
-    """Base class for position encoders."""
-
-    def __init__(self, reducer=None, **kwargs):
-        """Initializes the position encoder.
-        Args:
-          reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position
-            encodings. Defaults to :class:`opennmt.layers.SumReducer`.
-          **kwargs: Additional layer keyword arguments.
-        """
-        super(PositionEncoder, self).__init__(**kwargs)
-        if reducer is None:
-            reducer = SumReducer(dtype=kwargs.get('dtype'))
-        self.reducer = reducer
-
-    def call(self, inputs, position=None):  # pylint: disable=arguments-differ
-        """Add position encodings to :obj:`inputs`.
-        Args:
-          inputs: The inputs to encode.
-          position: The single position to encode, to use when this layer is called
-            step by step.
-        Returns:
-          A ``tf.Tensor`` whose shape depends on the configured ``reducer``.
-        """
-        batch_size = tf.shape(inputs)[0]
-        timesteps = tf.shape(inputs)[1]
-        input_dim = inputs.shape[-1].value
-        positions = tf.range(timesteps) + 1 if position is None else [position]
-        position_encoding = self._encode([positions], input_dim)
-        position_encoding = tf.tile(position_encoding, [batch_size, 1, 1])
-        return self.reducer([inputs, position_encoding])
-
-    @abc.abstractmethod
-    def _encode(self, positions, depth):
-        """Creates position encodings.
-        Args:
-          positions: The positions to encode of shape :math:`[B, ...]`.
-          depth: The encoding depth :math:`D`.
-        Returns:
-          A ``tf.Tensor`` of shape :math:`[B, ..., D]`.
-        """
-        raise NotImplementedError()
-
-
-class PositionEmbedder(PositionEncoder):
-    """Encodes position with a lookup table."""
-
-    def __init__(self, maximum_position=128, reducer=None, **kwargs):
-        """Initializes the position encoder.
-        Args:
-          maximum_position: The maximum position to embed. Positions greater
-            than this value will be set to :obj:`maximum_position`.
-          reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position
-            encodings. Defaults to :class:`opennmt.layers.SumReducer`.
-          **kwargs: Additional layer keyword arguments.
-        """
-        super(PositionEmbedder, self).__init__(reducer=reducer, **kwargs)
-        self.maximum_position = maximum_position
-        self.embedding = None
-
-    def build(self, input_shape):
-        shape = [self.maximum_position + 1, input_shape[-1]]
-        self.embedding = self.add_weight('position_embedding', shape)
-        super(PositionEmbedder, self).build(input_shape)
-
-    def _encode(self, positions, depth):
-        positions = tf.minimum(positions, self.maximum_position)
-        return tf.nn.embedding_lookup(self.embedding, positions)
-
-
-class SinusoidalPositionEncoder(PositionEncoder):
-    """Encodes positions with sine waves as described in
-    https://arxiv.org/abs/1706.03762.
-    """
-
-    def _encode(self, positions, depth):
-        if depth % 2 != 0:
-            raise ValueError(
-                'SinusoidalPositionEncoder expects the depth to be divisble '
-                'by 2 but got %d' % depth)
-
-        batch_size = tf.shape(positions)[0]
-        positions = tf.cast(positions, tf.float32)
-
-        log_timescale_increment = math.log(10000) / (depth / 2 - 1)
-        inv_timescales = tf.exp(
-            tf.range(depth / 2, dtype=tf.float32) * -log_timescale_increment)
-        inv_timescales = tf.reshape(
-            tf.tile(inv_timescales, [batch_size]), [batch_size, depth // 2])
-        scaled_time = tf.expand_dims(positions, -1) * tf.expand_dims(
-            inv_timescales, 1)
-        encoding = tf.concat(
-            [tf.sin(scaled_time), tf.cos(scaled_time)], axis=2)
-        return tf.cast(encoding, self.dtype)
-
-
-class SinusodalPositionalEncoding(tf.keras.layers.Layer):
-
-    def __init__(self, name='SinusodalPositionalEncoding'):
-        super(SinusodalPositionalEncoding, self).__init__(name=name)
-
-    @staticmethod
-    def positional_encoding(len, dim, step=1.):
-        """
-        :param len: int scalar
-        :param dim: int scalar
-        :param step:
-        :return: position embedding
-        """
-        pos_mat = tf.tile(
-            tf.expand_dims(
-                tf.range(0, tf.cast(len, dtype=tf.float32), dtype=tf.float32)
-                * step,
-                axis=-1), [1, dim])
-        dim_mat = tf.tile(
-            tf.expand_dims(
-                tf.range(0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32),
-                axis=0), [len, 1])
-        dim_mat_int = tf.cast(dim_mat, dtype=tf.int32)
-        pos_encoding = tf.where(  # [time, dims]
-            tf.math.equal(tf.math.mod(dim_mat_int, 2), 0),
-            x=tf.math.sin(
-                pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))),
-            y=tf.math.cos(pos_mat
-                          / tf.pow(10000.,
-                                   (dim_mat - 1) / tf.cast(dim, tf.float32))))
-        return pos_encoding
-
-
-class BatchSinusodalPositionalEncoding(tf.keras.layers.Layer):
-
-    def __init__(self, name='BatchSinusodalPositionalEncoding'):
-        super(BatchSinusodalPositionalEncoding, self).__init__(name=name)
-
-    @staticmethod
-    def positional_encoding(batch_size, len, dim, pos_mat, step=1.):
-        """
-        :param len: int scalar
-        :param dim: int scalar
-        :param step:
-        :param pos_mat: [B, len] = [len, 1] * dim
-        :return: position embedding
-        """
-        pos_mat = tf.tile(
-            tf.expand_dims(tf.cast(pos_mat, dtype=tf.float32) * step, axis=-1),
-            [1, 1, dim])  # [B, len, dim]
-
-        dim_mat = tf.tile(
-            tf.expand_dims(
-                tf.expand_dims(
-                    tf.range(
-                        0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32),
-                    axis=0),
-                axis=0), [batch_size, len, 1])  # [B, len, dim]
-
-        dim_mat_int = tf.cast(dim_mat, dtype=tf.int32)
-        pos_encoding = tf.where(  # [B, time, dims]
-            tf.math.equal(tf.mod(dim_mat_int, 2), 0),
-            x=tf.math.sin(
-                pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))),
-            y=tf.math.cos(pos_mat
-                          / tf.pow(10000.,
-                                   (dim_mat - 1) / tf.cast(dim, tf.float32))))
-        return pos_encoding
diff --git a/modelscope/models/audio/tts/models/reducer.py b/modelscope/models/audio/tts/models/reducer.py
deleted file mode 100755
index a4c9ae17..00000000
--- a/modelscope/models/audio/tts/models/reducer.py
+++ /dev/null
@@ -1,155 +0,0 @@
-"""Define reducers: objects that merge inputs."""
-
-import abc
-import functools
-
-import tensorflow as tf
-
-
-def pad_in_time(x, padding_length):
-    """Helper function to pad a tensor in the time dimension and retain the static depth dimension."""
-    return tf.pad(x, [[0, 0], [0, padding_length], [0, 0]])
-
-
-def align_in_time(x, length):
-    """Aligns the time dimension of :obj:`x` with :obj:`length`."""
-    time_dim = tf.shape(x)[1]
-    return tf.cond(
-        tf.less(time_dim, length),
-        true_fn=lambda: pad_in_time(x, length - time_dim),
-        false_fn=lambda: x[:, :length])
-
-
-def pad_with_identity(x,
-                      sequence_length,
-                      max_sequence_length,
-                      identity_values=0,
-                      maxlen=None):
-    """Pads a tensor with identity values up to :obj:`max_sequence_length`.
-    Args:
-      x: A ``tf.Tensor`` of shape ``[batch_size, time, depth]``.
-      sequence_length: The true sequence length of :obj:`x`.
-      max_sequence_length: The sequence length up to which the tensor must contain
-        :obj:`identity values`.
-      identity_values: The identity value.
-      maxlen: Size of the output time dimension. Default is the maximum value in
-        obj:`max_sequence_length`.
-    Returns:
-      A ``tf.Tensor`` of shape ``[batch_size, maxlen, depth]``.
-    """
-    if maxlen is None:
-        maxlen = tf.reduce_max(max_sequence_length)
-
-    mask = tf.sequence_mask(sequence_length, maxlen=maxlen, dtype=x.dtype)
-    mask = tf.expand_dims(mask, axis=-1)
-    mask_combined = tf.sequence_mask(
-        max_sequence_length, maxlen=maxlen, dtype=x.dtype)
-    mask_combined = tf.expand_dims(mask_combined, axis=-1)
-
-    identity_mask = mask_combined * (1.0 - mask)
-
-    x = pad_in_time(x, maxlen - tf.shape(x)[1])
-    x = x * mask + (identity_mask * identity_values)
-
-    return x
-
-
-def pad_n_with_identity(inputs, sequence_lengths, identity_values=0):
-    """Pads each input tensors with identity values up to
-    ``max(sequence_lengths)`` for each batch.
-    Args:
-      inputs: A list of ``tf.Tensor``.
-      sequence_lengths: A list of sequence length.
-      identity_values: The identity value.
-    Returns:
-      A tuple ``(padded, max_sequence_length)`` which are respectively a list of
-      ``tf.Tensor`` where each tensor are padded with identity and the combined
-      sequence length.
-    """
-    max_sequence_length = tf.reduce_max(sequence_lengths, axis=0)
-    maxlen = tf.reduce_max([tf.shape(x)[1] for x in inputs])
-    padded = [
-        pad_with_identity(
-            x,
-            length,
-            max_sequence_length,
-            identity_values=identity_values,
-            maxlen=maxlen) for x, length in zip(inputs, sequence_lengths)
-    ]
-    return padded, max_sequence_length
-
-
-class Reducer(tf.keras.layers.Layer):
-    """Base class for reducers."""
-
-    def zip_and_reduce(self, x, y):
-        """Zips the :obj:`x` with :obj:`y` structures together and reduces all
-        elements. If the structures are nested, they will be flattened first.
-        Args:
-          x: The first structure.
-          y: The second structure.
-        Returns:
-          The same structure as :obj:`x` and :obj:`y` where each element from
-          :obj:`x` is reduced with the correspond element from :obj:`y`.
-        Raises:
-          ValueError: if the two structures are not the same.
-        """
-        tf.nest.assert_same_structure(x, y)
-        x_flat = tf.nest.flatten(x)
-        y_flat = tf.nest.flatten(y)
-        reduced = list(map(self, zip(x_flat, y_flat)))
-        return tf.nest.pack_sequence_as(x, reduced)
-
-    def call(self, inputs, sequence_length=None):  # pylint: disable=arguments-differ
-        """Reduces all input elements.
-        Args:
-          inputs: A list of ``tf.Tensor``.
-          sequence_length: The length of each input, if reducing sequences.
-        Returns:
-          If :obj:`sequence_length` is set, a tuple
-          ``(reduced_input, reduced_length)``, otherwise a reduced ``tf.Tensor``
-          only.
-        """
-        if sequence_length is None:
-            return self.reduce(inputs)
-        else:
-            return self.reduce_sequence(
-                inputs, sequence_lengths=sequence_length)
-
-    @abc.abstractmethod
-    def reduce(self, inputs):
-        """See :meth:`opennmt.layers.Reducer.__call__`."""
-        raise NotImplementedError()
-
-    @abc.abstractmethod
-    def reduce_sequence(self, inputs, sequence_lengths):
-        """See :meth:`opennmt.layers.Reducer.__call__`."""
-        raise NotImplementedError()
-
-
-class SumReducer(Reducer):
-    """A reducer that sums the inputs."""
-
-    def reduce(self, inputs):
-        if len(inputs) == 1:
-            return inputs[0]
-        if len(inputs) == 2:
-            return inputs[0] + inputs[1]
-        return tf.add_n(inputs)
-
-    def reduce_sequence(self, inputs, sequence_lengths):
-        padded, combined_length = pad_n_with_identity(
-            inputs, sequence_lengths, identity_values=0)
-        return self.reduce(padded), combined_length
-
-
-class MultiplyReducer(Reducer):
-    """A reducer that multiplies the inputs."""
-
-    def reduce(self, inputs):
-        return functools.reduce(lambda a, x: a * x, inputs)
-
-    def reduce_sequence(self, inputs, sequence_lengths):
-        padded, combined_length = pad_n_with_identity(
-            inputs, sequence_lengths, identity_values=1)
-        return self.reduce(padded), combined_length
diff --git a/modelscope/models/audio/tts/models/rnn_wrappers.py b/modelscope/models/audio/tts/models/rnn_wrappers.py
deleted file mode 100755
index 6c487bab..00000000
--- a/modelscope/models/audio/tts/models/rnn_wrappers.py
+++ /dev/null
@@ -1,237 +0,0 @@
-import tensorflow as tf
-from tensorflow.python.ops import rnn_cell_impl
-
-from .am_models import prenet
-
-
-class VarPredictorCell(tf.contrib.rnn.RNNCell):
-    """Wrapper wrapper knock knock."""
-
-    def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
-        super(VarPredictorCell, self).__init__()
-        self._var_predictor_cell = var_predictor_cell
-        self._is_training = is_training
-        self._dim = dim
-        self._prenet_units = prenet_units
-
-    @property
-    def state_size(self):
-        return tuple([self.output_size, self._var_predictor_cell.state_size])
-
-    @property
-    def output_size(self):
-        return self._dim
-
-    def zero_state(self, batch_size, dtype):
-        return tuple([
-            rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
-                                              dtype),
-            self._var_predictor_cell.zero_state(batch_size, dtype)
-        ])
-
-    def call(self, inputs, state):
-        """Run the Tacotron2 super decoder cell."""
-        super_cell_out, decoder_state = state
-
-        # split
-        prenet_input = inputs[:, 0:self._dim]
-        encoder_output = inputs[:, self._dim:]
-
-        # prenet and concat
-        prenet_output = prenet(
-            prenet_input,
-            self._prenet_units,
-            self._is_training,
-            scope='var_prenet')
-        decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)
-
-        # decoder LSTM/GRU
-        new_super_cell_out, new_decoder_state = self._var_predictor_cell(
-            decoder_input, decoder_state)
-
-        # projection
-        new_super_cell_out = tf.layers.dense(
-            new_super_cell_out, units=self._dim)
-
-        new_states = tuple([new_super_cell_out, new_decoder_state])
-
-        return new_super_cell_out, new_states
-
-
-class DurPredictorCell(tf.contrib.rnn.RNNCell):
-    """Wrapper wrapper knock knock."""
-
-    def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
-        super(DurPredictorCell, self).__init__()
-        self._var_predictor_cell = var_predictor_cell
-        self._is_training = is_training
-        self._dim = dim
-        self._prenet_units = prenet_units
-
-    @property
-    def state_size(self):
-        return tuple([self.output_size, self._var_predictor_cell.state_size])
-
-    @property
-    def output_size(self):
-        return self._dim
-
-    def zero_state(self, batch_size, dtype):
-        return tuple([
-            rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
-                                              dtype),
-            self._var_predictor_cell.zero_state(batch_size, dtype)
-        ])
-
-    def call(self, inputs, state):
-        """Run the Tacotron2 super decoder cell."""
-        super_cell_out, decoder_state = state
-
-        # split
-        prenet_input = inputs[:, 0:self._dim]
-        encoder_output = inputs[:, self._dim:]
-
-        # prenet and concat
-        prenet_output = prenet(
-            prenet_input,
-            self._prenet_units,
-            self._is_training,
-            scope='dur_prenet')
-        decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)
-
-        # decoder LSTM/GRU
-        new_super_cell_out, new_decoder_state = self._var_predictor_cell(
-            decoder_input, decoder_state)
-
-        # projection
-        new_super_cell_out = tf.layers.dense(
-            new_super_cell_out, units=self._dim)
-        new_super_cell_out = tf.nn.relu(new_super_cell_out)
-        #    new_super_cell_out = tf.log(tf.cast(tf.round(tf.exp(new_super_cell_out) - 1), tf.float32) + 1)
-
-        new_states = tuple([new_super_cell_out, new_decoder_state])
-
-        return new_super_cell_out, new_states
-
-
-class DurPredictorCECell(tf.contrib.rnn.RNNCell):
-    """Wrapper wrapper knock knock."""
-
-    def __init__(self, var_predictor_cell, is_training, dim, prenet_units,
-                 max_dur, dur_embedding_dim):
-        super(DurPredictorCECell, self).__init__()
-        self._var_predictor_cell = var_predictor_cell
-        self._is_training = is_training
-        self._dim = dim
-        self._prenet_units = prenet_units
-        self._max_dur = max_dur
-        self._dur_embedding_dim = dur_embedding_dim
-
-    @property
-    def state_size(self):
-        return tuple([self.output_size, self._var_predictor_cell.state_size])
-
-    @property
-    def output_size(self):
-        return self._max_dur
-
-    def zero_state(self, batch_size, dtype):
-        return tuple([
-            rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
-                                              dtype),
-            self._var_predictor_cell.zero_state(batch_size, dtype)
-        ])
-
-    def call(self, inputs, state):
-        """Run the Tacotron2 super decoder cell."""
-        super_cell_out, decoder_state = state
-
-        # split
-        prenet_input = tf.squeeze(
-            tf.cast(inputs[:, 0:self._dim], tf.int32), axis=-1)  # [N]
-        prenet_input = tf.one_hot(
-            prenet_input, self._max_dur, on_value=1.0, off_value=0.0,
-            axis=-1)  # [N, 120]
-        prenet_input = tf.layers.dense(
-            prenet_input, units=self._dur_embedding_dim)
-        encoder_output = inputs[:, self._dim:]
-
-        # prenet and concat
-        prenet_output = prenet(
-            prenet_input,
-            self._prenet_units,
-            self._is_training,
-            scope='dur_prenet')
-        decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)
-
-        # decoder LSTM/GRU
-        new_super_cell_out, new_decoder_state = self._var_predictor_cell(
-            decoder_input, decoder_state)
-
-        # projection
-        new_super_cell_out = tf.layers.dense(
-            new_super_cell_out, units=self._max_dur)  # [N, 120]
-        new_super_cell_out = tf.nn.softmax(new_super_cell_out)  # [N, 120]
-
-        new_states = tuple([new_super_cell_out, new_decoder_state])
-
-        return new_super_cell_out, new_states
-
-
-class VarPredictorCell2(tf.contrib.rnn.RNNCell):
-    """Wrapper wrapper knock knock."""
-
-    def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
-        super(VarPredictorCell2, self).__init__()
-        self._var_predictor_cell = var_predictor_cell
-        self._is_training = is_training
-        self._dim = dim
-        self._prenet_units = prenet_units
-
-    @property
-    def state_size(self):
-        return tuple([self.output_size, self._var_predictor_cell.state_size])
-
-    @property
-    def output_size(self):
-        return self._dim
-
-    def zero_state(self, batch_size, dtype):
-        return tuple([
-            rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
-                                              dtype),
-            self._var_predictor_cell.zero_state(batch_size, dtype)
-        ])
-
-    def call(self, inputs, state):
-        '''Run the Tacotron2 super decoder cell.'''
-        super_cell_out, decoder_state = state
-
-        # split
-        prenet_input = inputs[:, 0:self._dim]
-        encoder_output = inputs[:, self._dim:]
-
-        # prenet and concat
-        prenet_output = prenet(
-            prenet_input,
-            self._prenet_units,
-            self._is_training,
-            scope='var_prenet')
-        decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)
-
-        # decoder LSTM/GRU
-        new_super_cell_out, new_decoder_state = self._var_predictor_cell(
-            decoder_input, decoder_state)
-
-        # projection
-        new_super_cell_out = tf.layers.dense(
-            new_super_cell_out, units=self._dim)
-
-        # split and relu
-        new_super_cell_out = tf.concat([
-            tf.nn.relu(new_super_cell_out[:, 0:1]), new_super_cell_out[:, 1:]
-        ], axis=-1)  # yapf:disable
-
-        new_states = tuple([new_super_cell_out, new_decoder_state])
-
-        return new_super_cell_out, new_states
diff --git a/modelscope/models/audio/tts/models/robutrans.py b/modelscope/models/audio/tts/models/robutrans.py
deleted file mode 100755
index ab9fdfcc..00000000
--- a/modelscope/models/audio/tts/models/robutrans.py
+++ /dev/null
@@ -1,760 +0,0 @@
-import tensorflow as tf
-from tensorflow.python.ops.ragged.ragged_util import repeat
-
-from .fsmn_encoder import FsmnEncoderV2
-from .position import BatchSinusodalPositionalEncoding
-from .self_attention_decoder import SelfAttentionDecoder
-from .self_attention_encoder import SelfAttentionEncoder
-
-
-class RobuTrans():
-
-    def __init__(self, hparams):
-        self._hparams = hparams
-
-    def initialize(self,
-                   inputs,
-                   inputs_emotion,
-                   inputs_speaker,
-                   input_lengths,
-                   output_lengths=None,
-                   mel_targets=None,
-                   durations=None,
-                   pitch_contours=None,
-                   uv_masks=None,
-                   pitch_scales=None,
-                   duration_scales=None,
-                   energy_contours=None,
-                   energy_scales=None):
-        """Initializes the model for inference.
-
-        Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields.
-
-        Args:
-          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
-            steps in the input time series, and values are character IDs
-          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
-            of each sequence in inputs.
-          output_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
-            of each sequence in outputs.
-          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
-            of steps in the output time series, M is num_mels, and values are entries in the mel
-            spectrogram. Only needed for training.
-        """
-        from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell
-        from tensorflow.contrib.seq2seq import BasicDecoder
-
-        with tf.variable_scope('inference') as _:
-            is_training = mel_targets is not None
-            batch_size = tf.shape(inputs)[0]
-            hp = self._hparams
-
-            input_mask = None
-            if input_lengths is not None and is_training:
-                input_mask = tf.sequence_mask(
-                    input_lengths, tf.shape(inputs)[1], dtype=tf.float32)
-
-            if input_mask is not None:
-                inputs = inputs * tf.expand_dims(input_mask, -1)
-
-            # speaker embedding
-            embedded_inputs_speaker = tf.layers.dense(
-                inputs_speaker,
-                32,
-                activation=None,
-                use_bias=False,
-                kernel_initializer=tf.truncated_normal_initializer(stddev=0.5))
-
-            # emotion embedding
-            embedded_inputs_emotion = tf.layers.dense(
-                inputs_emotion,
-                32,
-                activation=None,
-                use_bias=False,
-                kernel_initializer=tf.truncated_normal_initializer(stddev=0.5))
-
-            # symbol embedding
-            with tf.variable_scope('Embedding'):
-                embedded_inputs = tf.layers.dense(
-                    inputs,
-                    hp.embedding_dim,
-                    activation=None,
-                    use_bias=False,
-                    kernel_initializer=tf.truncated_normal_initializer(
-                        stddev=0.5))
-
-            # Encoder
-            with tf.variable_scope('Encoder'):
-                Encoder = SelfAttentionEncoder(
-                    num_layers=hp.encoder_num_layers,
-                    num_units=hp.encoder_num_units,
-                    num_heads=hp.encoder_num_heads,
-                    ffn_inner_dim=hp.encoder_ffn_inner_dim,
-                    dropout=hp.encoder_dropout,
-                    attention_dropout=hp.encoder_attention_dropout,
-                    relu_dropout=hp.encoder_relu_dropout)
-                encoder_outputs, state_mo, sequence_length_mo, attns = Encoder.encode(
-                    embedded_inputs,
-                    sequence_length=input_lengths,
-                    mode=is_training)
-                encoder_outputs = tf.layers.dense(
-                    encoder_outputs,
-                    hp.encoder_projection_units,
-                    activation=None,
-                    use_bias=False,
-                    kernel_initializer=tf.truncated_normal_initializer(
-                        stddev=0.5))
-
-            # pitch and energy
-            var_inputs = tf.concat([
-                encoder_outputs, embedded_inputs_speaker,
-                embedded_inputs_emotion
-            ], 2)
-            if input_mask is not None:
-                var_inputs = var_inputs * tf.expand_dims(input_mask, -1)
-
-            with tf.variable_scope('Pitch_Predictor'):
-                Pitch_Predictor_FSMN = FsmnEncoderV2(
-                    filter_size=hp.predictor_filter_size,
-                    fsmn_num_layers=hp.predictor_fsmn_num_layers,
-                    dnn_num_layers=hp.predictor_dnn_num_layers,
-                    num_memory_units=hp.predictor_num_memory_units,
-                    ffn_inner_dim=hp.predictor_ffn_inner_dim,
-                    dropout=hp.predictor_dropout,
-                    shift=hp.predictor_shift,
-                    position_encoder=None)
-                pitch_contour_outputs, _, _ = Pitch_Predictor_FSMN.encode(
-                    tf.concat([
-                        encoder_outputs, embedded_inputs_speaker,
-                        embedded_inputs_emotion
-                    ], 2),
-                    sequence_length=input_lengths,
-                    mode=is_training)
-                pitch_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
-                    LSTMBlockCell(hp.predictor_lstm_units),
-                    LSTMBlockCell(hp.predictor_lstm_units),
-                    pitch_contour_outputs,
-                    sequence_length=input_lengths,
-                    dtype=tf.float32)
-                pitch_contour_outputs = tf.concat(
-                    pitch_contour_outputs, axis=-1)
-                pitch_contour_outputs = tf.layers.dense(
-                    pitch_contour_outputs, units=1)  # [N, T_in, 1]
-                pitch_contour_outputs = tf.squeeze(
-                    pitch_contour_outputs, axis=2)  # [N, T_in]
-
-            with tf.variable_scope('Energy_Predictor'):
-                Energy_Predictor_FSMN = FsmnEncoderV2(
-                    filter_size=hp.predictor_filter_size,
-                    fsmn_num_layers=hp.predictor_fsmn_num_layers,
-                    dnn_num_layers=hp.predictor_dnn_num_layers,
-                    num_memory_units=hp.predictor_num_memory_units,
-                    ffn_inner_dim=hp.predictor_ffn_inner_dim,
-                    dropout=hp.predictor_dropout,
-                    shift=hp.predictor_shift,
-                    position_encoder=None)
-                energy_contour_outputs, _, _ = Energy_Predictor_FSMN.encode(
-                    tf.concat([
-                        encoder_outputs, embedded_inputs_speaker,
-                        embedded_inputs_emotion
-                    ], 2),
-                    sequence_length=input_lengths,
-                    mode=is_training)
-                energy_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
-                    LSTMBlockCell(hp.predictor_lstm_units),
-                    LSTMBlockCell(hp.predictor_lstm_units),
-                    energy_contour_outputs,
-                    sequence_length=input_lengths,
-                    dtype=tf.float32)
-                energy_contour_outputs = tf.concat(
-                    energy_contour_outputs, axis=-1)
-                energy_contour_outputs = tf.layers.dense(
-                    energy_contour_outputs, units=1)  # [N, T_in, 1]
-                energy_contour_outputs = tf.squeeze(
-                    energy_contour_outputs, axis=2)  # [N, T_in]
-
-            if is_training:
-                pitch_embeddings = tf.expand_dims(
-                    pitch_contours, axis=2)  # [N, T_in, 1]
-                pitch_embeddings = tf.layers.conv1d(
-                    pitch_embeddings,
-                    filters=hp.encoder_projection_units,
-                    kernel_size=9,
-                    padding='same',
-                    name='pitch_embeddings')  # [N, T_in, 32]
-
-                energy_embeddings = tf.expand_dims(
-                    energy_contours, axis=2)  # [N, T_in, 1]
-                energy_embeddings = tf.layers.conv1d(
-                    energy_embeddings,
-                    filters=hp.encoder_projection_units,
-                    kernel_size=9,
-                    padding='same',
-                    name='energy_embeddings')  # [N, T_in, 32]
-            else:
-                pitch_contour_outputs *= pitch_scales
-                pitch_embeddings = tf.expand_dims(
-                    pitch_contour_outputs, axis=2)  # [N, T_in, 1]
-                pitch_embeddings = tf.layers.conv1d(
-                    pitch_embeddings,
-                    filters=hp.encoder_projection_units,
-                    kernel_size=9,
-                    padding='same',
-                    name='pitch_embeddings')  # [N, T_in, 32]
-
-                energy_contour_outputs *= energy_scales
-                energy_embeddings = tf.expand_dims(
-                    energy_contour_outputs, axis=2)  # [N, T_in, 1]
-                energy_embeddings = tf.layers.conv1d(
-                    energy_embeddings,
-                    filters=hp.encoder_projection_units,
-                    kernel_size=9,
-                    padding='same',
-                    name='energy_embeddings')  # [N, T_in, 32]
-
-            encoder_outputs_ = encoder_outputs + pitch_embeddings + energy_embeddings
-
-            # duration
-            dur_inputs = tf.concat([
-                encoder_outputs_, embedded_inputs_speaker,
-                embedded_inputs_emotion
-            ], 2)
-            if input_mask is not None:
-                dur_inputs = dur_inputs * tf.expand_dims(input_mask, -1)
-            with tf.variable_scope('Duration_Predictor'):
-                duration_predictor_cell = MultiRNNCell([
-                    LSTMBlockCell(hp.predictor_lstm_units),
-                    LSTMBlockCell(hp.predictor_lstm_units)
-                ], state_is_tuple=True)  # yapf:disable
-                from .rnn_wrappers import DurPredictorCell
-                duration_output_cell = DurPredictorCell(
-                    duration_predictor_cell, is_training, 1,
-                    hp.predictor_prenet_units)
-                duration_predictor_init_state = duration_output_cell.zero_state(
-                    batch_size=batch_size, dtype=tf.float32)
-                if is_training:
-                    from .helpers import VarTrainingHelper
-                    duration_helper = VarTrainingHelper(
-                        tf.expand_dims(
-                            tf.log(tf.cast(durations, tf.float32) + 1),
-                            axis=2), dur_inputs, 1)
-                else:
-                    from .helpers import VarTestHelper
-                    duration_helper = VarTestHelper(batch_size, dur_inputs, 1)
-                (
-                    duration_outputs, _
-                ), final_duration_predictor_state, _ = tf.contrib.seq2seq.dynamic_decode(
-                    BasicDecoder(duration_output_cell, duration_helper,
-                                 duration_predictor_init_state),
-                    maximum_iterations=1000)
-                duration_outputs = tf.squeeze(
-                    duration_outputs, axis=2)  # [N, T_in]
-                if input_mask is not None:
-                    duration_outputs = duration_outputs * input_mask
-                duration_outputs_ = tf.exp(duration_outputs) - 1
-
-            # Length Regulator
-            with tf.variable_scope('Length_Regulator'):
-                if is_training:
-                    i = tf.constant(1)
-                    # position embedding
-                    j = tf.constant(1)
-                    dur_len = tf.shape(durations)[-1]
-                    embedded_position_i = tf.range(1, durations[0, 0] + 1)
-
-                    def condition_pos(j, e):
-                        return tf.less(j, dur_len)
-
-                    def loop_body_pos(j, embedded_position_i):
-                        embedded_position_i = tf.concat([
-                            embedded_position_i,
-                            tf.range(1, durations[0, j] + 1)
-                        ], axis=0)  # yapf:disable
-                        return [j + 1, embedded_position_i]
-
-                    j, embedded_position_i = tf.while_loop(
-                        condition_pos,
-                        loop_body_pos, [j, embedded_position_i],
-                        shape_invariants=[
-                            j.get_shape(),
-                            tf.TensorShape([None])
-                        ])
-                    embedded_position = tf.reshape(embedded_position_i,
-                                                   (1, -1))
-
-                    # others
-                    LR_outputs = repeat(
-                        encoder_outputs_[0:1, :, :], durations[0, :], axis=1)
-                    embedded_outputs_speaker = repeat(
-                        embedded_inputs_speaker[0:1, :, :],
-                        durations[0, :],
-                        axis=1)
-                    embedded_outputs_emotion = repeat(
-                        embedded_inputs_emotion[0:1, :, :],
-                        durations[0, :],
-                        axis=1)
-
-                    def condition(i, pos, layer, s, e):
-                        return tf.less(i, tf.shape(mel_targets)[0])
-
-                    def loop_body(i, embedded_position, LR_outputs,
-                                  embedded_outputs_speaker,
-                                  embedded_outputs_emotion):
-                        # position embedding
-                        jj = tf.constant(1)
-                        embedded_position_i = tf.range(1, durations[i, 0] + 1)
-
-                        def condition_pos_i(j, e):
-                            return tf.less(j, dur_len)
-
-                        def loop_body_pos_i(j, embedded_position_i):
-                            embedded_position_i = tf.concat([
-                                embedded_position_i,
-                                tf.range(1, durations[i, j] + 1)
-                            ], axis=0)  # yapf:disable
-                            return [j + 1, embedded_position_i]
-
-                        jj, embedded_position_i = tf.while_loop(
-                            condition_pos_i,
-                            loop_body_pos_i, [jj, embedded_position_i],
-                            shape_invariants=[
-                                jj.get_shape(),
-                                tf.TensorShape([None])
-                            ])
-                        embedded_position = tf.concat([
-                            embedded_position,
-                            tf.reshape(embedded_position_i, (1, -1))
-                        ], 0)
-
-                        # others
-                        LR_outputs = tf.concat([
-                            LR_outputs,
-                            repeat(
-                                encoder_outputs_[i:i + 1, :, :],
-                                durations[i, :],
-                                axis=1)
-                        ], 0)
-                        embedded_outputs_speaker = tf.concat([
-                            embedded_outputs_speaker,
-                            repeat(
-                                embedded_inputs_speaker[i:i + 1, :, :],
-                                durations[i, :],
-                                axis=1)
-                        ], 0)
-                        embedded_outputs_emotion = tf.concat([
-                            embedded_outputs_emotion,
-                            repeat(
-                                embedded_inputs_emotion[i:i + 1, :, :],
-                                durations[i, :],
-                                axis=1)
-                        ], 0)
-                        return [
-                            i + 1, embedded_position, LR_outputs,
-                            embedded_outputs_speaker, embedded_outputs_emotion
-                        ]
-
-                    i, embedded_position, LR_outputs,
-                    embedded_outputs_speaker,
-                    embedded_outputs_emotion = tf.while_loop(
-                        condition,
-                        loop_body, [
-                            i, embedded_position, LR_outputs,
-                            embedded_outputs_speaker, embedded_outputs_emotion
-                        ],
-                        shape_invariants=[
-                            i.get_shape(),
-                            tf.TensorShape([None, None]),
-                            tf.TensorShape([None, None, None]),
-                            tf.TensorShape([None, None, None]),
-                            tf.TensorShape([None, None, None])
-                        ],
-                        parallel_iterations=hp.batch_size)
-
-                    ori_framenum = tf.shape(mel_targets)[1]
-                else:
-                    # position
-                    j = tf.constant(1)
-                    dur_len = tf.shape(duration_outputs_)[-1]
-                    embedded_position_i = tf.range(
-                        1,
-                        tf.cast(tf.round(duration_outputs_)[0, 0], tf.int32)
-                        + 1)
-
-                    def condition_pos(j, e):
-                        return tf.less(j, dur_len)
-
-                    def loop_body_pos(j, embedded_position_i):
-                        embedded_position_i = tf.concat([
-                            embedded_position_i,
-                            tf.range(
-                                1,
-                                tf.cast(
-                                    tf.round(duration_outputs_)[0, j],
-                                    tf.int32) + 1)
-                        ], axis=0)  # yapf:disable
-                        return [j + 1, embedded_position_i]
-
-                    j, embedded_position_i = tf.while_loop(
-                        condition_pos,
-                        loop_body_pos, [j, embedded_position_i],
-                        shape_invariants=[
-                            j.get_shape(),
-                            tf.TensorShape([None])
-                        ])
-                    embedded_position = tf.reshape(embedded_position_i,
-                                                   (1, -1))
-                    # others
-                    duration_outputs_ *= duration_scales
-                    LR_outputs = repeat(
-                        encoder_outputs_[0:1, :, :],
-                        tf.cast(tf.round(duration_outputs_)[0, :], tf.int32),
-                        axis=1)
-                    embedded_outputs_speaker = repeat(
-                        embedded_inputs_speaker[0:1, :, :],
-                        tf.cast(tf.round(duration_outputs_)[0, :], tf.int32),
-                        axis=1)
-                    embedded_outputs_emotion = repeat(
-                        embedded_inputs_emotion[0:1, :, :],
-                        tf.cast(tf.round(duration_outputs_)[0, :], tf.int32),
-                        axis=1)
-                    ori_framenum = tf.shape(LR_outputs)[1]
-
-                    left = hp.outputs_per_step - tf.mod(
-                        ori_framenum, hp.outputs_per_step)
-                    LR_outputs = tf.cond(
-                        tf.equal(left,
-                                 hp.outputs_per_step), lambda: LR_outputs,
-                        lambda: tf.pad(LR_outputs, [[0, 0], [0, left], [0, 0]],
-                                       'CONSTANT'))
-                    embedded_outputs_speaker = tf.cond(
-                        tf.equal(left, hp.outputs_per_step),
-                        lambda: embedded_outputs_speaker, lambda: tf.pad(
-                            embedded_outputs_speaker, [[0, 0], [0, left],
-                                                       [0, 0]], 'CONSTANT'))
-                    embedded_outputs_emotion = tf.cond(
-                        tf.equal(left, hp.outputs_per_step),
-                        lambda: embedded_outputs_emotion, lambda: tf.pad(
-                            embedded_outputs_emotion, [[0, 0], [0, left],
-                                                       [0, 0]], 'CONSTANT'))
-                    embedded_position = tf.cond(
-                        tf.equal(left, hp.outputs_per_step),
-                        lambda: embedded_position,
-                        lambda: tf.pad(embedded_position, [[0, 0], [0, left]],
-                                       'CONSTANT'))
-
-            # Pos_Embedding
-            with tf.variable_scope('Position_Embedding'):
-                Pos_Embedding = BatchSinusodalPositionalEncoding()
-                position_embeddings = Pos_Embedding.positional_encoding(
-                    batch_size,
-                    tf.shape(LR_outputs)[1], hp.encoder_projection_units,
-                    embedded_position)
-            LR_outputs += position_embeddings
-
-            # multi-frame
-            LR_outputs = tf.reshape(LR_outputs, [
-                batch_size, -1,
-                hp.outputs_per_step * hp.encoder_projection_units
-            ])
-            embedded_outputs_speaker = tf.reshape(
-                embedded_outputs_speaker,
-                [batch_size, -1, hp.outputs_per_step * 32])[:, :, :32]
-            embedded_outputs_emotion = tf.reshape(
-                embedded_outputs_emotion,
-                [batch_size, -1, hp.outputs_per_step * 32])[:, :, :32]
-            # [N, T_out, D_LR_outputs] (D_LR_outputs = hp.outputs_per_step * hp.encoder_projection_units + 64)
-            LR_outputs = tf.concat([
-                LR_outputs, embedded_outputs_speaker, embedded_outputs_emotion
-            ], -1)
-
-            # auto bandwidth
-            if is_training:
-                durations_mask = tf.cast(durations,
-                                         tf.float32) * input_mask  # [N, T_in]
-            else:
-                durations_mask = duration_outputs_
-            X_band_width = tf.cast(
-                tf.round(tf.reduce_max(durations_mask) / hp.outputs_per_step),
-                tf.int32)
-            H_band_width = X_band_width
-
-            with tf.variable_scope('Decoder'):
-                Decoder = SelfAttentionDecoder(
-                    num_layers=hp.decoder_num_layers,
-                    num_units=hp.decoder_num_units,
-                    num_heads=hp.decoder_num_heads,
-                    ffn_inner_dim=hp.decoder_ffn_inner_dim,
-                    dropout=hp.decoder_dropout,
-                    attention_dropout=hp.decoder_attention_dropout,
-                    relu_dropout=hp.decoder_relu_dropout,
-                    prenet_units=hp.prenet_units,
-                    dense_units=hp.prenet_proj_units,
-                    num_mels=hp.num_mels,
-                    outputs_per_step=hp.outputs_per_step,
-                    X_band_width=X_band_width,
-                    H_band_width=H_band_width,
-                    position_encoder=None)
-                if is_training:
-                    if hp.free_run:
-                        r = hp.outputs_per_step
-                        init_decoder_input = tf.expand_dims(
-                            tf.tile([[0.0]], [batch_size, hp.num_mels]),
-                            axis=1)  # [N, 1, hp.num_mels]
-                        decoder_input_lengths = tf.cast(
-                            output_lengths / r, tf.int32)
-                        decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search(
-                            init_decoder_input,
-                            maximum_iterations=tf.shape(LR_outputs)[1],
-                            mode=is_training,
-                            memory=LR_outputs,
-                            memory_sequence_length=decoder_input_lengths)
-                    else:
-                        r = hp.outputs_per_step
-                        decoder_input = mel_targets[:, r - 1::
-                                                    r, :]  # [N, T_out / r, hp.num_mels]
-                        init_decoder_input = tf.expand_dims(
-                            tf.tile([[0.0]], [batch_size, hp.num_mels]),
-                            axis=1)  # [N, 1, hp.num_mels]
-                        decoder_input = tf.concat(
-                            [init_decoder_input, decoder_input],
-                            axis=1)  # [N, T_out / r + 1, hp.num_mels]
-                        decoder_input = decoder_input[:, :
-                                                      -1, :]  # [N, T_out / r, hp.num_mels]
-                        decoder_input_lengths = tf.cast(
-                            output_lengths / r, tf.int32)
-                        decoder_outputs, attention_x, attention_h = Decoder.decode_from_inputs(
-                            decoder_input,
-                            decoder_input_lengths,
-                            mode=is_training,
-                            memory=LR_outputs,
-                            memory_sequence_length=decoder_input_lengths)
-                else:
-                    init_decoder_input = tf.expand_dims(
-                        tf.tile([[0.0]], [batch_size, hp.num_mels]),
-                        axis=1)  # [N, 1, hp.num_mels]
-                    decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search(
-                        init_decoder_input,
-                        maximum_iterations=tf.shape(LR_outputs)[1],
-                        mode=is_training,
-                        memory=LR_outputs,
-                        memory_sequence_length=tf.expand_dims(
-                            tf.shape(LR_outputs)[1], axis=0))
-
-                if is_training:
-                    mel_outputs_ = tf.reshape(decoder_outputs,
-                                              [batch_size, -1, hp.num_mels])
-                else:
-                    mel_outputs_ = tf.reshape(
-                        decoder_outputs,
-                        [batch_size, -1, hp.num_mels])[:, :ori_framenum, :]
-                mel_outputs = mel_outputs_
-
-            with tf.variable_scope('Postnet'):
-                Postnet_FSMN = FsmnEncoderV2(
-                    filter_size=hp.postnet_filter_size,
-                    fsmn_num_layers=hp.postnet_fsmn_num_layers,
-                    dnn_num_layers=hp.postnet_dnn_num_layers,
-                    num_memory_units=hp.postnet_num_memory_units,
-                    ffn_inner_dim=hp.postnet_ffn_inner_dim,
-                    dropout=hp.postnet_dropout,
-                    shift=hp.postnet_shift,
-                    position_encoder=None)
-                if is_training:
-                    postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode(
-                        mel_outputs,
-                        sequence_length=output_lengths,
-                        mode=is_training)
-                    hidden_lstm_outputs, _ = tf.nn.dynamic_rnn(
-                        LSTMBlockCell(hp.postnet_lstm_units),
-                        postnet_fsmn_outputs,
-                        sequence_length=output_lengths,
-                        dtype=tf.float32)
-                else:
-                    postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode(
-                        mel_outputs,
-                        sequence_length=[tf.shape(mel_outputs_)[1]],
-                        mode=is_training)
-                    hidden_lstm_outputs, _ = tf.nn.dynamic_rnn(
-                        LSTMBlockCell(hp.postnet_lstm_units),
-                        postnet_fsmn_outputs,
-                        sequence_length=[tf.shape(mel_outputs_)[1]],
-                        dtype=tf.float32)
-
-            mel_residual_outputs = tf.layers.dense(
-                hidden_lstm_outputs, units=hp.num_mels)
-            mel_outputs += mel_residual_outputs
-
-            self.inputs = inputs
-            self.inputs_speaker = inputs_speaker
-            self.inputs_emotion = inputs_emotion
-            self.input_lengths = input_lengths
-            self.durations = durations
-            self.output_lengths = output_lengths
-            self.mel_outputs_ = mel_outputs_
-            self.mel_outputs = mel_outputs
-            self.mel_targets = mel_targets
-            self.duration_outputs = duration_outputs
-            self.duration_outputs_ = duration_outputs_
-            self.duration_scales = duration_scales
-            self.pitch_contour_outputs = pitch_contour_outputs
-            self.pitch_contours = pitch_contours
-            self.pitch_scales = pitch_scales
-            self.energy_contour_outputs = energy_contour_outputs
-            self.energy_contours = energy_contours
-            self.energy_scales = energy_scales
-            self.uv_masks_ = uv_masks
-
-            self.embedded_inputs_emotion = embedded_inputs_emotion
-            self.embedding_fsmn_outputs = embedded_inputs
-            self.encoder_outputs = encoder_outputs
-            self.encoder_outputs_ = encoder_outputs_
-            self.LR_outputs = LR_outputs
-            self.postnet_fsmn_outputs = postnet_fsmn_outputs
-
-            self.pitch_embeddings = pitch_embeddings
-            self.energy_embeddings = energy_embeddings
-
-            self.attns = attns
-            self.attention_x = attention_x
-            self.attention_h = attention_h
-            self.X_band_width = X_band_width
-            self.H_band_width = H_band_width
-
-    def add_loss(self):
-        '''Adds loss to the model. Sets "loss" field. initialize must have been called.'''
-        with tf.variable_scope('loss') as _:
-            hp = self._hparams
-            mask = tf.sequence_mask(
-                self.output_lengths,
-                tf.shape(self.mel_targets)[1],
-                dtype=tf.float32)
-            valid_outputs = tf.reduce_sum(mask)
-
-            mask_input = tf.sequence_mask(
-                self.input_lengths,
-                tf.shape(self.durations)[1],
-                dtype=tf.float32)
-            valid_inputs = tf.reduce_sum(mask_input)
-
-            # mel loss
-            if self.uv_masks_ is not None:
-                valid_outputs_mask = tf.reduce_sum(
-                    tf.expand_dims(mask, -1) * self.uv_masks_)
-                self.mel_loss_ = tf.reduce_sum(
-                    tf.abs(self.mel_targets - self.mel_outputs_)
-                    * tf.expand_dims(mask, -1) * self.uv_masks_) / (
-                        valid_outputs_mask * hp.num_mels)
-                self.mel_loss = tf.reduce_sum(
-                    tf.abs(self.mel_targets - self.mel_outputs)
-                    * tf.expand_dims(mask, -1) * self.uv_masks_) / (
-                        valid_outputs_mask * hp.num_mels)
-            else:
-                self.mel_loss_ = tf.reduce_sum(
-                    tf.abs(self.mel_targets - self.mel_outputs_)
-                    * tf.expand_dims(mask, -1)) / (
-                        valid_outputs * hp.num_mels)
-                self.mel_loss = tf.reduce_sum(
-                    tf.abs(self.mel_targets - self.mel_outputs)
-                    * tf.expand_dims(mask, -1)) / (
-                        valid_outputs * hp.num_mels)
-
-            # duration loss
-            self.duration_loss = tf.reduce_sum(
-                tf.abs(
-                    tf.log(tf.cast(self.durations, tf.float32) + 1)
-                    - self.duration_outputs) * mask_input) / valid_inputs
-
-            # pitch contour loss
-            self.pitch_contour_loss = tf.reduce_sum(
-                tf.abs(self.pitch_contours - self.pitch_contour_outputs)
-                * mask_input) / valid_inputs
-
-            # energy contour loss
-            self.energy_contour_loss = tf.reduce_sum(
-                tf.abs(self.energy_contours - self.energy_contour_outputs)
-                * mask_input) / valid_inputs
-
-            # final loss
-            self.loss = self.mel_loss_ + self.mel_loss + self.duration_loss \
-                + self.pitch_contour_loss + self.energy_contour_loss
-
-            # guided attention loss
-            self.guided_attention_loss = tf.constant(0.0)
-            if hp.guided_attention:
-                i0 = tf.constant(0)
-                loss0 = tf.constant(0.0)
-
-                def c(i, _):
-                    return tf.less(i, tf.shape(mel_targets)[0])
-
-                def loop_body(i, loss):
-                    decoder_input_lengths = tf.cast(
-                        self.output_lengths / hp.outputs_per_step, tf.int32)
-                    input_len = decoder_input_lengths[i]
-                    output_len = decoder_input_lengths[i]
-                    input_w = tf.expand_dims(
-                        tf.range(tf.cast(input_len, dtype=tf.float32)),
-                        axis=1) / tf.cast(
-                            input_len, dtype=tf.float32)  # [T_in, 1]
-                    output_w = tf.expand_dims(
-                        tf.range(tf.cast(output_len, dtype=tf.float32)),
-                        axis=0) / tf.cast(
-                            output_len, dtype=tf.float32)  # [1, T_out]
-                    guided_attention_w = 1.0 - tf.exp(
-                        -(1 / hp.guided_attention_2g_squared)
-                        * tf.square(input_w - output_w))  # [T_in, T_out]
-                    guided_attention_w = tf.expand_dims(
-                        guided_attention_w, axis=0)  # [1, T_in, T_out]
-                    # [hp.decoder_num_heads, T_in, T_out]
-                    guided_attention_w = tf.tile(guided_attention_w,
-                                                 [hp.decoder_num_heads, 1, 1])
-                    loss_i = tf.constant(0.0)
-                    for j in range(hp.decoder_num_layers):
-                        loss_i += tf.reduce_mean(
-                            self.attention_h[j][i, :, :input_len, :output_len]
-                            * guided_attention_w)
-
-                    return [tf.add(i, 1), tf.add(loss, loss_i)]
-
-                _, loss = tf.while_loop(
-                    c,
-                    loop_body,
-                    loop_vars=[i0, loss0],
-                    parallel_iterations=hp.batch_size)
-                self.guided_attention_loss = loss / hp.batch_size
-                self.loss += hp.guided_attention_loss_weight * self.guided_attention_loss
-
-    def add_optimizer(self, global_step):
-        '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called.
-
-        Args:
-          global_step: int32 scalar Tensor representing current global step in training
-        '''
-        with tf.variable_scope('optimizer') as _:
-            hp = self._hparams
-            if hp.decay_learning_rate:
-                self.learning_rate = _learning_rate_decay(
-                    hp.initial_learning_rate, global_step)
-            else:
-                self.learning_rate = tf.convert_to_tensor(
-                    hp.initial_learning_rate)
-            optimizer = tf.train.AdamOptimizer(self.learning_rate,
-                                               hp.adam_beta1, hp.adam_beta2)
-            gradients, variables = zip(*optimizer.compute_gradients(self.loss))
-            self.gradients = gradients
-            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
-
-            # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
-            # https://github.com/tensorflow/tensorflow/issues/1122
-            with tf.control_dependencies(
-                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
-                self.optimize = optimizer.apply_gradients(
-                    zip(clipped_gradients, variables), global_step=global_step)
-
-
-def _learning_rate_decay(init_lr, global_step):
-    # Noam scheme from tensor2tensor:
-    warmup_steps = 4000.0
-    step = tf.cast(global_step + 1, dtype=tf.float32)
-    return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5,
-                                                    step**-0.5)
diff --git a/modelscope/models/audio/tts/models/self_attention_decoder.py b/modelscope/models/audio/tts/models/self_attention_decoder.py
deleted file mode 100755
index 9cf3fcaa..00000000
--- a/modelscope/models/audio/tts/models/self_attention_decoder.py
+++ /dev/null
@@ -1,817 +0,0 @@
-"""Define self-attention decoder."""
-
-import sys
-
-import tensorflow as tf
-
-from . import compat, transformer
-from .am_models import decoder_prenet
-from .position import SinusoidalPositionEncoder
-
-
-class SelfAttentionDecoder():
-    """Decoder using self-attention as described in
-    https://arxiv.org/abs/1706.03762.
-    """
-
-    def __init__(self,
-                 num_layers,
-                 num_units=512,
-                 num_heads=8,
-                 ffn_inner_dim=2048,
-                 dropout=0.1,
-                 attention_dropout=0.1,
-                 relu_dropout=0.1,
-                 prenet_units=256,
-                 dense_units=128,
-                 num_mels=80,
-                 outputs_per_step=3,
-                 X_band_width=None,
-                 H_band_width=None,
-                 position_encoder=SinusoidalPositionEncoder(),
-                 self_attention_type='scaled_dot'):
-        """Initializes the parameters of the decoder.
-
-        Args:
-          num_layers: The number of layers.
-          num_units: The number of hidden units.
-          num_heads: The number of heads in the multi-head attention.
-          ffn_inner_dim: The number of units of the inner linear transformation
-            in the feed forward layer.
-          dropout: The probability to drop units from the outputs.
-          attention_dropout: The probability to drop units from the attention.
-          relu_dropout: The probability to drop units from the ReLU activation in
-            the feed forward layer.
-          position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to
-            apply on inputs or ``None``.
-          self_attention_type: Type of self attention, "scaled_dot" or "average" (case
-            insensitive).
-
-        Raises:
-          ValueError: if :obj:`self_attention_type` is invalid.
-        """
-        super(SelfAttentionDecoder, self).__init__()
-        self.num_layers = num_layers
-        self.num_units = num_units
-        self.num_heads = num_heads
-        self.ffn_inner_dim = ffn_inner_dim
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.relu_dropout = relu_dropout
-        self.position_encoder = position_encoder
-        self.self_attention_type = self_attention_type.lower()
-        if self.self_attention_type not in ('scaled_dot', 'average'):
-            raise ValueError('invalid attention type %s'
-                             % self.self_attention_type)
-        if self.self_attention_type == 'average':
-            tf.logging.warning(
-                'Support for average attention network is experimental '
-                'and may change in future versions.')
-        self.prenet_units = prenet_units
-        self.dense_units = dense_units
-        self.num_mels = num_mels
-        self.outputs_per_step = outputs_per_step
-        self.X_band_width = X_band_width
-        self.H_band_width = H_band_width
-
-    @property
-    def output_size(self):
-        """Returns the decoder output size."""
-        return self.num_units
-
-    @property
-    def support_alignment_history(self):
-        return True
-
-    @property
-    def support_multi_source(self):
-        return True
-
-    def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1):
-        cache = {}
-
-        for layer in range(self.num_layers):
-            proj_cache_shape = [
-                batch_size, self.num_heads, 0, self.num_units // self.num_heads
-            ]
-            layer_cache = {}
-            layer_cache['memory'] = [{
-                'memory_keys':
-                tf.zeros(proj_cache_shape, dtype=dtype),
-                'memory_values':
-                tf.zeros(proj_cache_shape, dtype=dtype)
-            } for _ in range(num_sources)]
-            if self.self_attention_type == 'scaled_dot':
-                layer_cache['self_keys'] = tf.zeros(
-                    proj_cache_shape, dtype=dtype)
-                layer_cache['self_values'] = tf.zeros(
-                    proj_cache_shape, dtype=dtype)
-            elif self.self_attention_type == 'average':
-                layer_cache['prev_g'] = tf.zeros(
-                    [batch_size, 1, self.num_units], dtype=dtype)
-            cache['layer_{}'.format(layer)] = layer_cache
-
-        return cache
-
-    def _init_attn(self, dtype=tf.float32):
-        attn = []
-        for layer in range(self.num_layers):
-            attn.append(tf.TensorArray(tf.float32, size=0, dynamic_size=True))
-        return attn
-
-    def _self_attention_stack(self,
-                              inputs,
-                              sequence_length=None,
-                              mode=True,
-                              cache=None,
-                              memory=None,
-                              memory_sequence_length=None,
-                              step=None):
-
-        # [N, T_out, self.dense_units] or [N, 1, self.dense_units]
-        prenet_outputs = decoder_prenet(inputs, self.prenet_units,
-                                        self.dense_units, mode)
-        if step is None:
-            decoder_inputs = tf.concat(
-                [memory, prenet_outputs],
-                axis=-1)  # [N, T_out, memory_size + self.dense_units]
-        else:
-            decoder_inputs = tf.concat(
-                [memory[:, step:step + 1, :], prenet_outputs],
-                axis=-1)  # [N, 1, memory_size + self.dense_units]
-        decoder_inputs = tf.layers.dense(
-            decoder_inputs, units=self.dense_units)
-
-        inputs = decoder_inputs
-        inputs *= self.num_units**0.5
-        if self.position_encoder is not None:
-            inputs = self.position_encoder(
-                inputs, position=step + 1 if step is not None else None)
-
-        inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
-
-        decoder_mask = None
-        memory_mask = None
-        # last_attention = None
-
-        X_band_width_tmp = -1
-        H_band_width_tmp = -1
-        if self.X_band_width is not None:
-            X_band_width_tmp = tf.cast(
-                tf.cond(
-                    tf.less(tf.shape(memory)[1], self.X_band_width),
-                    lambda: -1, lambda: self.X_band_width),
-                dtype=tf.int64)
-        if self.H_band_width is not None:
-            H_band_width_tmp = tf.cast(
-                tf.cond(
-                    tf.less(tf.shape(memory)[1], self.H_band_width),
-                    lambda: -1, lambda: self.H_band_width),
-                dtype=tf.int64)
-
-        if self.self_attention_type == 'scaled_dot':
-            if sequence_length is not None:
-                decoder_mask = transformer.build_future_mask(
-                    sequence_length,
-                    num_heads=self.num_heads,
-                    maximum_length=tf.shape(inputs)[1],
-                    band=X_band_width_tmp)  # [N, 1, T_out, T_out]
-        elif self.self_attention_type == 'average':
-            if cache is None:
-                if sequence_length is None:
-                    sequence_length = tf.fill([tf.shape(inputs)[0]],
-                                              tf.shape(inputs)[1])
-                decoder_mask = transformer.cumulative_average_mask(
-                    sequence_length,
-                    maximum_length=tf.shape(inputs)[1],
-                    dtype=inputs.dtype)
-
-        if memory is not None and not tf.contrib.framework.nest.is_sequence(
-                memory):
-            memory = (memory, )
-        if memory_sequence_length is not None:
-            if not tf.contrib.framework.nest.is_sequence(
-                    memory_sequence_length):
-                memory_sequence_length = (memory_sequence_length, )
-            if step is None:
-                memory_mask = [
-                    transformer.build_history_mask(
-                        length,
-                        num_heads=self.num_heads,
-                        maximum_length=tf.shape(m)[1],
-                        band=H_band_width_tmp)
-                    for m, length in zip(memory, memory_sequence_length)
-                ]
-            else:
-                memory_mask = [
-                    transformer.build_history_mask(
-                        length,
-                        num_heads=self.num_heads,
-                        maximum_length=tf.shape(m)[1],
-                        band=H_band_width_tmp)[:, :, step:step + 1, :]
-                    for m, length in zip(memory, memory_sequence_length)
-                ]
-
-        # last_attention = None
-        attns_x = []
-        attns_h = []
-        for layer in range(self.num_layers):
-            layer_name = 'layer_{}'.format(layer)
-            layer_cache = cache[layer_name] if cache is not None else None
-            with tf.variable_scope(layer_name):
-                if memory is not None:
-                    for i, (mem, mask) in enumerate(zip(memory, memory_mask)):
-                        memory_cache = None
-                        if layer_cache is not None:
-                            memory_cache = layer_cache['memory'][i]
-                        scope_name = 'multi_head_{}'.format(i)
-                        if i == 0:
-                            scope_name = 'multi_head'
-                        with tf.variable_scope(scope_name):
-                            encoded, attn_x, attn_h = transformer.multi_head_attention_PNCA(
-                                self.num_heads,
-                                transformer.norm(inputs),
-                                mem,
-                                mode,
-                                num_units=self.num_units,
-                                mask=decoder_mask,
-                                mask_h=mask,
-                                cache=layer_cache,
-                                cache_h=memory_cache,
-                                dropout=self.attention_dropout,
-                                return_attention=True,
-                                layer_name=layer_name,
-                                X_band_width=self.X_band_width)
-                            attns_x.append(attn_x)
-                            attns_h.append(attn_h)
-                            context = transformer.drop_and_add(
-                                inputs, encoded, mode, dropout=self.dropout)
-
-                with tf.variable_scope('ffn'):
-                    transformed = transformer.feed_forward_ori(
-                        transformer.norm(context),
-                        self.ffn_inner_dim,
-                        mode,
-                        dropout=self.relu_dropout)
-                    transformed = transformer.drop_and_add(
-                        context, transformed, mode, dropout=self.dropout)
-
-                inputs = transformed
-
-        outputs = transformer.norm(inputs)
-        outputs = tf.layers.dense(
-            outputs, units=self.num_mels * self.outputs_per_step)
-        return outputs, attns_x, attns_h
-
-    def decode_from_inputs(self,
-                           inputs,
-                           sequence_length,
-                           initial_state=None,
-                           mode=True,
-                           memory=None,
-                           memory_sequence_length=None):
-        outputs, attention_x, attention_h = self._self_attention_stack(
-            inputs,
-            sequence_length=sequence_length,
-            mode=mode,
-            memory=memory,
-            memory_sequence_length=memory_sequence_length)
-        return outputs, attention_x, attention_h
-
-    def step_fn(self,
-                mode,
-                batch_size,
-                initial_state=None,
-                memory=None,
-                memory_sequence_length=None,
-                dtype=tf.float32):
-        if memory is None:
-            num_sources = 0
-        elif tf.contrib.framework.nest.is_sequence(memory):
-            num_sources = len(memory)
-        else:
-            num_sources = 1
-        cache = self._init_cache(
-            batch_size, dtype=dtype, num_sources=num_sources)
-        attention_x = self._init_attn(dtype=dtype)
-        attention_h = self._init_attn(dtype=dtype)
-
-        def _fn(step, inputs, cache):
-            outputs, attention_x, attention_h = self._self_attention_stack(
-                inputs,
-                mode=mode,
-                cache=cache,
-                memory=memory,
-                memory_sequence_length=memory_sequence_length,
-                step=step)
-            attention_x_tmp = []
-            for layer in range(len(attention_h)):
-                attention_x_tmp_l = tf.zeros_like(attention_h[layer])
-                if self.X_band_width is not None:
-                    pred = tf.less(step, self.X_band_width + 1)
-                    attention_x_tmp_l_1 = tf.cond(pred,  # yapf:disable
-                                                  lambda: attention_x_tmp_l[:, :, :, :step + 1] + attention_x[layer],
-                                                  lambda: tf.concat([
-                                                                    attention_x_tmp_l[:, :, :,
-                                                                                      :step - self.X_band_width],
-                                                                    attention_x_tmp_l[:, :, :,
-                                                                                      step - self.X_band_width:step + 1]
-                                                                    + attention_x[layer]],
-                                                                    axis=-1))  # yapf:disable
-                    attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:]
-                    attention_x_tmp.append(
-                        tf.concat([attention_x_tmp_l_1, attention_x_tmp_l_2],
-                                  axis=-1))
-                else:
-                    attention_x_tmp_l_1 = attention_x_tmp_l[:, :, :, :step + 1]
-                    attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:]
-                    attention_x_tmp.append(
-                        tf.concat([
-                            attention_x_tmp_l_1 + attention_x[layer],
-                            attention_x_tmp_l_2
-                        ], axis=-1))  # yapf:disable
-            attention_x = attention_x_tmp
-            return outputs, cache, attention_x, attention_h
-
-        return _fn, cache, attention_x, attention_h
-
-    def dynamic_decode_and_search(self, init_decoder_input, maximum_iterations,
-                                  mode, memory, memory_sequence_length):
-        batch_size = tf.shape(init_decoder_input)[0]
-        step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn(
-            mode,
-            batch_size,
-            memory=memory,
-            memory_sequence_length=memory_sequence_length)
-
-        outputs, attention_x, attention_h, cache = self.dynamic_decode(
-            step_fn,
-            init_decoder_input,
-            init_cache=init_cache,
-            init_attn_x=init_attn_x,
-            init_attn_h=init_attn_h,
-            maximum_iterations=maximum_iterations,
-            batch_size=batch_size)
-        return outputs, attention_x, attention_h
-
-    def dynamic_decode_and_search_teacher_forcing(self, decoder_input,
-                                                  maximum_iterations, mode,
-                                                  memory,
-                                                  memory_sequence_length):
-        batch_size = tf.shape(decoder_input)[0]
-        step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn(
-            mode,
-            batch_size,
-            memory=memory,
-            memory_sequence_length=memory_sequence_length)
-
-        outputs, attention_x, attention_h, cache = self.dynamic_decode_teacher_forcing(
-            step_fn,
-            decoder_input,
-            init_cache=init_cache,
-            init_attn_x=init_attn_x,
-            init_attn_h=init_attn_h,
-            maximum_iterations=maximum_iterations,
-            batch_size=batch_size)
-        return outputs, attention_x, attention_h
-
-    def dynamic_decode(self,
-                       step_fn,
-                       init_decoder_input,
-                       init_cache=None,
-                       init_attn_x=None,
-                       init_attn_h=None,
-                       maximum_iterations=None,
-                       batch_size=None):
-
-        def _cond(step, cache, inputs, outputs, attention_x, attention_h):  # pylint: disable=unused-argument
-            return tf.less(step, maximum_iterations)
-
-        def _body(step, cache, inputs, outputs, attention_x, attention_h):
-            # output: [1, 1, num_mels * r]
-            # attn: [1, 1, T_out]
-            output, cache, attn_x, attn_h = step_fn(
-                step, inputs, cache)  # outputs, cache, attention, attns
-            for layer in range(len(attention_x)):
-                attention_x[layer] = attention_x[layer].write(
-                    step, tf.cast(attn_x[layer], tf.float32))
-
-            for layer in range(len(attention_h)):
-                attention_h[layer] = attention_h[layer].write(
-                    step, tf.cast(attn_h[layer], tf.float32))
-
-            outputs = outputs.write(step, tf.cast(output, tf.float32))
-            return step + 1, cache, output[:, :, -self.
-                                           num_mels:], outputs, attention_x, attention_h
-
-        step = tf.constant(0, dtype=tf.int32)
-        outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
-
-        _, cache, _, outputs, attention_x, attention_h = tf.while_loop(
-            _cond,
-            _body,
-            loop_vars=(step, init_cache, init_decoder_input, outputs,
-                       init_attn_x, init_attn_h),
-            shape_invariants=(step.shape,
-                              compat.nest.map_structure(
-                                  self._get_shape_invariants, init_cache),
-                              compat.nest.map_structure(
-                                  self._get_shape_invariants,
-                                  init_decoder_input), tf.TensorShape(None),
-                              compat.nest.map_structure(
-                                  self._get_shape_invariants, init_attn_x),
-                              compat.nest.map_structure(
-                                  self._get_shape_invariants, init_attn_h)),
-            parallel_iterations=1,
-            back_prop=False,
-            maximum_iterations=maximum_iterations)
-        # element of outputs: [N, 1, num_mels * r]
-        outputs_stack = outputs.stack()  # [T_out, N, 1, num_mels * r]
-        outputs_stack = tf.transpose(
-            outputs_stack, perm=[2, 1, 0, 3])  # [1, N, T_out, num_mels * r]
-        outputs_stack = tf.squeeze(
-            outputs_stack, axis=0)  # [N, T_out, num_mels * r]
-
-        attention_x_stack = []
-        for layer in range(len(attention_x)):
-            attention_x_stack_tmp = attention_x[layer].stack(
-            )  # [T_out, N, H, 1, T_out]
-            attention_x_stack_tmp = tf.transpose(
-                attention_x_stack_tmp, perm=[3, 1, 2, 0,
-                                             4])  # [1, N, H, T_out, T_out]
-            attention_x_stack_tmp = tf.squeeze(
-                attention_x_stack_tmp, axis=0)  # [N, H, T_out, T_out]
-            attention_x_stack.append(attention_x_stack_tmp)
-
-        attention_h_stack = []
-        for layer in range(len(attention_h)):
-            attention_h_stack_tmp = attention_h[layer].stack(
-            )  # [T_out, N, H, 1, T_out]
-            attention_h_stack_tmp = tf.transpose(
-                attention_h_stack_tmp, perm=[3, 1, 2, 0,
-                                             4])  # [1, N, H, T_out, T_out]
-            attention_h_stack_tmp = tf.squeeze(
-                attention_h_stack_tmp, axis=0)  # [N, H, T_out, T_out]
-            attention_h_stack.append(attention_h_stack_tmp)
-
-        return outputs_stack, attention_x_stack, attention_h_stack, cache
-
-    def dynamic_decode_teacher_forcing(self,
-                                       step_fn,
-                                       decoder_input,
-                                       init_cache=None,
-                                       init_attn_x=None,
-                                       init_attn_h=None,
-                                       maximum_iterations=None,
-                                       batch_size=None):
-
-        def _cond(step, cache, inputs, outputs, attention_x, attention_h):  # pylint: disable=unused-argument
-            return tf.less(step, maximum_iterations)
-
-        def _body(step, cache, inputs, outputs, attention_x, attention_h):
-            # output: [1, 1, num_mels * r]
-            # attn: [1, 1, T_out]
-            output, cache, attn_x, attn_h = step_fn(
-                step, inputs[:, step:step + 1, :],
-                cache)  # outputs, cache, attention, attns
-            for layer in range(len(attention_x)):
-                attention_x[layer] = attention_x[layer].write(
-                    step, tf.cast(attn_x[layer], tf.float32))
-
-            for layer in range(len(attention_h)):
-                attention_h[layer] = attention_h[layer].write(
-                    step, tf.cast(attn_h[layer], tf.float32))
-            outputs = outputs.write(step, tf.cast(output, tf.float32))
-            return step + 1, cache, inputs, outputs, attention_x, attention_h
-
-        step = tf.constant(0, dtype=tf.int32)
-        outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
-
-        _, cache, _, outputs, attention_x, attention_h = tf.while_loop(
-            _cond,
-            _body,
-            loop_vars=(step, init_cache, decoder_input, outputs, init_attn_x,
-                       init_attn_h),
-            shape_invariants=(step.shape,
-                              compat.nest.map_structure(
-                                  self._get_shape_invariants,
-                                  init_cache), decoder_input.shape,
-                              tf.TensorShape(None),
-                              compat.nest.map_structure(
-                                  self._get_shape_invariants, init_attn_x),
-                              compat.nest.map_structure(
-                                  self._get_shape_invariants, init_attn_h)),
-            parallel_iterations=1,
-            back_prop=False,
-            maximum_iterations=maximum_iterations)
-        # element of outputs: [N, 1, num_mels * r]
-        outputs_stack = outputs.stack()  # [T_out, N, 1, num_mels * r]
-        outputs_stack = tf.transpose(
-            outputs_stack, perm=[2, 1, 0, 3])  # [1, N, T_out, num_mels * r]
-        outputs_stack = tf.squeeze(
-            outputs_stack, axis=0)  # [N, T_out, num_mels * r]
-
-        attention_x_stack = []
-        for layer in range(len(attention_x)):
-            attention_x_stack_tmp = attention_x[layer].stack(
-            )  # [T_out, N, H, 1, T_out]
-            attention_x_stack_tmp = tf.transpose(
-                attention_x_stack_tmp, perm=[3, 1, 2, 0,
-                                             4])  # [1, N, H, T_out, T_out]
-            attention_x_stack_tmp = tf.squeeze(
-                attention_x_stack_tmp, axis=0)  # [N, H, T_out, T_out]
-            attention_x_stack.append(attention_x_stack_tmp)
-
-        attention_h_stack = []
-        for layer in range(len(attention_h)):
-            attention_h_stack_tmp = attention_h[layer].stack(
-            )  # [T_out, N, H, 1, T_out]
-            attention_h_stack_tmp = tf.transpose(
-                attention_h_stack_tmp, perm=[3, 1, 2, 0,
-                                             4])  # [1, N, H, T_out, T_out]
-            attention_h_stack_tmp = tf.squeeze(
-                attention_h_stack_tmp, axis=0)  # [N, H, T_out, T_out]
-            attention_h_stack.append(attention_h_stack_tmp)
-
-        return outputs_stack, attention_x_stack, attention_h_stack, cache
-
-    def _get_shape_invariants(self, tensor):
-        """Returns the shape of the tensor but sets middle dims to None."""
-        if isinstance(tensor, tf.TensorArray):
-            shape = None
-        else:
-            shape = tensor.shape.as_list()
-            for i in range(1, len(shape) - 1):
-                shape[i] = None
-        return tf.TensorShape(shape)
-
-
-class SelfAttentionDecoderOri():
-    """Decoder using self-attention as described in
-    https://arxiv.org/abs/1706.03762.
-    """
-
-    def __init__(self,
-                 num_layers,
-                 num_units=512,
-                 num_heads=8,
-                 ffn_inner_dim=2048,
-                 dropout=0.1,
-                 attention_dropout=0.1,
-                 relu_dropout=0.1,
-                 position_encoder=SinusoidalPositionEncoder(),
-                 self_attention_type='scaled_dot'):
-        """Initializes the parameters of the decoder.
-
-        Args:
-          num_layers: The number of layers.
-          num_units: The number of hidden units.
-          num_heads: The number of heads in the multi-head attention.
-          ffn_inner_dim: The number of units of the inner linear transformation
-            in the feed forward layer.
-          dropout: The probability to drop units from the outputs.
-          attention_dropout: The probability to drop units from the attention.
-          relu_dropout: The probability to drop units from the ReLU activation in
-            the feed forward layer.
-          position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to
-            apply on inputs or ``None``.
-          self_attention_type: Type of self attention, "scaled_dot" or "average" (case
-            insensitive).
-
-        Raises:
-          ValueError: if :obj:`self_attention_type` is invalid.
-        """
-        super(SelfAttentionDecoderOri, self).__init__()
-        self.num_layers = num_layers
-        self.num_units = num_units
-        self.num_heads = num_heads
-        self.ffn_inner_dim = ffn_inner_dim
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.relu_dropout = relu_dropout
-        self.position_encoder = position_encoder
-        self.self_attention_type = self_attention_type.lower()
-        if self.self_attention_type not in ('scaled_dot', 'average'):
-            raise ValueError('invalid attention type %s'
-                             % self.self_attention_type)
-        if self.self_attention_type == 'average':
-            tf.logging.warning(
-                'Support for average attention network is experimental '
-                'and may change in future versions.')
-
-    @property
-    def output_size(self):
-        """Returns the decoder output size."""
-        return self.num_units
-
-    @property
-    def support_alignment_history(self):
-        return True
-
-    @property
-    def support_multi_source(self):
-        return True
-
-    def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1):
-        cache = {}
-
-        for layer in range(self.num_layers):
-            proj_cache_shape = [
-                batch_size, self.num_heads, 0, self.num_units // self.num_heads
-            ]
-            layer_cache = {}
-            layer_cache['memory'] = [{
-                'memory_keys':
-                tf.zeros(proj_cache_shape, dtype=dtype),
-                'memory_values':
-                tf.zeros(proj_cache_shape, dtype=dtype)
-            } for _ in range(num_sources)]
-            if self.self_attention_type == 'scaled_dot':
-                layer_cache['self_keys'] = tf.zeros(
-                    proj_cache_shape, dtype=dtype)
-                layer_cache['self_values'] = tf.zeros(
-                    proj_cache_shape, dtype=dtype)
-            elif self.self_attention_type == 'average':
-                layer_cache['prev_g'] = tf.zeros(
-                    [batch_size, 1, self.num_units], dtype=dtype)
-            cache['layer_{}'.format(layer)] = layer_cache
-
-        return cache
-
-    def _self_attention_stack(self,
-                              inputs,
-                              sequence_length=None,
-                              mode=True,
-                              cache=None,
-                              memory=None,
-                              memory_sequence_length=None,
-                              step=None):
-        inputs *= self.num_units**0.5
-        if self.position_encoder is not None:
-            inputs = self.position_encoder(
-                inputs, position=step + 1 if step is not None else None)
-
-        inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
-
-        decoder_mask = None
-        memory_mask = None
-        last_attention = None
-
-        if self.self_attention_type == 'scaled_dot':
-            if sequence_length is not None:
-                decoder_mask = transformer.build_future_mask(
-                    sequence_length,
-                    num_heads=self.num_heads,
-                    maximum_length=tf.shape(inputs)[1])
-        elif self.self_attention_type == 'average':
-            if cache is None:
-                if sequence_length is None:
-                    sequence_length = tf.fill([tf.shape(inputs)[0]],
-                                              tf.shape(inputs)[1])
-                decoder_mask = transformer.cumulative_average_mask(
-                    sequence_length,
-                    maximum_length=tf.shape(inputs)[1],
-                    dtype=inputs.dtype)
-
-        if memory is not None and not tf.contrib.framework.nest.is_sequence(
-                memory):
-            memory = (memory, )
-        if memory_sequence_length is not None:
-            if not tf.contrib.framework.nest.is_sequence(
-                    memory_sequence_length):
-                memory_sequence_length = (memory_sequence_length, )
-            memory_mask = [
-                transformer.build_sequence_mask(
-                    length,
-                    num_heads=self.num_heads,
-                    maximum_length=tf.shape(m)[1])
-                for m, length in zip(memory, memory_sequence_length)
-            ]
-
-        for layer in range(self.num_layers):
-            layer_name = 'layer_{}'.format(layer)
-            layer_cache = cache[layer_name] if cache is not None else None
-            with tf.variable_scope(layer_name):
-                if self.self_attention_type == 'scaled_dot':
-                    with tf.variable_scope('masked_multi_head'):
-                        encoded = transformer.multi_head_attention(
-                            self.num_heads,
-                            transformer.norm(inputs),
-                            None,
-                            mode,
-                            num_units=self.num_units,
-                            mask=decoder_mask,
-                            cache=layer_cache,
-                            dropout=self.attention_dropout)
-                        last_context = transformer.drop_and_add(
-                            inputs, encoded, mode, dropout=self.dropout)
-                elif self.self_attention_type == 'average':
-                    with tf.variable_scope('average_attention'):
-                        # Cumulative average.
-                        x = transformer.norm(inputs)
-                        y = transformer.cumulative_average(
-                            x,
-                            decoder_mask if cache is None else step,
-                            cache=layer_cache)
-                        # FFN.
-                        y = transformer.feed_forward(
-                            y,
-                            self.ffn_inner_dim,
-                            mode,
-                            dropout=self.relu_dropout)
-                        # Gating layer.
-                        z = tf.layers.dense(
-                            tf.concat([x, y], -1), self.num_units * 2)
-                        i, f = tf.split(z, 2, axis=-1)
-                        y = tf.sigmoid(i) * x + tf.sigmoid(f) * y
-                        last_context = transformer.drop_and_add(
-                            inputs, y, mode, dropout=self.dropout)
-
-                if memory is not None:
-                    for i, (mem, mask) in enumerate(zip(memory, memory_mask)):
-                        memory_cache = layer_cache['memory'][i] if layer_cache is not None else None  # yapf:disable
-                        with tf.variable_scope('multi_head' if i
-                                               == 0 else 'multi_head_%d' % i):  # yapf:disable
-                            context, last_attention = transformer.multi_head_attention(
-                                self.num_heads,
-                                transformer.norm(last_context),
-                                mem,
-                                mode,
-                                mask=mask,
-                                cache=memory_cache,
-                                dropout=self.attention_dropout,
-                                return_attention=True)
-                            last_context = transformer.drop_and_add(
-                                last_context,
-                                context,
-                                mode,
-                                dropout=self.dropout)
-                            if i > 0:  # Do not return attention in case of multi source.
-                                last_attention = None
-
-                with tf.variable_scope('ffn'):
-                    transformed = transformer.feed_forward_ori(
-                        transformer.norm(last_context),
-                        self.ffn_inner_dim,
-                        mode,
-                        dropout=self.relu_dropout)
-                    transformed = transformer.drop_and_add(
-                        last_context, transformed, mode, dropout=self.dropout)
-
-                inputs = transformed
-
-        if last_attention is not None:
-            # The first head of the last layer is returned.
-            first_head_attention = last_attention[:, 0]
-        else:
-            first_head_attention = None
-
-        outputs = transformer.norm(inputs)
-        return outputs, first_head_attention
-
-    def decode_from_inputs(self,
-                           inputs,
-                           sequence_length,
-                           initial_state=None,
-                           mode=True,
-                           memory=None,
-                           memory_sequence_length=None):
-        outputs, attention = self._self_attention_stack(
-            inputs,
-            sequence_length=sequence_length,
-            mode=mode,
-            memory=memory,
-            memory_sequence_length=memory_sequence_length)
-        return outputs, None, attention
-
-    def step_fn(self,
-                mode,
-                batch_size,
-                initial_state=None,
-                memory=None,
-                memory_sequence_length=None,
-                dtype=tf.float32):
-        if memory is None:
-            num_sources = 0
-        elif tf.contrib.framework.nest.is_sequence(memory):
-            num_sources = len(memory)
-        else:
-            num_sources = 1
-        cache = self._init_cache(
-            batch_size, dtype=dtype, num_sources=num_sources)
-
-        def _fn(step, inputs, cache, mode):
-            inputs = tf.expand_dims(inputs, 1)
-            outputs, attention = self._self_attention_stack(
-                inputs,
-                mode=mode,
-                cache=cache,
-                memory=memory,
-                memory_sequence_length=memory_sequence_length,
-                step=step)
-            outputs = tf.squeeze(outputs, axis=1)
-            if attention is not None:
-                attention = tf.squeeze(attention, axis=1)
-            return outputs, cache, attention
-
-        return _fn, cache
diff --git a/modelscope/models/audio/tts/models/self_attention_encoder.py b/modelscope/models/audio/tts/models/self_attention_encoder.py
deleted file mode 100755
index ce4193dc..00000000
--- a/modelscope/models/audio/tts/models/self_attention_encoder.py
+++ /dev/null
@@ -1,182 +0,0 @@
-"""Define the self-attention encoder."""
-
-import tensorflow as tf
-
-from . import transformer
-from .position import SinusoidalPositionEncoder
-
-
-class SelfAttentionEncoder():
-    """Encoder using self-attention as described in
-    https://arxiv.org/abs/1706.03762.
-    """
-
-    def __init__(self,
-                 num_layers,
-                 num_units=512,
-                 num_heads=8,
-                 ffn_inner_dim=2048,
-                 dropout=0.1,
-                 attention_dropout=0.1,
-                 relu_dropout=0.1,
-                 position_encoder=SinusoidalPositionEncoder()):
-        """Initializes the parameters of the encoder.
-
-        Args:
-          num_layers: The number of layers.
-          num_units: The number of hidden units.
-          num_heads: The number of heads in the multi-head attention.
-          ffn_inner_dim: The number of units of the inner linear transformation
-            in the feed forward layer.
-          dropout: The probability to drop units from the outputs.
-          attention_dropout: The probability to drop units from the attention.
-          relu_dropout: The probability to drop units from the ReLU activation in
-            the feed forward layer.
-          position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
-            apply on inputs or ``None``.
-        """
-        super(SelfAttentionEncoder, self).__init__()
-        self.num_layers = num_layers
-        self.num_units = num_units
-        self.num_heads = num_heads
-        self.ffn_inner_dim = ffn_inner_dim
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.relu_dropout = relu_dropout
-        self.position_encoder = position_encoder
-
-    def encode(self, inputs, sequence_length=None, mode=True):
-        inputs *= self.num_units**0.5
-        if self.position_encoder is not None:
-            inputs = self.position_encoder(inputs)
-
-        inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
-        mask = transformer.build_sequence_mask(
-            sequence_length,
-            num_heads=self.num_heads,
-            maximum_length=tf.shape(inputs)[1])
-
-        mask_FF = tf.squeeze(
-            transformer.build_sequence_mask(
-                sequence_length, maximum_length=tf.shape(inputs)[1]),
-            axis=1)
-
-        state = ()
-
-        attns = []
-        for layer in range(self.num_layers):
-            with tf.variable_scope('layer_{}'.format(layer)):
-                with tf.variable_scope('multi_head'):
-                    context, attn = transformer.multi_head_attention(
-                        self.num_heads,
-                        transformer.norm(inputs),
-                        None,
-                        mode,
-                        num_units=self.num_units,
-                        mask=mask,
-                        dropout=self.attention_dropout,
-                        return_attention=True)
-                    attns.append(attn)
-                    context = transformer.drop_and_add(
-                        inputs, context, mode, dropout=self.dropout)
-
-                with tf.variable_scope('ffn'):
-                    transformed = transformer.feed_forward(
-                        transformer.norm(context),
-                        self.ffn_inner_dim,
-                        mode,
-                        dropout=self.relu_dropout,
-                        mask=mask_FF)
-                    transformed = transformer.drop_and_add(
-                        context, transformed, mode, dropout=self.dropout)
-
-                inputs = transformed
-                state += (tf.reduce_mean(inputs, axis=1), )
-
-        outputs = transformer.norm(inputs)
-        return (outputs, state, sequence_length, attns)
-
-
-class SelfAttentionEncoderOri():
-    """Encoder using self-attention as described in
-    https://arxiv.org/abs/1706.03762.
-    """
-
-    def __init__(self,
-                 num_layers,
-                 num_units=512,
-                 num_heads=8,
-                 ffn_inner_dim=2048,
-                 dropout=0.1,
-                 attention_dropout=0.1,
-                 relu_dropout=0.1,
-                 position_encoder=SinusoidalPositionEncoder()):
-        """Initializes the parameters of the encoder.
-
-        Args:
-          num_layers: The number of layers.
-          num_units: The number of hidden units.
-          num_heads: The number of heads in the multi-head attention.
-          ffn_inner_dim: The number of units of the inner linear transformation
-            in the feed forward layer.
-          dropout: The probability to drop units from the outputs.
-          attention_dropout: The probability to drop units from the attention.
-          relu_dropout: The probability to drop units from the ReLU activation in
-            the feed forward layer.
-          position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
-            apply on inputs or ``None``.
-        """
-        super(SelfAttentionEncoderOri, self).__init__()
-        self.num_layers = num_layers
-        self.num_units = num_units
-        self.num_heads = num_heads
-        self.ffn_inner_dim = ffn_inner_dim
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.relu_dropout = relu_dropout
-        self.position_encoder = position_encoder
-
-    def encode(self, inputs, sequence_length=None, mode=True):
-        inputs *= self.num_units**0.5
-        if self.position_encoder is not None:
-            inputs = self.position_encoder(inputs)
-
-        inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
-        mask = transformer.build_sequence_mask(
-            sequence_length,
-            num_heads=self.num_heads,
-            maximum_length=tf.shape(inputs)[1])  # [N, 1, 1, T_out]
-
-        state = ()
-
-        attns = []
-        for layer in range(self.num_layers):
-            with tf.variable_scope('layer_{}'.format(layer)):
-                with tf.variable_scope('multi_head'):
-                    context, attn = transformer.multi_head_attention(
-                        self.num_heads,
-                        transformer.norm(inputs),
-                        None,
-                        mode,
-                        num_units=self.num_units,
-                        mask=mask,
-                        dropout=self.attention_dropout,
-                        return_attention=True)
-                    attns.append(attn)
-                    context = transformer.drop_and_add(
-                        inputs, context, mode, dropout=self.dropout)
-
-                with tf.variable_scope('ffn'):
-                    transformed = transformer.feed_forward_ori(
-                        transformer.norm(context),
-                        self.ffn_inner_dim,
-                        mode,
-                        dropout=self.relu_dropout)
-                    transformed = transformer.drop_and_add(
-                        context, transformed, mode, dropout=self.dropout)
-
-                inputs = transformed
-                state += (tf.reduce_mean(inputs, axis=1), )
-
-        outputs = transformer.norm(inputs)
-        return (outputs, state, sequence_length, attns)
diff --git a/modelscope/models/audio/tts/models/transformer.py b/modelscope/models/audio/tts/models/transformer.py
deleted file mode 100755
index a9f0bedc..00000000
--- a/modelscope/models/audio/tts/models/transformer.py
+++ /dev/null
@@ -1,1157 +0,0 @@
-"""Define layers related to the Google's Transformer model."""
-
-import tensorflow as tf
-
-from . import compat, fsmn
-
-
-def tile_sequence_length(sequence_length, num_heads):
-    """Tiles lengths :obj:`num_heads` times.
-
-    Args:
-      sequence_length: The sequence length.
-      num_heads: The number of heads.
-
-    Returns:
-      A ``tf.Tensor`` where each length is replicated :obj:`num_heads` times.
-    """
-    sequence_length = tf.tile(sequence_length, [num_heads])
-    sequence_length = tf.reshape(sequence_length, [num_heads, -1])
-    sequence_length = tf.transpose(sequence_length, perm=[1, 0])
-    sequence_length = tf.reshape(sequence_length, [-1])
-    return sequence_length
-
-
-def build_sequence_mask(sequence_length,
-                        num_heads=None,
-                        maximum_length=None,
-                        dtype=tf.float32):
-    """Builds the dot product mask.
-
-    Args:
-      sequence_length: The sequence length.
-      num_heads: The number of heads.
-      maximum_length: Optional size of the returned time dimension. Otherwise
-        it is the maximum of :obj:`sequence_length`.
-      dtype: The type of the mask tensor.
-
-    Returns:
-      A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape
-      ``[batch_size, 1, 1, max_length]``.
-    """
-    mask = tf.sequence_mask(
-        sequence_length, maxlen=maximum_length, dtype=dtype)
-    mask = tf.expand_dims(mask, axis=1)
-    if num_heads is not None:
-        mask = tf.expand_dims(mask, axis=1)
-    return mask
-
-
-def build_sequence_mask_window(sequence_length,
-                               left_window_size=-1,
-                               right_window_size=-1,
-                               num_heads=None,
-                               maximum_length=None,
-                               dtype=tf.float32):
-    """Builds the dot product mask.
-
-    Args:
-      sequence_length: The sequence length.
-      num_heads: The number of heads.
-      maximum_length: Optional size of the returned time dimension. Otherwise
-        it is the maximum of :obj:`sequence_length`.
-      dtype: The type of the mask tensor.
-
-    Returns:
-      A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape
-      ``[batch_size, 1, 1, max_length]``.
-    """
-    sequence_mask = tf.sequence_mask(
-        sequence_length, maxlen=maximum_length, dtype=dtype)
-    mask = _window_mask(
-        sequence_length,
-        left_window_size=left_window_size,
-        right_window_size=right_window_size,
-        maximum_length=maximum_length,
-        dtype=dtype)
-    mask *= tf.expand_dims(sequence_mask, axis=1)
-    if num_heads is not None:
-        mask = tf.expand_dims(mask, axis=1)
-    return mask
-
-
-def _lower_triangle_mask(sequence_length,
-                         maximum_length=None,
-                         dtype=tf.float32,
-                         band=-1):
-    batch_size = tf.shape(sequence_length)[0]
-    if maximum_length is None:
-        maximum_length = tf.reduce_max(sequence_length)
-    mask = tf.ones([batch_size, maximum_length, maximum_length], dtype=dtype)
-    mask = compat.tf_compat(
-        v2='linalg.band_part', v1='matrix_band_part')(mask, band, 0)
-    return mask
-
-
-def _higher_triangle_mask(sequence_length,
-                          maximum_length=None,
-                          dtype=tf.float32,
-                          band=-1):
-    batch_size = tf.shape(sequence_length)[0]
-    if maximum_length is None:
-        maximum_length = tf.reduce_max(sequence_length)
-    mask = tf.ones([batch_size, maximum_length, maximum_length], dtype=dtype)
-    mask = compat.tf_compat(
-        v2='linalg.band_part', v1='matrix_band_part')(mask, 0, band)
-    return mask
-
-
-def _window_mask(sequence_length,
-                 left_window_size=-1,
-                 right_window_size=-1,
-                 maximum_length=None,
-                 dtype=tf.float32):
-    batch_size = tf.shape(sequence_length)[0]
-    if maximum_length is None:
-        maximum_length = tf.reduce_max(sequence_length)
-    mask = tf.ones([batch_size, maximum_length, maximum_length], dtype=dtype)
-    left_window_size = tf.minimum(
-        tf.cast(left_window_size, tf.int64),
-        tf.cast(maximum_length - 1, tf.int64))
-    right_window_size = tf.minimum(
-        tf.cast(right_window_size, tf.int64),
-        tf.cast(maximum_length - 1, tf.int64))
-    mask = tf.matrix_band_part(mask, left_window_size, right_window_size)
-    return mask
-
-
-def build_future_mask(sequence_length,
-                      num_heads=None,
-                      maximum_length=None,
-                      dtype=tf.float32,
-                      band=-1):
-    """Builds the dot product mask for future positions.
-
-    Args:
-      sequence_length: The sequence length.
-      num_heads: The number of heads.
-      maximum_length: Optional size of the returned time dimension. Otherwise
-        it is the maximum of :obj:`sequence_length`.
-      dtype: The type of the mask tensor.
-
-    Returns:
-      A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape
-      ``[batch_size, 1, max_length, max_length]``.
-    """
-    sequence_mask = tf.sequence_mask(
-        sequence_length, maxlen=maximum_length, dtype=dtype)
-    mask = _lower_triangle_mask(
-        sequence_length, maximum_length=maximum_length, dtype=dtype, band=band)
-    mask *= tf.expand_dims(sequence_mask, axis=1)
-    if num_heads is not None:
-        mask = tf.expand_dims(mask, axis=1)
-    return mask
-
-
-def build_history_mask(sequence_length,
-                       num_heads=None,
-                       maximum_length=None,
-                       dtype=tf.float32,
-                       band=-1):
-    """Builds the dot product mask for future positions.
-
-    Args:
-      sequence_length: The sequence length.
-      num_heads: The number of heads.
-      maximum_length: Optional size of the returned time dimension. Otherwise
-        it is the maximum of :obj:`sequence_length`.
-      dtype: The type of the mask tensor.
-
-    Returns:
-      A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape
-      ``[batch_size, 1, max_length, max_length]``.
-    """
-    sequence_mask = tf.sequence_mask(
-        sequence_length, maxlen=maximum_length, dtype=dtype)
-    mask = _higher_triangle_mask(
-        sequence_length, maximum_length=maximum_length, dtype=dtype, band=band)
-    mask *= tf.expand_dims(sequence_mask, axis=1)
-    if num_heads is not None:
-        mask = tf.expand_dims(mask, axis=1)
-    return mask
-
-
-def cumulative_average_mask(sequence_length,
-                            maximum_length=None,
-                            dtype=tf.float32):
-    """Builds the mask to compute the cumulative average as described in
-    https://arxiv.org/abs/1805.00631.
-
-    Args:
-      sequence_length: The sequence length.
-      maximum_length: Optional size of the returned time dimension. Otherwise
-        it is the maximum of :obj:`sequence_length`.
-      dtype: The type of the mask tensor.
-
-    Returns:
-      A ``tf.Tensor`` of type :obj:`dtype` and shape
-      ``[batch_size, max_length, max_length]``.
-    """
-    sequence_mask = tf.sequence_mask(
-        sequence_length, maxlen=maximum_length, dtype=dtype)
-    mask = _lower_triangle_mask(
-        sequence_length, maximum_length=maximum_length, dtype=dtype)
-    mask *= tf.expand_dims(sequence_mask, axis=2)
-    weight = tf.range(1, tf.cast(tf.shape(mask)[1] + 1, dtype), dtype=dtype)
-    mask /= tf.expand_dims(weight, 1)
-    return mask
-
-
-def cumulative_average(inputs, mask_or_step, cache=None):
-    """Computes the cumulative average as described in
-    https://arxiv.org/abs/1805.00631.
-
-    Args:
-      inputs: The sequence to average. A tensor of shape :math:`[B, T, D]`.
-      mask_or_step: If :obj:`cache` is set, this is assumed to be the current step
-        of the dynamic decoding. Otherwise, it is the mask matrix used to compute
-        the cumulative average.
-      cache: A dictionnary containing the cumulative average of the previous step.
-
-    Returns:
-      The cumulative average, a tensor of the same shape and type as :obj:`inputs`.
-    """
-    if cache is not None:
-        step = tf.cast(mask_or_step, inputs.dtype)
-        aa = (inputs + step * cache['prev_g']) / (step + 1.0)
-        cache['prev_g'] = aa
-        return aa
-    else:
-        mask = mask_or_step
-        return tf.matmul(mask, inputs)
-
-
-def fused_projection(inputs, num_units, num_outputs=1):
-    """Projects the same input into multiple output spaces.
-
-    Args:
-      inputs: The inputs to project.
-      num_units: The number of output units of each space.
-      num_outputs: The number of output spaces.
-
-    Returns:
-      :obj:`num_outputs` ``tf.Tensor`` of depth :obj:`num_units`.
-    """
-    return tf.split(
-        tf.layers.conv1d(inputs, num_units * num_outputs, 1),
-        num_outputs,
-        axis=2)
-
-
-def split_heads(inputs, num_heads):
-    """Splits a tensor in depth.
-
-    Args:
-      inputs: A ``tf.Tensor`` of shape :math:`[B, T, D]`.
-      num_heads: The number of heads :math:`H`.
-
-    Returns:
-      A ``tf.Tensor`` of shape :math:`[B, H, T, D / H]`.
-    """
-    static_shape = inputs.get_shape().as_list()
-    depth = static_shape[-1]
-    outputs = tf.reshape(inputs, [
-        tf.shape(inputs)[0],
-        tf.shape(inputs)[1], num_heads, depth // num_heads
-    ])
-    outputs = tf.transpose(outputs, perm=[0, 2, 1, 3])
-    return outputs
-
-
-def combine_heads(inputs):
-    """Concatenates heads.
-
-    Args:
-      inputs: A ``tf.Tensor`` of shape :math:`[B, H, T, D]`.
-
-    Returns:
-      A ``tf.Tensor`` of shape :math:`[B, T, D * H]`.
-    """
-    static_shape = inputs.get_shape().as_list()
-    depth = static_shape[-1]
-    num_heads = static_shape[1]
-    outputs = tf.transpose(inputs, perm=[0, 2, 1, 3])
-    outputs = tf.reshape(
-        outputs,
-        [tf.shape(outputs)[0],
-         tf.shape(outputs)[1], depth * num_heads])
-    return outputs
-
-
-def dot_product_attention(queries, keys, values, mode, mask=None, dropout=0.0):
-    """Computes the dot product attention.
-
-    Args:
-      queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`.
-      keys: The sequence use to calculate attention scores. A tensor of shape
-        :math:`[B, T_2, ...]`.
-      values: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      mask: A ``tf.Tensor`` applied to the dot product.
-      dropout: The probability to drop units from the inputs.
-
-    Returns:
-      A tuple ``(context vector, attention vector)``.
-    """
-    dot = tf.matmul(queries, keys, transpose_b=True)
-
-    if mask is not None:
-        dot = tf.cast(
-            tf.cast(dot, tf.float32) * mask + ((1.0 - mask) * tf.float32.min),
-            dot.dtype)
-
-    softmax = tf.nn.softmax(tf.cast(dot, tf.float32))
-    attn = tf.cast(softmax, dot.dtype)
-    drop_attn = tf.layers.dropout(attn, rate=dropout, training=mode)
-
-    context = tf.matmul(drop_attn, values)
-
-    return context, attn
-
-
-def dot_product_attention_wpa(num_heads,
-                              queries,
-                              keys,
-                              values,
-                              mode,
-                              attention_left_window=-1,
-                              attention_right_window=0,
-                              mask=None,
-                              max_id_cache=None,
-                              mono=False,
-                              peak_delay=-1,
-                              dropout=0.0):
-    """
-    Computes the dot product attention.
-    Args:
-      queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`.
-      keys: The sequence use to calculate attention scores. A tensor of shape
-        :math:`[B, T_2, ...]`.
-      values: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      mask: A ``tf.Tensor`` applied to the dot product.
-      dropout: The probability to drop units from the inputs.
-
-    Returns:
-      A tuple ``(context vector, attention vector)``.
-    """
-    # Dot product between queries and keys.
-    dot = tf.matmul(queries, keys, transpose_b=True)
-    depth = tf.shape(dot)[-1]
-    if mask is not None:
-        dot = tf.cast(
-            tf.cast(dot, tf.float32) * mask + ((1.0 - mask) * tf.float32.min),
-            dot.dtype)
-    # wpa
-    max_id = tf.math.argmax(input=dot, axis=-1)
-    # peak delay
-    if peak_delay > 0:
-        if max_id_cache is not None:
-            M = tf.cast(max_id_cache['pre_max_id'], dtype=max_id.dtype)
-            inputs_len = tf.math.minimum(
-                M + peak_delay, tf.cast(depth - 1, dtype=max_id.dtype))
-            delay_mask = tf.sequence_mask(
-                inputs_len, maxlen=depth, dtype=tf.float32)
-            dot = tf.cast(
-                tf.cast(dot, tf.float32) * delay_mask
-                + ((1.0 - delay_mask) * tf.float32.min), dot.dtype)  # yapf:disable
-            max_id = tf.math.argmax(input=dot, axis=-1)
-    # mono
-    if mono:
-        if max_id_cache is None:
-            d = tf.shape(max_id)[-1]
-            tmp_max_id = tf.reshape(max_id, [-1, num_heads, d])
-            tmp_max_id = tf.slice(
-                tmp_max_id, [0, 0, 0],
-                [tf.shape(tmp_max_id)[0],
-                 tf.shape(tmp_max_id)[1], d - 1])
-            zeros = tf.zeros(
-                shape=(tf.shape(tmp_max_id)[0], tf.shape(tmp_max_id)[1], 1),
-                dtype=max_id.dtype)
-            tmp_max_id = tf.concat([zeros, tmp_max_id], axis=-1)
-            mask1 = tf.sequence_mask(
-                tmp_max_id, maxlen=depth, dtype=tf.float32)
-            dot = tf.cast(
-                tf.cast(dot, tf.float32)
-                * (1.0 - mask1) + mask1 * tf.float32.min, dot.dtype)  # yapf:disable
-            max_id = tf.math.argmax(input=dot, axis=-1)
-        else:
-            # eval
-            tmp_max_id = tf.reshape(max_id, [-1, num_heads, 1])
-            max_id_cache['pre_max_id'] = tmp_max_id
-    # right_mask
-    right_offset = tf.constant(attention_right_window, dtype=max_id.dtype)
-    right_len = tf.math.minimum(max_id + right_offset,
-                                tf.cast(depth - 1, dtype=max_id.dtype))
-    right_mask = tf.sequence_mask(right_len, maxlen=depth, dtype=tf.float32)
-    dot = tf.cast(
-        tf.cast(dot, tf.float32) * right_mask
-        + ((1.0 - right_mask) * tf.float32.min), dot.dtype)  # yapf:disable
-    # left_mask
-    if attention_left_window > 0:
-        left_offset = tf.constant(attention_left_window, dtype=max_id.dtype)
-        left_len = tf.math.maximum(max_id - left_offset,
-                                   tf.cast(0, dtype=max_id.dtype))
-        left_mask = tf.sequence_mask(left_len, maxlen=depth, dtype=tf.float32)
-        dot = tf.cast(
-            tf.cast(dot, tf.float32) * (1.0 - left_mask)
-            + (left_mask * tf.float32.min), dot.dtype)  # yapf:disable
-    # Compute attention weights.
-    attn = tf.cast(tf.nn.softmax(tf.cast(dot, tf.float32)), dot.dtype)
-    drop_attn = tf.layers.dropout(attn, rate=dropout, training=mode)
-
-    # Compute attention context.
-    context = tf.matmul(drop_attn, values)
-
-    return context, attn
-
-
-def multi_head_attention(num_heads,
-                         queries,
-                         memory,
-                         mode,
-                         num_units=None,
-                         mask=None,
-                         cache=None,
-                         dropout=0.0,
-                         return_attention=False):
-    """Computes the multi-head attention as described in
-    https://arxiv.org/abs/1706.03762.
-
-    Args:
-      num_heads: The number of attention heads.
-      queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`.
-      memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`.
-        If ``None``, computes self-attention.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      num_units: The number of hidden units. If not set, it is set to the input
-        dimension.
-      mask: A ``tf.Tensor`` applied to the dot product.
-      cache: A dictionary containing pre-projected keys and values.
-      dropout: The probability to drop units from the inputs.
-      return_attention: Return the attention head probabilities in addition to the
-        context.
-
-    Returns:
-      The concatenated attention context of each head and the attention
-      probabilities (if :obj:`return_attention` is set).
-    """
-    num_units = num_units or queries.get_shape().as_list()[-1]
-
-    if num_units % num_heads != 0:
-        raise ValueError('Multi head attention requires that num_units is a'
-                         ' multiple of {}'.format(num_heads))
-
-    if memory is None:
-        queries, keys, values = fused_projection(
-            queries, num_units, num_outputs=3)
-
-        keys = split_heads(keys, num_heads)
-        values = split_heads(values, num_heads)
-
-        if cache is not None:
-            keys = tf.concat([cache['self_keys'], keys], axis=2)
-            values = tf.concat([cache['self_values'], values], axis=2)
-            cache['self_keys'] = keys
-            cache['self_values'] = values
-    else:
-        queries = tf.layers.conv1d(queries, num_units, 1)
-
-        if cache is not None:
-
-            def _project_and_split():
-                k, v = fused_projection(memory, num_units, num_outputs=2)
-                return split_heads(k, num_heads), split_heads(v, num_heads)
-
-            keys, values = tf.cond(
-                tf.equal(tf.shape(cache['memory_keys'])[2], 0),
-                true_fn=_project_and_split,
-                false_fn=lambda:
-                (cache['memory_keys'], cache['memory_values']))
-            cache['memory_keys'] = keys
-            cache['memory_values'] = values
-        else:
-            keys, values = fused_projection(memory, num_units, num_outputs=2)
-            keys = split_heads(keys, num_heads)
-            values = split_heads(values, num_heads)
-
-    queries = split_heads(queries, num_heads)
-    queries *= (num_units // num_heads)**-0.5
-
-    heads, attn = dot_product_attention(
-        queries, keys, values, mode, mask=mask, dropout=dropout)
-
-    # Concatenate all heads output.
-    combined = combine_heads(heads)
-    outputs = tf.layers.conv1d(combined, num_units, 1)
-
-    if not return_attention:
-        return outputs
-    return outputs, attn
-
-
-def multi_head_attention_PNCA(num_heads,
-                              queries,
-                              memory,
-                              mode,
-                              num_units=None,
-                              mask=None,
-                              mask_h=None,
-                              cache=None,
-                              cache_h=None,
-                              dropout=0.0,
-                              return_attention=False,
-                              X_band_width=None,
-                              layer_name='multi_head'):
-    """Computes the multi-head attention as described in
-    https://arxiv.org/abs/1706.03762.
-
-    Args:
-      num_heads: The number of attention heads.
-      queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`.
-      memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`.
-        If ``None``, computes self-attention.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      num_units: The number of hidden units. If not set, it is set to the input
-        dimension.
-      mask: A ``tf.Tensor`` applied to the dot product.
-      cache: A dictionary containing pre-projected keys and values.
-      dropout: The probability to drop units from the inputs.
-      return_attention: Return the attention head probabilities in addition to the
-        context.
-
-    Returns:
-      The concatenated attention context of each head and the attention
-      probabilities (if :obj:`return_attention` is set).
-    """
-    num_units = num_units or queries.get_shape().as_list()[-1]
-
-    if num_units % num_heads != 0:
-        raise ValueError('Multi head attention requires that num_units is a'
-                         ' multiple of {}'.format(num_heads))
-
-    # X
-    queries, keys, values = fused_projection(queries, num_units, num_outputs=3)
-
-    keys = split_heads(keys, num_heads)
-    values = split_heads(values, num_heads)
-
-    if cache is not None:
-        keys = tf.concat([cache['self_keys'], keys], axis=2)
-        values = tf.concat([cache['self_values'], values], axis=2)
-        if X_band_width is not None:
-            keys_band = tf.cond(
-                tf.less(X_band_width, 0), lambda: keys, lambda: tf.cond(
-                    tf.less(tf.shape(keys)[2], X_band_width), lambda: keys,
-                    lambda: keys[:, :, -X_band_width:, :])
-            )  # not support X_band_width == 0
-            values_band = tf.cond(
-                tf.less(X_band_width, 0), lambda: values, lambda: tf.cond(
-                    tf.less(tf.shape(values)[2], X_band_width), lambda: values,
-                    lambda: values[:, :, -X_band_width:, :]))
-            cache['self_keys'] = keys_band
-            cache['self_values'] = values_band
-        else:
-            cache['self_keys'] = keys
-            cache['self_values'] = values
-
-    queries = split_heads(queries, num_heads)
-    queries *= (num_units // num_heads)**-0.5
-
-    heads, attn = dot_product_attention(
-        queries, keys, values, mode, mask=mask, dropout=dropout)
-
-    # Concatenate all heads output.
-    combined = combine_heads(heads)
-    outputs = tf.layers.conv1d(combined, num_units, 1)
-
-    # H
-    if cache_h is not None:
-
-        def _project_and_split():
-            k, v = fused_projection(memory, num_units, num_outputs=2)
-            return split_heads(k, num_heads), split_heads(v, num_heads)
-
-        keys_h, values_h = tf.cond(
-            tf.equal(tf.shape(cache_h['memory_keys'])[2], 0),
-            true_fn=_project_and_split,
-            false_fn=lambda:
-            (cache_h['memory_keys'], cache_h['memory_values']))
-        cache_h['memory_keys'] = keys_h
-        cache_h['memory_values'] = values_h
-    else:
-        keys_h, values_h = fused_projection(memory, num_units, num_outputs=2)
-        keys_h = split_heads(keys_h, num_heads)
-        values_h = split_heads(values_h, num_heads)
-
-    heads_h, attn_h = dot_product_attention(
-        queries, keys_h, values_h, mode, mask=mask_h, dropout=dropout)
-
-    # Concatenate all heads output.
-    combined_h = combine_heads(heads_h)
-    outputs_h = tf.layers.conv1d(combined_h, num_units, 1)
-
-    # ADD
-    outputs = outputs + outputs_h
-
-    # RETURN
-    return outputs, attn, attn_h
-
-
-def multi_head_attention_memory(num_heads,
-                                queries,
-                                memory,
-                                mode,
-                                num_memory=None,
-                                num_units=None,
-                                mask=None,
-                                cache=None,
-                                dropout=0.0,
-                                return_attention=False):
-    """Computes the multi-head attention as described in
-    https://arxiv.org/abs/1706.03762.
-
-    Args:
-      num_heads: The number of attention heads.
-      queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`.
-      memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`.
-        If ``None``, computes self-attention.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      num_units: The number of hidden units. If not set, it is set to the input
-        dimension.
-      mask: A ``tf.Tensor`` applied to the dot product.
-      cache: A dictionary containing pre-projected keys and values.
-      dropout: The probability to drop units from the inputs.
-      return_attention: Return the attention head probabilities in addition to the
-        context.
-
-    Returns:
-      The concatenated attention context of each head and the attention
-      probabilities (if :obj:`return_attention` is set).
-    """
-    num_units = num_units or queries.get_shape().as_list()[-1]
-
-    if num_units % num_heads != 0:
-        raise ValueError('Multi head attention requires that num_units is a'
-                         ' multiple of {}'.format(num_heads))
-
-    # PERSISTENT MEMORY
-    # key memory
-    if num_memory is not None:
-        key_m = tf.get_variable(
-            'key_m',
-            shape=[num_memory, num_units],
-            initializer=tf.glorot_uniform_initializer(),
-            dtype=tf.float32)
-        # value memory
-        value_m = tf.get_variable(
-            'value_m',
-            shape=[num_memory, num_units],
-            initializer=tf.glorot_uniform_initializer(),
-            dtype=tf.float32)
-    if memory is None:
-        queries, keys, values = fused_projection(
-            queries, num_units, num_outputs=3)
-
-        # concat memory
-        if num_memory is not None:
-            key_m_expand = tf.tile(
-                tf.expand_dims(key_m, 0), [tf.shape(keys)[0], 1, 1])
-            value_m_expand = tf.tile(
-                tf.expand_dims(value_m, 0), [tf.shape(values)[0], 1, 1])
-            keys = tf.concat([key_m_expand, keys], axis=1)
-            values = tf.concat([value_m_expand, values], axis=1)
-
-        keys = split_heads(keys, num_heads)
-        values = split_heads(values, num_heads)
-
-        if cache is not None:
-            keys = tf.concat([cache['self_keys'], keys], axis=2)
-            values = tf.concat([cache['self_values'], values], axis=2)
-            cache['self_keys'] = keys
-            cache['self_values'] = values
-    else:
-        queries = tf.layers.conv1d(queries, num_units, 1)
-
-        if cache is not None:
-
-            def _project_and_split():
-                k, v = fused_projection(memory, num_units, num_outputs=2)
-                return split_heads(k, num_heads), split_heads(v, num_heads)
-
-            keys, values = tf.cond(
-                tf.equal(tf.shape(cache['memory_keys'])[2], 0),
-                true_fn=_project_and_split,
-                false_fn=lambda:
-                (cache['memory_keys'], cache['memory_values']))
-            cache['memory_keys'] = keys
-            cache['memory_values'] = values
-        else:
-            keys, values = fused_projection(memory, num_units, num_outputs=2)
-            keys = split_heads(keys, num_heads)
-            values = split_heads(values, num_heads)
-
-    queries = split_heads(queries, num_heads)
-    queries *= (num_units // num_heads)**-0.5
-
-    heads, attn = dot_product_attention(
-        queries, keys, values, mode, mask=mask, dropout=dropout)
-
-    # Concatenate all heads output.
-    combined = combine_heads(heads)
-    outputs = tf.layers.conv1d(combined, num_units, 1)
-
-    if not return_attention:
-        return outputs
-    return outputs, attn
-
-
-def Ci_Cd_Memory(num_heads,
-                 queries,
-                 mode,
-                 filter_size=None,
-                 num_memory=None,
-                 num_units=None,
-                 fsmn_mask=None,
-                 san_mask=None,
-                 cache=None,
-                 shift=None,
-                 dropout=0.0,
-                 return_attention=False):
-    """
-    Args:
-      num_heads: The number of attention heads.
-      queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`.
-      memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`.
-        If ``None``, computes self-attention.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      num_units: The number of hidden units. If not set, it is set to the input
-        dimension.
-      mask: A ``tf.Tensor`` applied to the dot product.
-      cache: A dictionary containing pre-projected keys and values.
-      dropout: The probability to drop units from the inputs.
-      return_attention: Return the attention head probabilities in addition to the
-        context.
-
-    Returns:
-      The concatenated attention context of each head and the attention
-      probabilities (if :obj:`return_attention` is set).
-    """
-    num_units = num_units or queries.get_shape().as_list()[-1]
-
-    if num_units % num_heads != 0:
-        raise ValueError('Multi head attention requires that num_units is a'
-                         ' multiple of {}'.format(num_heads))
-    # PERSISTENT MEMORY
-    if num_memory is not None:
-        key_m = tf.get_variable(
-            'key_m',
-            shape=[num_memory, num_units],
-            initializer=tf.glorot_uniform_initializer(),
-            dtype=tf.float32)
-        value_m = tf.get_variable(
-            'value_m',
-            shape=[num_memory, num_units],
-            initializer=tf.glorot_uniform_initializer(),
-            dtype=tf.float32)
-
-    queries, keys, values = fused_projection(queries, num_units, num_outputs=3)
-    # fsmn memory block
-    if shift is not None:
-        # encoder
-        fsmn_memory = fsmn.MemoryBlockV2(
-            values,
-            filter_size,
-            mode,
-            shift=shift,
-            mask=fsmn_mask,
-            dropout=dropout)
-    else:
-        # decoder
-        fsmn_memory = fsmn.UniMemoryBlock(
-            values,
-            filter_size,
-            mode,
-            cache=cache,
-            mask=fsmn_mask,
-            dropout=dropout)
-
-    # concat persistent memory
-    if num_memory is not None:
-        key_m_expand = tf.tile(
-            tf.expand_dims(key_m, 0), [tf.shape(keys)[0], 1, 1])
-        value_m_expand = tf.tile(
-            tf.expand_dims(value_m, 0), [tf.shape(values)[0], 1, 1])
-        keys = tf.concat([key_m_expand, keys], axis=1)
-        values = tf.concat([value_m_expand, values], axis=1)
-
-    keys = split_heads(keys, num_heads)
-    values = split_heads(values, num_heads)
-
-    if cache is not None:
-        keys = tf.concat([cache['self_keys'], keys], axis=2)
-        values = tf.concat([cache['self_values'], values], axis=2)
-        cache['self_keys'] = keys
-        cache['self_values'] = values
-
-    queries = split_heads(queries, num_heads)
-    queries *= (num_units // num_heads)**-0.5
-
-    heads, attn = dot_product_attention(
-        queries, keys, values, mode, mask=san_mask, dropout=dropout)
-
-    # Concatenate all heads output.
-    combined = combine_heads(heads)
-    outputs = tf.layers.conv1d(combined, num_units, 1)
-    outputs = outputs + fsmn_memory
-
-    if not return_attention:
-        return outputs
-    return outputs, attn
-
-
-def multi_head_attention_wpa(num_heads,
-                             queries,
-                             memory,
-                             mode,
-                             attention_left_window=-1,
-                             attention_right_window=0,
-                             num_units=None,
-                             mask=None,
-                             cache=None,
-                             max_id_cache=None,
-                             dropout=0.0,
-                             mono=False,
-                             peak_delay=-1,
-                             return_attention=False):
-    """Computes the multi-head attention as described in
-    https://arxiv.org/abs/1706.03762.
-
-    Args:
-      num_heads: The number of attention heads.
-      queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`.
-      memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`.
-        If ``None``, computes self-attention.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      num_units: The number of hidden units. If not set, it is set to the input
-        dimension.
-      mask: A ``tf.Tensor`` applied to the dot product.
-      cache: A dictionary containing pre-projected keys and values.
-      dropout: The probability to drop units from the inputs.
-      return_attention: Return the attention head probabilities in addition to the
-        context.
-
-    Returns:
-      The concatenated attention context of each head and the attention
-      probabilities (if :obj:`return_attention` is set).
-    """
-    num_units = num_units or queries.get_shape().as_list()[-1]
-
-    if num_units % num_heads != 0:
-        raise ValueError('Multi head attention requires that num_units is a'
-                         ' multiple of {}'.format(num_heads))
-
-    if memory is None:
-        queries, keys, values = fused_projection(
-            queries, num_units, num_outputs=3)
-
-        keys = split_heads(keys, num_heads)
-        values = split_heads(values, num_heads)
-
-        if cache is not None:
-            keys = tf.concat([cache['self_keys'], keys], axis=2)
-            values = tf.concat([cache['self_values'], values], axis=2)
-            cache['self_keys'] = keys
-            cache['self_values'] = values
-    else:
-        queries = tf.layers.conv1d(queries, num_units, 1)
-
-        if cache is not None:
-
-            def _project_and_split():
-                k, v = fused_projection(memory, num_units, num_outputs=2)
-                return split_heads(k, num_heads), split_heads(v, num_heads)
-
-            keys, values = tf.cond(
-                tf.equal(tf.shape(cache['memory_keys'])[2], 0),
-                true_fn=_project_and_split,
-                false_fn=lambda:
-                (cache['memory_keys'], cache['memory_values']))
-            cache['memory_keys'] = keys
-            cache['memory_values'] = values
-        else:
-            keys, values = fused_projection(memory, num_units, num_outputs=2)
-            keys = split_heads(keys, num_heads)
-            values = split_heads(values, num_heads)
-
-    queries = split_heads(queries, num_heads)
-    queries *= (num_units // num_heads)**-0.5
-
-    heads, attn = dot_product_attention_wpa(
-        num_heads,
-        queries,
-        keys,
-        values,
-        mode,
-        attention_left_window=attention_left_window,
-        attention_right_window=attention_right_window,
-        mask=mask,
-        max_id_cache=max_id_cache,
-        mono=mono,
-        peak_delay=peak_delay,
-        dropout=dropout)
-
-    # Concatenate all heads output.
-    combined = combine_heads(heads)
-    outputs = tf.layers.conv1d(combined, num_units, 1)
-
-    if not return_attention:
-        return outputs
-    return outputs, attn
-
-
-def feed_forward(x, inner_dim, mode, dropout=0.0, mask=None):
-    """Implements the Transformer's "Feed Forward" layer.
-
-    .. math::
-
-        ffn(x) = max(0, x*W_1 + b_1)*W_2 + b_2
-
-    Args:
-      x: The input.
-      inner_dim: The number of units of the inner linear transformation.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      dropout: The probability to drop units from the inner transformation.
-
-    Returns:
-      The transformed input.
-    """
-    input_dim = x.get_shape().as_list()[-1]
-
-    if mask is not None:
-        x = x * tf.expand_dims(mask, -1)
-
-    inner = tf.layers.conv1d(
-        x, inner_dim, 3, padding='same', activation=tf.nn.relu)
-
-    if mask is not None:
-        inner = inner * tf.expand_dims(mask, -1)
-    inner = tf.layers.dropout(inner, rate=dropout, training=mode)
-    outer = tf.layers.conv1d(inner, input_dim, 1)
-
-    return outer
-
-
-def feed_forward_ori(x, inner_dim, mode, dropout=0.0):
-    """Implements the Transformer's "Feed Forward" layer.
-
-    .. math::
-
-        ffn(x) = max(0, x*W_1 + b_1)*W_2 + b_2
-
-    Args:
-      x: The input.
-      inner_dim: The number of units of the inner linear transformation.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      dropout: The probability to drop units from the inner transformation.
-
-    Returns:
-      The transformed input.
-    """
-    input_dim = x.get_shape().as_list()[-1]
-
-    inner = tf.layers.conv1d(x, inner_dim, 1, activation=tf.nn.relu)
-    inner = tf.layers.dropout(inner, rate=dropout, training=mode)
-    outer = tf.layers.conv1d(inner, input_dim, 1)
-
-    return outer
-
-
-def norm(inputs):
-    """Layer normalizes :obj:`inputs`."""
-    return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1)
-
-
-def drop_and_add(inputs, outputs, mode, dropout=0.1):
-    """Drops units in the outputs and adds the previous values.
-
-    Args:
-      inputs: The input of the previous layer.
-      outputs: The output of the previous layer.
-      mode: A ``tf.estimator.ModeKeys`` mode.
-      dropout: The probability to drop units in :obj:`outputs`.
-
-    Returns:
-      The residual and normalized output.
-    """
-    outputs = tf.layers.dropout(outputs, rate=dropout, training=mode)
-
-    input_dim = inputs.get_shape().as_list()[-1]
-    output_dim = outputs.get_shape().as_list()[-1]
-
-    if input_dim == output_dim:
-        outputs += inputs
-    return outputs
-
-
-class FeedForwardNetwork(tf.keras.layers.Layer):
-    """Implements the Transformer's "Feed Forward" layer.
-
-    .. math::
-
-        ffn(x) = max(0, x*W_1 + b_1)*W_2 + b_2
-
-    Note:
-      Object-oriented implementation for TensorFlow 2.0.
-    """
-
-    def __init__(self,
-                 inner_dim,
-                 output_dim,
-                 dropout=0.1,
-                 activation=tf.nn.relu,
-                 **kwargs):
-        """Initializes this layer.
-
-        Args:
-          inner_dim: The number of units of the inner linear transformation.
-          output_dim: The number of units of the ouput linear transformation.
-          dropout: The probability to drop units from the activation output.
-          activation: The activation function to apply between the two linear
-            transformations.
-          kwargs: Additional layer arguments.
-        """
-        super(FeedForwardNetwork, self).__init__(**kwargs)
-        self.inner = tf.keras.layers.Dense(
-            inner_dim, activation=activation, name='inner')
-        self.outer = tf.keras.layers.Dense(output_dim, name='outer')
-        self.dropout = dropout
-
-    def call(self, inputs, training=None):  # pylint: disable=arguments-differ
-        """Runs the layer."""
-        inner = self.inner(inputs)
-        inner = tf.layers.dropout(inner, self.dropout, training=training)
-        return self.outer(inner)
-
-
-class MultiHeadAttention(tf.keras.layers.Layer):
-    """Computes the multi-head attention as described in
-    https://arxiv.org/abs/1706.03762.
-
-    Note:
-      Object-oriented implementation for TensorFlow 2.0.
-    """
-
-    def __init__(self,
-                 num_heads,
-                 num_units,
-                 dropout=0.1,
-                 return_attention=False,
-                 **kwargs):
-        """Initializes this layers.
-
-        Args:
-          num_heads: The number of attention heads.
-          num_units: The number of hidden units.
-          dropout: The probability to drop units from the inputs.
-          return_attention: If ``True``, also return the attention weights of the
-            first head.
-          kwargs: Additional layer arguments.
-        """
-        super(MultiHeadAttention, self).__init__(**kwargs)
-        if num_units % num_heads != 0:
-            raise ValueError(
-                'Multi head attention requires that num_units is a'
-                ' multiple of %s' % num_heads)
-        self.num_heads = num_heads
-        self.num_units = num_units
-        self.linear_queries = tf.keras.layers.Dense(
-            num_units, name='linear_queries')
-        self.linear_keys = tf.keras.layers.Dense(num_units, name='linear_keys')
-        self.linear_values = tf.keras.layers.Dense(
-            num_units, name='linear_values')
-        self.linear_output = tf.keras.layers.Dense(
-            num_units, name='linear_output')
-        self.dropout = dropout
-        self.return_attention = return_attention
-
-    def call(self, inputs, memory=None, mask=None, cache=None, training=None):  # pylint: disable=arguments-differ
-        """Runs the layer.
-
-        Args:
-          inputs: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`.
-          memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`.
-            If ``None``, computes self-attention.
-          mask: A ``tf.Tensor`` applied to the dot product.
-          cache: A dictionary containing pre-projected keys and values.
-          training: Run in training mode.
-
-        Returns:
-          A tuple with the attention context, the updated cache and the attention
-          probabilities of the first head (if :obj:`return_attention` is ``True``).
-        """
-
-        def _compute_kv(x):
-            keys = self.linear_keys(x)
-            keys = split_heads(keys, self.num_heads)
-            values = self.linear_values(x)
-            values = split_heads(values, self.num_heads)
-            return keys, values
-
-        # Compute queries.
-        queries = self.linear_queries(inputs)
-        queries = split_heads(queries, self.num_heads)
-        queries *= (self.num_units // self.num_heads)**-0.5
-
-        # Compute keys and values.
-        if memory is None:
-            keys, values = _compute_kv(inputs)
-            if cache:
-                keys = tf.concat([cache[0], keys], axis=2)
-                values = tf.concat([cache[1], values], axis=2)
-        else:
-            if cache:
-                if not self.linear_keys.built:
-                    # Ensure that the variable names are not impacted by the tf.cond name
-                    # scope if the layers have not already been built.
-                    with tf.name_scope(self.linear_keys.name):
-                        self.linear_keys.build(memory.shape)
-                    with tf.name_scope(self.linear_values.name):
-                        self.linear_values.build(memory.shape)
-                keys, values = tf.cond(
-                    tf.equal(tf.shape(cache[0])[2], 0),
-                    true_fn=lambda: _compute_kv(memory),
-                    false_fn=lambda: cache)
-            else:
-                keys, values = _compute_kv(memory)
-
-        cache = (keys, values)
-
-        # Dot product attention.
-        dot = tf.matmul(queries, keys, transpose_b=True)
-        if mask is not None:
-            mask = tf.expand_dims(tf.cast(mask, tf.float32),
-                                  1)  # Broadcast on heads dimension.
-            dot = tf.cast(
-                tf.cast(dot, tf.float32) * mask
-                + ((1.0 - mask) * tf.float32.min), dot.dtype)  # yapf:disable
-        attn = tf.cast(tf.nn.softmax(tf.cast(dot, tf.float32)), dot.dtype)
-        drop_attn = tf.layers.dropout(attn, self.dropout, training=training)
-        heads = tf.matmul(drop_attn, values)
-
-        # Concatenate all heads output.
-        combined = combine_heads(heads)
-        outputs = self.linear_output(combined)
-        if self.return_attention:
-            return outputs, cache, attn
-        return outputs, cache
diff --git a/modelscope/models/audio/tts/models/utils.py b/modelscope/models/audio/tts/models/utils.py
deleted file mode 100755
index 03e1ef8c..00000000
--- a/modelscope/models/audio/tts/models/utils.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import glob
-import os
-
-import matplotlib
-import matplotlib.pylab as plt
-import torch
-from torch.nn.utils import weight_norm
-
-matplotlib.use('Agg')
-
-
-def plot_spectrogram(spectrogram):
-    fig, ax = plt.subplots(figsize=(10, 2))
-    im = ax.imshow(
-        spectrogram, aspect='auto', origin='lower', interpolation='none')
-    plt.colorbar(im, ax=ax)
-
-    fig.canvas.draw()
-    plt.close()
-
-    return fig
-
-
-def init_weights(m, mean=0.0, std=0.01):
-    classname = m.__class__.__name__
-    if classname.find('Conv') != -1:
-        m.weight.data.normal_(mean, std)
-
-
-def apply_weight_norm(m):
-    classname = m.__class__.__name__
-    if classname.find('Conv') != -1:
-        weight_norm(m)
-
-
-def get_padding(kernel_size, dilation=1):
-    return int((kernel_size * dilation - dilation) / 2)
-
-
-def load_checkpoint(filepath, device):
-    assert os.path.isfile(filepath)
-    print("Loading '{}'".format(filepath))
-    checkpoint_dict = torch.load(filepath, map_location=device)
-    print('Complete.')
-    return checkpoint_dict
-
-
-def save_checkpoint(filepath, obj):
-    print('Saving checkpoint to {}'.format(filepath))
-    torch.save(obj, filepath)
-    print('Complete.')
-
-
-def scan_checkpoint(cp_dir, prefix):
-    pattern = os.path.join(cp_dir, prefix + '????????')
-    cp_list = glob.glob(pattern)
-    if len(cp_list) == 0:
-        return None
-    return sorted(cp_list)[-1]
diff --git a/modelscope/models/audio/tts/models/utils/__init__.py b/modelscope/models/audio/tts/models/utils/__init__.py
new file mode 100644
index 00000000..e07f08ea
--- /dev/null
+++ b/modelscope/models/audio/tts/models/utils/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from .utils import *  # noqa F403
diff --git a/modelscope/models/audio/tts/models/utils/utils.py b/modelscope/models/audio/tts/models/utils/utils.py
new file mode 100755
index 00000000..17ac8aee
--- /dev/null
+++ b/modelscope/models/audio/tts/models/utils/utils.py
@@ -0,0 +1,188 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import glob
+import os
+import shutil
+
+import matplotlib
+import matplotlib.pylab as plt
+import torch
+
+matplotlib.use('Agg')
+
+
+class AttrDict(dict):
+    """Dictionary whose items are also accessible as attributes."""
+
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        # Use the dict itself as the attribute namespace, so ``d.key`` and
+        # ``d['key']`` share the same storage.
+        self.__dict__ = self
+
+
+def build_env(config, config_name, path):
+    """Copy the file ``config`` to ``path/config_name``.
+
+    Does nothing when ``config`` is already that exact path string;
+    otherwise ``path`` is created if missing before copying.
+    """
+    t_path = os.path.join(path, config_name)
+    if config != t_path:
+        os.makedirs(path, exist_ok=True)
+        shutil.copyfile(config, t_path)
+
+
+def plot_spectrogram(spectrogram):
+    """Draw ``spectrogram`` with a colorbar and return the figure."""
+    fig, ax = plt.subplots(figsize=(10, 2))
+    im = ax.imshow(
+        spectrogram, aspect='auto', origin='lower', interpolation='none')
+    fig.colorbar(im, ax=ax)
+
+    fig.canvas.draw()
+    # Close exactly this figure rather than pyplot's "current" one.
+    plt.close(fig)
+
+    return fig
+
+
+def plot_alignment(alignment, info=None):
+    """Draw ``alignment`` with a colorbar and return the figure.
+
+    The x axis is input timestep, the y axis output timestep. ``info``, when
+    given, is appended to the x-axis label after a tab character.
+    """
+    fig, ax = plt.subplots()
+    im = ax.imshow(
+        alignment, aspect='auto', origin='lower', interpolation='none')
+    fig.colorbar(im, ax=ax)
+    xlabel = 'Input timestep'
+    if info is not None:
+        xlabel += '\t' + info
+    # Label the axes object directly instead of pyplot's implicit
+    # "current axes".
+    ax.set_xlabel(xlabel)
+    ax.set_ylabel('Output timestep')
+    fig.canvas.draw()
+    plt.close(fig)
+
+    return fig
+
+
+def load_checkpoint(filepath, device):
+    """Load the checkpoint at ``filepath``, mapping storages to ``device``.
+
+    Raises ``AssertionError`` when ``filepath`` is not an existing file.
+    NOTE(review): ``torch.load`` unpickles its input; only load checkpoints
+    from trusted sources.
+    """
+    assert os.path.isfile(filepath), \
+        'checkpoint file not found: {}'.format(filepath)
+    return torch.load(filepath, map_location=device)
+
+
+def save_checkpoint(filepath, obj):
+    """Serialize ``obj`` to ``filepath`` with ``torch.save``."""
+    torch.save(obj, filepath)
+
+
+def scan_checkpoint(cp_dir, prefix):
+    """Return the last ``<prefix>????????.pkl`` path in ``cp_dir``, or None.
+
+    Candidates are compared as strings, so "last" is lexicographic order.
+    """
+    pattern = os.path.join(cp_dir, prefix + '????????.pkl')
+    cp_list = glob.glob(pattern)
+    if not cp_list:
+        return None
+    return max(cp_list)
+
+
+def get_padding(kernel_size, dilation=1):
+    """Return ``(kernel_size - 1) * dilation / 2`` truncated to an int."""
+    return int((kernel_size * dilation - dilation) / 2)
+
+
+class ValueWindow():
+    """Keeps the most recent ``window_size`` appended values."""
+
+    def __init__(self, window_size=100):
+        self._window_size = window_size
+        self._values = []
+
+    def append(self, x):
+        """Add ``x``, dropping the oldest values beyond the window size."""
+        # Slice *after* appending. The earlier form
+        # ``values[-(size - 1):] + [x]`` never trimmed for size == 1,
+        # because ``values[-0:]`` is the whole list.
+        keep = max(1, self._window_size)
+        self._values = (self._values + [x])[-keep:]
+
+    @property
+    def sum(self):
+        """Sum of the values currently in the window."""
+        return sum(self._values)
+
+    @property
+    def count(self):
+        """Number of values currently in the window."""
+        return len(self._values)
+
+    @property
+    def average(self):
+        """Mean of the window, or 0 when it is empty."""
+        return self.sum / max(1, self.count)
+
+    def reset(self):
+        """Discard all stored values."""
+        self._values = []
+
+
+def get_model_size(model):
+    """Return the size of the trainable parameters in MiB.
+
+    Counts 4 bytes per element (presumably float32 -- TODO confirm).
+    """
+    param_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    return param_num * 4 / 1024 / 1024
+
+
+def get_grad_norm(model):
+    """Return the global L2 norm of the gradients of ``model``.
+
+    Only parameters that require grad and currently have a gradient count.
+    """
+    total_sq = 0
+    for p in model.parameters():
+        if p.grad is None or not p.requires_grad:
+            continue
+        total_sq += p.grad.detach().data.norm(2).item()**2
+    return total_sq**0.5
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    """Fill ``m.weight`` from a normal(mean, std) distribution.
+
+    Only applied when the class name of ``m`` contains 'Conv'.
+    """
+    if 'Conv' in m.__class__.__name__:
+        m.weight.data.normal_(mean, std)
+
+
+def get_mask_from_lengths(lengths, max_len=None):
+    """Return a boolean mask of shape ``(len(lengths), max_len)``.
+
+    Entry ``[b, t]`` is True when ``t >= lengths[b]``. ``max_len`` defaults to
+    the largest value in ``lengths``.
+    """
+    batch_size = lengths.shape[0]
+    if max_len is None:
+        max_len = torch.max(lengths).item()
+
+    # Build the position indices directly on the target device.
+    ids = torch.arange(0, max_len, device=lengths.device)
+    ids = ids.unsqueeze(0).expand(batch_size, -1)
+    mask = ids >= lengths.unsqueeze(1).expand(-1, max_len)
+
+    return mask
diff --git a/modelscope/models/audio/tts/models/vocoder_models.py b/modelscope/models/audio/tts/models/vocoder_models.py
deleted file mode 100755
index c46a9204..00000000
--- a/modelscope/models/audio/tts/models/vocoder_models.py
+++ /dev/null
@@ -1,516 +0,0 @@
-from distutils.version import LooseVersion
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
-from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
-
-from .utils import get_padding, init_weights
-
-is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7')
-
-
-def stft(x, fft_size, hop_size, win_length, window):
-    """Perform STFT and convert to magnitude spectrogram.
-
-    Args:
-        x (Tensor): Input signal tensor (B, T).
-        fft_size (int): FFT size.
-        hop_size (int): Hop size.
-        win_length (int): Window length.
-        window (str): Window function type.
-
-    Returns:
-        Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
-
-    """
-    if is_pytorch_17plus:
-        x_stft = torch.stft(
-            x, fft_size, hop_size, win_length, window, return_complex=False)
-    else:
-        x_stft = torch.stft(x, fft_size, hop_size, win_length, window)
-    real = x_stft[..., 0]
-    imag = x_stft[..., 1]
-
-    # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
-    return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1)
-
-
-LRELU_SLOPE = 0.1
-
-
-def get_padding_casual(kernel_size, dilation=1):
-    return int(kernel_size * dilation - dilation)
-
-
-class Conv1dCasual(torch.nn.Module):
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=1,
-                 bias=True,
-                 padding_mode='zeros'):
-        super(Conv1dCasual, self).__init__()
-        self.pad = padding
-        self.conv1d = weight_norm(
-            Conv1d(
-                in_channels,
-                out_channels,
-                kernel_size,
-                stride,
-                padding=0,
-                dilation=dilation,
-                groups=groups,
-                bias=bias,
-                padding_mode=padding_mode))
-        self.conv1d.apply(init_weights)
-
-    def forward(self, x):  # bdt
-        # described starting from the last dimension and moving forward.
-        x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant')
-        x = self.conv1d(x)
-        return x
-
-    def remove_weight_norm(self):
-        remove_weight_norm(self.conv1d)
-
-
-class ConvTranspose1dCausal(torch.nn.Module):
-    """CausalConvTranspose1d module with customized initialization."""
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride,
-                 padding=0):
-        """Initialize CausalConvTranspose1d module."""
-        super(ConvTranspose1dCausal, self).__init__()
-        self.deconv = weight_norm(
-            ConvTranspose1d(in_channels, out_channels, kernel_size, stride))
-        self.stride = stride
-        self.deconv.apply(init_weights)
-        self.pad = kernel_size - stride
-
-    def forward(self, x):
-        """Calculate forward propagation.
-        Args:
-            x (Tensor): Input tensor (B, in_channels, T_in).
-        Returns:
-            Tensor: Output tensor (B, out_channels, T_out).
-        """
-        # x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), "constant")
-        return self.deconv(x)[:, :, :-self.pad]
-
-    def remove_weight_norm(self):
-        remove_weight_norm(self.deconv)
-
-
-class ResBlock1(torch.nn.Module):
-
-    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
-        super(ResBlock1, self).__init__()
-        self.h = h
-        self.convs1 = nn.ModuleList([
-            Conv1dCasual(
-                channels,
-                channels,
-                kernel_size,
-                1,
-                dilation=dilation[i],
-                padding=get_padding_casual(kernel_size, dilation[i]))
-            for i in range(len(dilation))
-        ])
-
-        self.convs2 = nn.ModuleList([
-            Conv1dCasual(
-                channels,
-                channels,
-                kernel_size,
-                1,
-                dilation=1,
-                padding=get_padding_casual(kernel_size, 1))
-            for i in range(len(dilation))
-        ])
-
-    def forward(self, x):
-        for c1, c2 in zip(self.convs1, self.convs2):
-            xt = F.leaky_relu(x, LRELU_SLOPE)
-            xt = c1(xt)
-            xt = F.leaky_relu(xt, LRELU_SLOPE)
-            xt = c2(xt)
-            x = xt + x
-        return x
-
-    def remove_weight_norm(self):
-        for layer in self.convs1:
-            layer.remove_weight_norm()
-        for layer in self.convs2:
-            layer.remove_weight_norm()
-
-
-class Generator(torch.nn.Module):
-
-    def __init__(self, h):
-        super(Generator, self).__init__()
-        self.h = h
-        self.num_kernels = len(h.resblock_kernel_sizes)
-        self.num_upsamples = len(h.upsample_rates)
-        print('num_kernels={}, num_upsamples={}'.format(
-            self.num_kernels, self.num_upsamples))
-        self.conv_pre = Conv1dCasual(
-            80, h.upsample_initial_channel, 7, 1, padding=7 - 1)
-        resblock = ResBlock1 if h.resblock == '1' else ResBlock2
-
-        self.ups = nn.ModuleList()
-        self.repeat_ups = nn.ModuleList()
-        for i, (u, k) in enumerate(
-                zip(h.upsample_rates, h.upsample_kernel_sizes)):
-            upsample = nn.Sequential(
-                nn.Upsample(mode='nearest', scale_factor=u),
-                nn.LeakyReLU(LRELU_SLOPE),
-                Conv1dCasual(
-                    h.upsample_initial_channel // (2**i),
-                    h.upsample_initial_channel // (2**(i + 1)),
-                    kernel_size=7,
-                    stride=1,
-                    padding=7 - 1))
-            self.repeat_ups.append(upsample)
-            self.ups.append(
-                ConvTranspose1dCausal(
-                    h.upsample_initial_channel // (2**i),
-                    h.upsample_initial_channel // (2**(i + 1)),
-                    k,
-                    u,
-                    padding=(k - u) // 2))
-
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = h.upsample_initial_channel // (2**(i + 1))
-            for j, (k, d) in enumerate(
-                    zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
-                self.resblocks.append(resblock(h, ch, k, d))
-
-        self.conv_post = Conv1dCasual(ch, 1, 7, 1, padding=7 - 1)
-
-    def forward(self, x):
-        x = self.conv_pre(x)
-        for i in range(self.num_upsamples):
-            x = torch.sin(x) + x
-            # transconv
-            x1 = F.leaky_relu(x, LRELU_SLOPE)
-            x1 = self.ups[i](x1)
-            # repeat
-            x2 = self.repeat_ups[i](x)
-            x = x1 + x2
-            xs = None
-            for j in range(self.num_kernels):
-                if xs is None:
-                    xs = self.resblocks[i * self.num_kernels + j](x)
-                else:
-                    xs += self.resblocks[i * self.num_kernels + j](x)
-            x = xs / self.num_kernels
-        x = F.leaky_relu(x)
-        x = self.conv_post(x)
-        x = torch.tanh(x)
-        return x
-
-    def remove_weight_norm(self):
-        print('Removing weight norm...')
-        for layer in self.ups:
-            layer.remove_weight_norm()
-        for layer in self.repeat_ups:
-            layer[-1].remove_weight_norm()
-        for layer in self.resblocks:
-            layer.remove_weight_norm()
-        self.conv_pre.remove_weight_norm()
-        self.conv_post.remove_weight_norm()
-
-
-class DiscriminatorP(torch.nn.Module):
-
-    def __init__(self,
-                 period,
-                 kernel_size=5,
-                 stride=3,
-                 use_spectral_norm=False):
-        super(DiscriminatorP, self).__init__()
-        self.period = period
-        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
-        self.convs = nn.ModuleList([
-            norm_f(
-                Conv2d(
-                    1,
-                    32, (kernel_size, 1), (stride, 1),
-                    padding=(get_padding(5, 1), 0))),
-            norm_f(
-                Conv2d(
-                    32,
-                    128, (kernel_size, 1), (stride, 1),
-                    padding=(get_padding(5, 1), 0))),
-            norm_f(
-                Conv2d(
-                    128,
-                    512, (kernel_size, 1), (stride, 1),
-                    padding=(get_padding(5, 1), 0))),
-            norm_f(
-                Conv2d(
-                    512,
-                    1024, (kernel_size, 1), (stride, 1),
-                    padding=(get_padding(5, 1), 0))),
-            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
-        ])
-        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
-
-    def forward(self, x):
-        fmap = []
-
-        # 1d to 2d
-        b, c, t = x.shape
-        if t % self.period != 0:  # pad first
-            n_pad = self.period - (t % self.period)
-            x = F.pad(x, (0, n_pad), 'reflect')
-            t = t + n_pad
-        x = x.view(b, c, t // self.period, self.period)
-
-        for layer in self.convs:
-            x = layer(x)
-            x = F.leaky_relu(x, LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-
-        return x, fmap
-
-
-class MultiPeriodDiscriminator(torch.nn.Module):
-
-    def __init__(self):
-        super(MultiPeriodDiscriminator, self).__init__()
-        self.discriminators = nn.ModuleList([
-            DiscriminatorP(2),
-            DiscriminatorP(3),
-            DiscriminatorP(5),
-            DiscriminatorP(7),
-            DiscriminatorP(11),
-        ])
-
-    def forward(self, y, y_hat):
-        y_d_rs = []
-        y_d_gs = []
-        fmap_rs = []
-        fmap_gs = []
-        for i, d in enumerate(self.discriminators):
-            y_d_r, fmap_r = d(y)
-            y_d_g, fmap_g = d(y_hat)
-            y_d_rs.append(y_d_r)
-            fmap_rs.append(fmap_r)
-            y_d_gs.append(y_d_g)
-            fmap_gs.append(fmap_g)
-
-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
-class DiscriminatorS(torch.nn.Module):
-
-    def __init__(self, use_spectral_norm=False):
-        super(DiscriminatorS, self).__init__()
-        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
-        self.convs = nn.ModuleList([
-            norm_f(Conv1d(1, 128, 15, 1, padding=7)),
-            norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
-            norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
-            norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
-            norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
-            norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
-            norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
-        ])
-        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
-
-    def forward(self, x):
-        fmap = []
-        for layer in self.convs:
-            x = layer(x)
-            x = F.leaky_relu(x, LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-
-        return x, fmap
-
-
-class MultiScaleDiscriminator(torch.nn.Module):
-
-    def __init__(self):
-        super(MultiScaleDiscriminator, self).__init__()
-        self.discriminators = nn.ModuleList([
-            DiscriminatorS(use_spectral_norm=True),
-            DiscriminatorS(),
-            DiscriminatorS(),
-        ])
-        from pytorch_wavelets import DWT1DForward
-        self.meanpools = nn.ModuleList(
-            [DWT1DForward(wave='db3', J=1),
-             DWT1DForward(wave='db3', J=1)])
-        self.convs = nn.ModuleList([
-            weight_norm(Conv1d(2, 1, 15, 1, padding=7)),
-            weight_norm(Conv1d(2, 1, 15, 1, padding=7))
-        ])
-
-    def forward(self, y, y_hat):
-        y_d_rs = []
-        y_d_gs = []
-        fmap_rs = []
-        fmap_gs = []
-        for i, d in enumerate(self.discriminators):
-            if i != 0:
-                yl, yh = self.meanpools[i - 1](y)
-                y = torch.cat([yl, yh[0]], dim=1)
-                y = self.convs[i - 1](y)
-                y = F.leaky_relu(y, LRELU_SLOPE)
-
-                yl_hat, yh_hat = self.meanpools[i - 1](y_hat)
-                y_hat = torch.cat([yl_hat, yh_hat[0]], dim=1)
-                y_hat = self.convs[i - 1](y_hat)
-                y_hat = F.leaky_relu(y_hat, LRELU_SLOPE)
-
-            y_d_r, fmap_r = d(y)
-            y_d_g, fmap_g = d(y_hat)
-            y_d_rs.append(y_d_r)
-            fmap_rs.append(fmap_r)
-            y_d_gs.append(y_d_g)
-            fmap_gs.append(fmap_g)
-
-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
-class DiscriminatorSTFT(torch.nn.Module):
-
-    def __init__(self,
-                 kernel_size=11,
-                 stride=2,
-                 use_spectral_norm=False,
-                 fft_size=1024,
-                 shift_size=120,
-                 win_length=600,
-                 window='hann_window'):
-        super(DiscriminatorSTFT, self).__init__()
-        self.fft_size = fft_size
-        self.shift_size = shift_size
-        self.win_length = win_length
-        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
-        self.convs = nn.ModuleList([
-            norm_f(
-                Conv2d(
-                    fft_size // 2 + 1,
-                    32, (15, 1), (1, 1),
-                    padding=(get_padding(15, 1), 0))),
-            norm_f(
-                Conv2d(
-                    32,
-                    32, (kernel_size, 1), (stride, 1),
-                    padding=(get_padding(9, 1), 0))),
-            norm_f(
-                Conv2d(
-                    32,
-                    32, (kernel_size, 1), (stride, 1),
-                    padding=(get_padding(9, 1), 0))),
-            norm_f(
-                Conv2d(
-                    32,
-                    32, (kernel_size, 1), (stride, 1),
-                    padding=(get_padding(9, 1), 0))),
-            norm_f(Conv2d(32, 32, (5, 1), (1, 1), padding=(2, 0))),
-        ])
-        self.conv_post = norm_f(Conv2d(32, 1, (3, 1), (1, 1), padding=(1, 0)))
-        self.register_buffer('window', getattr(torch, window)(win_length))
-
-    def forward(self, wav):
-        wav = torch.squeeze(wav, 1)
-        x_mag = stft(wav, self.fft_size, self.shift_size, self.win_length,
-                     self.window)
-        x = torch.transpose(x_mag, 2, 1).unsqueeze(-1)
-        fmap = []
-        for layer in self.convs:
-            x = layer(x)
-            x = F.leaky_relu(x, LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = x.squeeze(-1)
-
-        return x, fmap
-
-
-class MultiSTFTDiscriminator(torch.nn.Module):
-
-    def __init__(
-        self,
-        fft_sizes=[1024, 2048, 512],
-        hop_sizes=[120, 240, 50],
-        win_lengths=[600, 1200, 240],
-        window='hann_window',
-    ):
-        super(MultiSTFTDiscriminator, self).__init__()
-        self.discriminators = nn.ModuleList()
-        for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
-            self.discriminators += [
-                DiscriminatorSTFT(fft_size=fs, shift_size=ss, win_length=wl)
-            ]
-
-    def forward(self, y, y_hat):
-        y_d_rs = []
-        y_d_gs = []
-        fmap_rs = []
-        fmap_gs = []
-        for i, d in enumerate(self.discriminators):
-            y_d_r, fmap_r = d(y)
-            y_d_g, fmap_g = d(y_hat)
-            y_d_rs.append(y_d_r)
-            fmap_rs.append(fmap_r)
-            y_d_gs.append(y_d_g)
-            fmap_gs.append(fmap_g)
-
-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
-def feature_loss(fmap_r, fmap_g):
-    loss = 0
-    for dr, dg in zip(fmap_r, fmap_g):
-        for rl, gl in zip(dr, dg):
-            loss += torch.mean(torch.abs(rl - gl))
-
-    return loss * 2
-
-
-def discriminator_loss(disc_real_outputs, disc_generated_outputs):
-    loss = 0
-    r_losses = []
-    g_losses = []
-    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
-        r_loss = torch.mean((1 - dr)**2)
-        g_loss = torch.mean(dg**2)
-        loss += (r_loss + g_loss)
-        r_losses.append(r_loss.item())
-        g_losses.append(g_loss.item())
-
-    return loss, r_losses, g_losses
-
-
-def generator_loss(disc_outputs):
-    loss = 0
-    gen_losses = []
-    for dg in disc_outputs:
-        temp_loss = torch.mean((1 - dg)**2)
-        gen_losses.append(temp_loss)
-        loss += temp_loss
-
-    return loss, gen_losses
diff --git a/modelscope/models/audio/tts/sambert_hifi.py b/modelscope/models/audio/tts/sambert_hifi.py
index 79f8068e..a9b55795 100644
--- a/modelscope/models/audio/tts/sambert_hifi.py
+++ b/modelscope/models/audio/tts/sambert_hifi.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
 import os
@@ -11,13 +13,11 @@ from modelscope.models.base import Model
 from modelscope.models.builder import MODELS
 from modelscope.utils.audio.tts_exceptions import (
     TtsFrontendInitializeFailedException,
-    TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationExcetion,
+    TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationException,
     TtsVoiceNotExistsException)
 from modelscope.utils.constant import Tasks
 from .voice import Voice
 
-import tensorflow as tf  # isort:skip
-
 __all__ = ['SambertHifigan']
 
 
@@ -28,14 +28,15 @@ class SambertHifigan(Model):
     def __init__(self, model_dir, *args, **kwargs):
         super().__init__(model_dir, *args, **kwargs)
         if 'am' not in kwargs:
-            raise TtsModelConfigurationExcetion(
-                'configuration model field missing am!')
+            raise TtsModelConfigurationException(
+                'modelscope error: configuration model field missing am!')
         if 'vocoder' not in kwargs:
-            raise TtsModelConfigurationExcetion(
-                'configuration model field missing vocoder!')
+            raise TtsModelConfigurationException(
+                'modelscope error: configuration model field missing vocoder!')
         if 'lang_type' not in kwargs:
-            raise TtsModelConfigurationExcetion(
-                'configuration model field missing lang_type!')
+            raise TtsModelConfigurationException(
+                'modelscope error: configuration model field missing lang_type!'
+            )
         am_cfg = kwargs['am']
         voc_cfg = kwargs['vocoder']
         # initialize frontend
@@ -47,10 +48,12 @@ class SambertHifigan(Model):
             zip_ref.extractall(model_dir)
         if not frontend.initialize(self.__res_path):
             raise TtsFrontendInitializeFailedException(
-                'resource invalid: {}'.format(self.__res_path))
+                'modelscope error: resource invalid: {}'.format(
+                    self.__res_path))
         if not frontend.set_lang_type(kwargs['lang_type']):
             raise TtsFrontendLanguageTypeInvalidException(
-                'language type invalid: {}'.format(kwargs['lang_type']))
+                'modelscope error: language type invalid: {}'.format(
+                    kwargs['lang_type']))
         self.__frontend = frontend
         zip_file = os.path.join(model_dir, 'voices.zip')
         self.__voice_path = os.path.join(model_dir, 'voices')
@@ -60,7 +63,8 @@ class SambertHifigan(Model):
         with open(voice_cfg_path, 'r') as f:
             voice_cfg = json.load(f)
         if 'voices' not in voice_cfg:
-            raise TtsModelConfigurationExcetion('voices invalid')
+            raise TtsModelConfigurationException(
+                'modelscope error: voices invalid')
         self.__voice = {}
         for name in voice_cfg['voices']:
             voice_path = os.path.join(self.__voice_path, name)
@@ -70,11 +74,13 @@ class SambertHifigan(Model):
         if voice_cfg['voices']:
             self.__default_voice_name = voice_cfg['voices'][0]
         else:
-            raise TtsVoiceNotExistsException('voices is empty in voices.json')
+            raise TtsVoiceNotExistsException(
+                'modelscope error: voices is empty in voices.json')
 
     def __synthesis_one_sentences(self, voice_name, text):
         if voice_name not in self.__voice:
-            raise TtsVoiceNotExistsException(f'Voice {voice_name} not exists')
+            raise TtsVoiceNotExistsException(
+                f'modelscope error: Voice {voice_name} not exists')
         return self.__voice[voice_name].forward(text)
 
     def forward(self, text: str, voice_name: str = None):
diff --git a/modelscope/models/audio/tts/text/cleaners.py b/modelscope/models/audio/tts/text/cleaners.py
deleted file mode 100755
index 19d838d1..00000000
--- a/modelscope/models/audio/tts/text/cleaners.py
+++ /dev/null
@@ -1,89 +0,0 @@
-'''
-Cleaners are transformations that run over the input text at both training and eval time.
-
-Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
-hyperparameter. Some cleaners are English-specific. You'll typically want to use:
-  1. "english_cleaners" for English text
-  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
-     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
-  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
-     the symbols in symbols.py to match your data).
-'''
-
-import re
-
-from unidecode import unidecode
-
-from .numbers import normalize_numbers
-
-# Regular expression matching whitespace:
-_whitespace_re = re.compile(r'\s+')
-
-# List of (regular expression, replacement) pairs for abbreviations:
-_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
-                  for x in [
-                      ('mrs', 'misess'),
-                      ('mr', 'mister'),
-                      ('dr', 'doctor'),
-                      ('st', 'saint'),
-                      ('co', 'company'),
-                      ('jr', 'junior'),
-                      ('maj', 'major'),
-                      ('gen', 'general'),
-                      ('drs', 'doctors'),
-                      ('rev', 'reverend'),
-                      ('lt', 'lieutenant'),
-                      ('hon', 'honorable'),
-                      ('sgt', 'sergeant'),
-                      ('capt', 'captain'),
-                      ('esq', 'esquire'),
-                      ('ltd', 'limited'),
-                      ('col', 'colonel'),
-                      ('ft', 'fort'), ]]  # yapf:disable
-
-
-def expand_abbreviations(text):
-    for regex, replacement in _abbreviations:
-        text = re.sub(regex, replacement, text)
-    return text
-
-
-def expand_numbers(text):
-    return normalize_numbers(text)
-
-
-def lowercase(text):
-    return text.lower()
-
-
-def collapse_whitespace(text):
-    return re.sub(_whitespace_re, ' ', text)
-
-
-def convert_to_ascii(text):
-    return unidecode(text)
-
-
-def basic_cleaners(text):
-    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
-    text = lowercase(text)
-    text = collapse_whitespace(text)
-    return text
-
-
-def transliteration_cleaners(text):
-    '''Pipeline for non-English text that transliterates to ASCII.'''
-    text = convert_to_ascii(text)
-    text = lowercase(text)
-    text = collapse_whitespace(text)
-    return text
-
-
-def english_cleaners(text):
-    '''Pipeline for English text, including number and abbreviation expansion.'''
-    text = convert_to_ascii(text)
-    text = lowercase(text)
-    text = expand_numbers(text)
-    text = expand_abbreviations(text)
-    text = collapse_whitespace(text)
-    return text
diff --git a/modelscope/models/audio/tts/text/cmudict.py b/modelscope/models/audio/tts/text/cmudict.py
deleted file mode 100755
index b4da4be9..00000000
--- a/modelscope/models/audio/tts/text/cmudict.py
+++ /dev/null
@@ -1,64 +0,0 @@
-import re
-
-valid_symbols = [
-    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
-    'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
-    'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
-    'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
-    'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
-    'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
-    'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
-    'Y', 'Z', 'ZH'
-]
-
-_valid_symbol_set = set(valid_symbols)
-
-
-class CMUDict:
-    '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''
-
-    def __init__(self, file_or_path, keep_ambiguous=True):
-        if isinstance(file_or_path, str):
-            with open(file_or_path, encoding='latin-1') as f:
-                entries = _parse_cmudict(f)
-        else:
-            entries = _parse_cmudict(file_or_path)
-        if not keep_ambiguous:
-            entries = {
-                word: pron
-                for word, pron in entries.items() if len(pron) == 1
-            }
-        self._entries = entries
-
-    def __len__(self):
-        return len(self._entries)
-
-    def lookup(self, word):
-        '''Returns list of ARPAbet pronunciations of the given word.'''
-        return self._entries.get(word.upper())
-
-
-_alt_re = re.compile(r'\([0-9]+\)')
-
-
-def _parse_cmudict(file):
-    cmudict = {}
-    for line in file:
-        if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
-            parts = line.split('  ')
-            word = re.sub(_alt_re, '', parts[0])
-            pronunciation = _get_pronunciation(parts[1])
-            if pronunciation:
-                if word in cmudict:
-                    cmudict[word].append(pronunciation)
-                else:
-                    cmudict[word] = [pronunciation]
-    return cmudict
-
-
-def _get_pronunciation(s):
-    parts = s.strip().split(' ')
-    for part in parts:
-        if part not in _valid_symbol_set:
-            return None
-    return ' '.join(parts)
diff --git a/modelscope/models/audio/tts/text/symbols.py b/modelscope/models/audio/tts/text/symbols.py
deleted file mode 100644
index 63975abb..00000000
--- a/modelscope/models/audio/tts/text/symbols.py
+++ /dev/null
@@ -1,105 +0,0 @@
-'''
-Defines the set of symbols used in text input to the model.
-
-The default is a set of ASCII characters that works well for English or text that has been run
-through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
-'''
-import codecs
-import os
-
-_pad = '_'
-_eos = '~'
-_mask = '@[MASK]'
-
-
-def load_symbols(dict_path, has_mask=True):
-    _characters = ''
-    _ch_symbols = []
-    sy_dict_name = 'sy_dict.txt'
-    sy_dict_path = os.path.join(dict_path, sy_dict_name)
-    f = codecs.open(sy_dict_path, 'r')
-    for line in f:
-        line = line.strip('\r\n')
-        _ch_symbols.append(line)
-
-    _arpabet = ['@' + s for s in _ch_symbols]
-
-    # Export all symbols:
-    sy = list(_characters) + _arpabet + [_pad, _eos]
-    if has_mask:
-        sy.append(_mask)
-
-    _characters = ''
-
-    _ch_tones = []
-    tone_dict_name = 'tone_dict.txt'
-    tone_dict_path = os.path.join(dict_path, tone_dict_name)
-    f = codecs.open(tone_dict_path, 'r')
-    for line in f:
-        line = line.strip('\r\n')
-        _ch_tones.append(line)
-
-    # Export all tones:
-    tone = list(_characters) + _ch_tones + [_pad, _eos]
-    if has_mask:
-        tone.append(_mask)
-
-    _characters = ''
-
-    _ch_syllable_flags = []
-    syllable_flag_name = 'syllable_flag_dict.txt'
-    syllable_flag_path = os.path.join(dict_path, syllable_flag_name)
-    f = codecs.open(syllable_flag_path, 'r')
-    for line in f:
-        line = line.strip('\r\n')
-        _ch_syllable_flags.append(line)
-
-    # Export all syllable_flags:
-    syllable_flag = list(_characters) + _ch_syllable_flags + [_pad, _eos]
-    if has_mask:
-        syllable_flag.append(_mask)
-
-    _characters = ''
-
-    _ch_word_segments = []
-    word_segment_name = 'word_segment_dict.txt'
-    word_segment_path = os.path.join(dict_path, word_segment_name)
-    f = codecs.open(word_segment_path, 'r')
-    for line in f:
-        line = line.strip('\r\n')
-        _ch_word_segments.append(line)
-
-    # Export all syllable_flags:
-    word_segment = list(_characters) + _ch_word_segments + [_pad, _eos]
-    if has_mask:
-        word_segment.append(_mask)
-
-    _characters = ''
-
-    _ch_emo_types = []
-    emo_category_name = 'emo_category_dict.txt'
-    emo_category_path = os.path.join(dict_path, emo_category_name)
-    f = codecs.open(emo_category_path, 'r')
-    for line in f:
-        line = line.strip('\r\n')
-        _ch_emo_types.append(line)
-
-    emo_category = list(_characters) + _ch_emo_types + [_pad, _eos]
-    if has_mask:
-        emo_category.append(_mask)
-
-    _characters = ''
-
-    _ch_speakers = []
-    speaker_name = 'speaker_dict.txt'
-    speaker_path = os.path.join(dict_path, speaker_name)
-    f = codecs.open(speaker_path, 'r')
-    for line in f:
-        line = line.strip('\r\n')
-        _ch_speakers.append(line)
-
-    # Export all syllable_flags:
-    speaker = list(_characters) + _ch_speakers + [_pad, _eos]
-    if has_mask:
-        speaker.append(_mask)
-    return sy, tone, syllable_flag, word_segment, emo_category, speaker
diff --git a/modelscope/models/audio/tts/text/symbols_dict.py b/modelscope/models/audio/tts/text/symbols_dict.py
deleted file mode 100644
index e8f7ed19..00000000
--- a/modelscope/models/audio/tts/text/symbols_dict.py
+++ /dev/null
@@ -1,200 +0,0 @@
-import re
-import sys
-
-from .cleaners import (basic_cleaners, english_cleaners,
-                       transliteration_cleaners)
-
-
-class SymbolsDict:
-
-    def __init__(self, sy, tone, syllable_flag, word_segment, emo_category,
-                 speaker, inputs_dim, lfeat_type_list):
-        self._inputs_dim = inputs_dim
-        self._lfeat_type_list = lfeat_type_list
-        self._sy_to_id = {s: i for i, s in enumerate(sy)}
-        self._id_to_sy = {i: s for i, s in enumerate(sy)}
-        self._tone_to_id = {s: i for i, s in enumerate(tone)}
-        self._id_to_tone = {i: s for i, s in enumerate(tone)}
-        self._syllable_flag_to_id = {s: i for i, s in enumerate(syllable_flag)}
-        self._id_to_syllable_flag = {i: s for i, s in enumerate(syllable_flag)}
-        self._word_segment_to_id = {s: i for i, s in enumerate(word_segment)}
-        self._id_to_word_segment = {i: s for i, s in enumerate(word_segment)}
-        self._emo_category_to_id = {s: i for i, s in enumerate(emo_category)}
-        self._id_to_emo_category = {i: s for i, s in enumerate(emo_category)}
-        self._speaker_to_id = {s: i for i, s in enumerate(speaker)}
-        self._id_to_speaker = {i: s for i, s in enumerate(speaker)}
-        print('_sy_to_id: ')
-        print(self._sy_to_id)
-        print('_tone_to_id: ')
-        print(self._tone_to_id)
-        print('_syllable_flag_to_id: ')
-        print(self._syllable_flag_to_id)
-        print('_word_segment_to_id: ')
-        print(self._word_segment_to_id)
-        print('_emo_category_to_id: ')
-        print(self._emo_category_to_id)
-        print('_speaker_to_id: ')
-        print(self._speaker_to_id)
-        self._curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
-        self._cleaners = {
-            basic_cleaners.__name__: basic_cleaners,
-            transliteration_cleaners.__name__: transliteration_cleaners,
-            english_cleaners.__name__: english_cleaners
-        }
-
-    def _clean_text(self, text, cleaner_names):
-        for name in cleaner_names:
-            cleaner = self._cleaners.get(name)
-            if not cleaner:
-                raise Exception('Unknown cleaner: %s' % name)
-            text = cleaner(text)
-        return text
-
-    def _sy_to_sequence(self, sy):
-        return [self._sy_to_id[s] for s in sy if self._should_keep_sy(s)]
-
-    def _arpabet_to_sequence(self, text):
-        return self._sy_to_sequence(['@' + s for s in text.split()])
-
-    def _should_keep_sy(self, s):
-        return s in self._sy_to_id and s != '_' and s != '~'
-
-    def symbol_to_sequence(self, this_lfeat_symbol, lfeat_type, cleaner_names):
-        sequence = []
-        if lfeat_type == 'sy':
-            this_lfeat_symbol = this_lfeat_symbol.strip().split(' ')
-            this_lfeat_symbol_format = ''
-            index = 0
-            while index < len(this_lfeat_symbol):
-                this_lfeat_symbol_format = this_lfeat_symbol_format + '{' + this_lfeat_symbol[
-                    index] + '}' + ' '
-                index = index + 1
-            sequence = self.text_to_sequence(this_lfeat_symbol_format,
-                                             cleaner_names)
-        elif lfeat_type == 'tone':
-            sequence = self.tone_to_sequence(this_lfeat_symbol)
-        elif lfeat_type == 'syllable_flag':
-            sequence = self.syllable_flag_to_sequence(this_lfeat_symbol)
-        elif lfeat_type == 'word_segment':
-            sequence = self.word_segment_to_sequence(this_lfeat_symbol)
-        elif lfeat_type == 'emo_category':
-            sequence = self.emo_category_to_sequence(this_lfeat_symbol)
-        elif lfeat_type == 'speaker':
-            sequence = self.speaker_to_sequence(this_lfeat_symbol)
-        else:
-            raise Exception('Unknown lfeat type: %s' % lfeat_type)
-
-        return sequence
-
-    def text_to_sequence(self, text, cleaner_names):
-        '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-
-          The text can optionally have ARPAbet sequences enclosed in curly braces embedded
-          in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
-
-          Args:
-            text: string to convert to a sequence
-            cleaner_names: names of the cleaner functions to run the text through
-
-          Returns:
-            List of integers corresponding to the symbols in the text
-        '''
-        sequence = []
-
-        # Check for curly braces and treat their contents as ARPAbet:
-        while len(text):
-            m = self._curly_re.match(text)
-            if not m:
-                sequence += self._sy_to_sequence(
-                    self._clean_text(text, cleaner_names))
-                break
-            sequence += self._sy_to_sequence(
-                self._clean_text(m.group(1), cleaner_names))
-            sequence += self._arpabet_to_sequence(m.group(2))
-            text = m.group(3)
-
-        # Append EOS token
-        sequence.append(self._sy_to_id['~'])
-        return sequence
-
-    def tone_to_sequence(self, tone):
-        tones = tone.strip().split(' ')
-        sequence = []
-        for this_tone in tones:
-            sequence.append(self._tone_to_id[this_tone])
-        sequence.append(self._tone_to_id['~'])
-        return sequence
-
-    def syllable_flag_to_sequence(self, syllable_flag):
-        syllable_flags = syllable_flag.strip().split(' ')
-        sequence = []
-        for this_syllable_flag in syllable_flags:
-            sequence.append(self._syllable_flag_to_id[this_syllable_flag])
-        sequence.append(self._syllable_flag_to_id['~'])
-        return sequence
-
-    def word_segment_to_sequence(self, word_segment):
-        word_segments = word_segment.strip().split(' ')
-        sequence = []
-        for this_word_segment in word_segments:
-            sequence.append(self._word_segment_to_id[this_word_segment])
-        sequence.append(self._word_segment_to_id['~'])
-        return sequence
-
-    def emo_category_to_sequence(self, emo_type):
-        emo_categories = emo_type.strip().split(' ')
-        sequence = []
-        for this_category in emo_categories:
-            sequence.append(self._emo_category_to_id[this_category])
-        sequence.append(self._emo_category_to_id['~'])
-        return sequence
-
-    def speaker_to_sequence(self, speaker):
-        speakers = speaker.strip().split(' ')
-        sequence = []
-        for this_speaker in speakers:
-            sequence.append(self._speaker_to_id[this_speaker])
-        sequence.append(self._speaker_to_id['~'])
-        return sequence
-
-    def sequence_to_symbol(self, sequence):
-        result = ''
-        pre_lfeat_dim = 0
-        for lfeat_type in self._lfeat_type_list:
-            current_one_hot_sequence = sequence[:, pre_lfeat_dim:pre_lfeat_dim
-                                                + self._inputs_dim[lfeat_type]]
-            current_sequence = current_one_hot_sequence.argmax(1)
-            length = current_sequence.shape[0]
-
-            index = 0
-            while index < length:
-                this_sequence = current_sequence[index]
-                s = ''
-                if lfeat_type == 'sy':
-                    s = self._id_to_sy[this_sequence]
-                    if len(s) > 1 and s[0] == '@':
-                        s = s[1:]
-                elif lfeat_type == 'tone':
-                    s = self._id_to_tone[this_sequence]
-                elif lfeat_type == 'syllable_flag':
-                    s = self._id_to_syllable_flag[this_sequence]
-                elif lfeat_type == 'word_segment':
-                    s = self._id_to_word_segment[this_sequence]
-                elif lfeat_type == 'emo_category':
-                    s = self._id_to_emo_category[this_sequence]
-                elif lfeat_type == 'speaker':
-                    s = self._id_to_speaker[this_sequence]
-                else:
-                    raise Exception('Unknown lfeat type: %s' % lfeat_type)
-
-                if index == 0:
-                    result = result + lfeat_type + ': '
-
-                result = result + '{' + s + '}'
-
-                if index == length - 1:
-                    result = result + '; '
-
-                index = index + 1
-            pre_lfeat_dim = pre_lfeat_dim + self._inputs_dim[lfeat_type]
-        return result
diff --git a/modelscope/models/audio/tts/voice.py b/modelscope/models/audio/tts/voice.py
index deaebf11..dc830db5 100644
--- a/modelscope/models/audio/tts/voice.py
+++ b/modelscope/models/audio/tts/voice.py
@@ -1,286 +1,111 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
+import pickle as pkl
 
 import json
 import numpy as np
 import torch
-from sklearn.preprocessing import MultiLabelBinarizer
 
+from modelscope.utils.audio.tts_exceptions import \
+    TtsModelConfigurationException
 from modelscope.utils.constant import ModelFile, Tasks
-from .models import Generator, create_am_model
-from .text.symbols import load_symbols
-from .text.symbols_dict import SymbolsDict
-
-import tensorflow as tf  # isort:skip
+from .models.datasets.units import KanTtsLinguisticUnit
+from .models.models.hifigan import Generator
+from .models.models.sambert import KanTtsSAMBERT
+from .models.utils import (AttrDict, build_env, init_weights, load_checkpoint,
+                           plot_spectrogram, save_checkpoint, scan_checkpoint)
 
 MAX_WAV_VALUE = 32768.0
 
 
-def multi_label_symbol_to_sequence(my_classes, my_symbol):
-    one_hot = MultiLabelBinarizer(classes=my_classes)
-    tokens = my_symbol.strip().split(' ')
-    sequences = []
-    for token in tokens:
-        sequences.append(tuple(token.split('&')))
-    return one_hot.fit_transform(sequences)
-
-
-def load_checkpoint(filepath, device):
-    assert os.path.isfile(filepath)
-    checkpoint_dict = torch.load(filepath, map_location=device)
-    return checkpoint_dict
-
-
-class AttrDict(dict):
-
-    def __init__(self, *args, **kwargs):
-        super(AttrDict, self).__init__(*args, **kwargs)
-        self.__dict__ = self
-
-
 class Voice:
 
-    def __init__(self, voice_name, voice_path, am_hparams, voc_config):
+    def __init__(self, voice_name, voice_path, am_config, voc_config):
         self.__voice_name = voice_name
         self.__voice_path = voice_path
-        self.__am_hparams = tf.contrib.training.HParams(**am_hparams)
+        self.__am_config = AttrDict(**am_config)
         self.__voc_config = AttrDict(**voc_config)
         self.__model_loaded = False
+        if 'am' not in self.__am_config:
+            raise TtsModelConfigurationException(
+                'modelscope error: am configuration invalid')
+        if 'linguistic_unit' not in self.__am_config:
+            raise TtsModelConfigurationException(
+                'modelscope error: am configuration invalid')
+        self.__am_lingustic_unit_config = self.__am_config['linguistic_unit']
 
     def __load_am(self):
-        local_am_ckpt_path = os.path.join(self.__voice_path,
-                                          ModelFile.TF_CHECKPOINT_FOLDER)
-        self.__am_ckpt_path = os.path.join(local_am_ckpt_path, 'ckpt')
-        self.__dict_path = os.path.join(self.__voice_path, 'dicts')
+        local_am_ckpt_path = os.path.join(self.__voice_path, 'am')
+        self.__am_ckpt_path = os.path.join(local_am_ckpt_path,
+                                           ModelFile.TORCH_MODEL_BIN_FILE)
         has_mask = True
-        if self.__am_hparams.get('has_mask') is not None:
-            has_mask = self.__am_hparams.has_mask
-        model_name = 'robutrans'
-        self.__lfeat_type_list = self.__am_hparams.lfeat_type_list.strip(
-        ).split(',')
-        sy, tone, syllable_flag, word_segment, emo_category, speaker = load_symbols(
-            self.__dict_path, has_mask)
-        self.__sy = sy
-        self.__tone = tone
-        self.__syllable_flag = syllable_flag
-        self.__word_segment = word_segment
-        self.__emo_category = emo_category
-        self.__speaker = speaker
-        self.__inputs_dim = dict()
-        for lfeat_type in self.__lfeat_type_list:
-            if lfeat_type == 'sy':
-                self.__inputs_dim[lfeat_type] = len(sy)
-            elif lfeat_type == 'tone':
-                self.__inputs_dim[lfeat_type] = len(tone)
-            elif lfeat_type == 'syllable_flag':
-                self.__inputs_dim[lfeat_type] = len(syllable_flag)
-            elif lfeat_type == 'word_segment':
-                self.__inputs_dim[lfeat_type] = len(word_segment)
-            elif lfeat_type == 'emo_category':
-                self.__inputs_dim[lfeat_type] = len(emo_category)
-            elif lfeat_type == 'speaker':
-                self.__inputs_dim[lfeat_type] = len(speaker)
-
-        self.__symbols_dict = SymbolsDict(sy, tone, syllable_flag,
-                                          word_segment, emo_category, speaker,
-                                          self.__inputs_dim,
-                                          self.__lfeat_type_list)
-        dim_inputs = sum(self.__inputs_dim.values(
-        )) - self.__inputs_dim['speaker'] - self.__inputs_dim['emo_category']
-        self.__graph = tf.Graph()
-        with self.__graph.as_default():
-            inputs = tf.placeholder(tf.float32, [1, None, dim_inputs],
-                                    'inputs')
-            inputs_emotion = tf.placeholder(
-                tf.float32, [1, None, self.__inputs_dim['emo_category']],
-                'inputs_emotion')
-            inputs_speaker = tf.placeholder(
-                tf.float32, [1, None, self.__inputs_dim['speaker']],
-                'inputs_speaker')
-            input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
-            pitch_contours_scale = tf.placeholder(tf.float32, [1, None],
-                                                  'pitch_contours_scale')
-            energy_contours_scale = tf.placeholder(tf.float32, [1, None],
-                                                   'energy_contours_scale')
-            duration_scale = tf.placeholder(tf.float32, [1, None],
-                                            'duration_scale')
-            with tf.variable_scope('model') as _:
-                self.__model = create_am_model(model_name, self.__am_hparams)
-                self.__model.initialize(
-                    inputs,
-                    inputs_emotion,
-                    inputs_speaker,
-                    input_lengths,
-                    duration_scales=duration_scale,
-                    pitch_scales=pitch_contours_scale,
-                    energy_scales=energy_contours_scale)
-                self.__mel_spec = self.__model.mel_outputs[0]
-                self.__duration_outputs = self.__model.duration_outputs[0]
-                self.__duration_outputs_ = self.__model.duration_outputs_[0]
-                self.__pitch_contour_outputs = self.__model.pitch_contour_outputs[
-                    0]
-                self.__energy_contour_outputs = self.__model.energy_contour_outputs[
-                    0]
-                self.__embedded_inputs_emotion = self.__model.embedded_inputs_emotion[
-                    0]
-                self.__embedding_fsmn_outputs = self.__model.embedding_fsmn_outputs[
-                    0]
-                self.__encoder_outputs = self.__model.encoder_outputs[0]
-                self.__pitch_embeddings = self.__model.pitch_embeddings[0]
-                self.__energy_embeddings = self.__model.energy_embeddings[0]
-                self.__LR_outputs = self.__model.LR_outputs[0]
-                self.__postnet_fsmn_outputs = self.__model.postnet_fsmn_outputs[
-                    0]
-                self.__attention_h = self.__model.attention_h
-                self.__attention_x = self.__model.attention_x
-
-                config = tf.ConfigProto()
-                config.gpu_options.allow_growth = True
-                self.__session = tf.Session(config=config)
-                self.__session.run(tf.global_variables_initializer())
-
-                saver = tf.train.Saver()
-                saver.restore(self.__session, self.__am_ckpt_path)
+        if 'has_mask' in self.__am_lingustic_unit_config:
+            has_mask = self.__am_lingustic_unit_config.has_mask
+        self.__ling_unit = KanTtsLinguisticUnit(
+            self.__am_lingustic_unit_config, self.__voice_path, has_mask)
+        self.__am_net = KanTtsSAMBERT(self.__am_config,
+                                      self.__ling_unit.get_unit_size()).to(
+                                          self.__device)
+        state_dict_g = {}
+        try:
+            state_dict_g = load_checkpoint(self.__am_ckpt_path, self.__device)
+        except RuntimeError:
+            with open(self.__am_ckpt_path, 'rb') as f:
+                pth_var_dict = pkl.load(f)
+                state_dict_g['fsnet'] = {
+                    k: torch.FloatTensor(v)
+                    for k, v in pth_var_dict['fsnet'].items()
+                }
+        self.__am_net.load_state_dict(state_dict_g['fsnet'], strict=False)
+        self.__am_net.eval()
 
     def __load_vocoder(self):
-        self.__voc_ckpt_path = os.path.join(self.__voice_path,
+        local_voc_ckpy_path = os.path.join(self.__voice_path, 'vocoder')
+        self.__voc_ckpt_path = os.path.join(local_voc_ckpy_path,
                                             ModelFile.TORCH_MODEL_BIN_FILE)
-        if torch.cuda.is_available():
-            torch.manual_seed(self.__voc_config.seed)
-            self.__device = torch.device('cuda')
-        else:
-            self.__device = torch.device('cpu')
         self.__generator = Generator(self.__voc_config).to(self.__device)
         state_dict_g = load_checkpoint(self.__voc_ckpt_path, self.__device)
         self.__generator.load_state_dict(state_dict_g['generator'])
         self.__generator.eval()
         self.__generator.remove_weight_norm()
 
-    def __am_forward(self,
-                     text,
-                     pitch_control_str='',
-                     duration_control_str='',
-                     energy_control_str=''):
-        duration_cfg_lst = []
-        if len(duration_control_str) != 0:
-            for item in duration_control_str.strip().split('|'):
-                percent, scale = item.lstrip('(').rstrip(')').split(',')
-                duration_cfg_lst.append((float(percent), float(scale)))
-        pitch_contours_cfg_lst = []
-        if len(pitch_control_str) != 0:
-            for item in pitch_control_str.strip().split('|'):
-                percent, scale = item.lstrip('(').rstrip(')').split(',')
-                pitch_contours_cfg_lst.append((float(percent), float(scale)))
-        energy_contours_cfg_lst = []
-        if len(energy_control_str) != 0:
-            for item in energy_control_str.strip().split('|'):
-                percent, scale = item.lstrip('(').rstrip(')').split(',')
-                energy_contours_cfg_lst.append((float(percent), float(scale)))
-        cleaner_names = [
-            x.strip() for x in self.__am_hparams.cleaners.split(',')
-        ]
-
-        lfeat_symbol = text.strip().split(' ')
-        lfeat_symbol_separate = [''] * int(len(self.__lfeat_type_list))
-        for this_lfeat_symbol in lfeat_symbol:
-            this_lfeat_symbol = this_lfeat_symbol.strip('{').strip('}').split(
-                '$')
-            if len(this_lfeat_symbol) != len(self.__lfeat_type_list):
-                raise Exception(
-                    'Length of this_lfeat_symbol in training data'
-                    + ' is not equal to the length of lfeat_type_list, '
-                    + str(len(this_lfeat_symbol)) + ' VS. '
-                    + str(len(self.__lfeat_type_list)))
-            index = 0
-            while index < len(lfeat_symbol_separate):
-                lfeat_symbol_separate[index] = lfeat_symbol_separate[
-                    index] + this_lfeat_symbol[index] + ' '
-                index = index + 1
-
-        index = 0
-        lfeat_type = self.__lfeat_type_list[index]
-        sequence = self.__symbols_dict.symbol_to_sequence(
-            lfeat_symbol_separate[index].strip(), lfeat_type, cleaner_names)
-        sequence_array = np.asarray(
-            sequence[:-1],
-            dtype=np.int32)  # sequence length minus 1 to ignore EOS ~
-        inputs = np.eye(
-            self.__inputs_dim[lfeat_type], dtype=np.float32)[sequence_array]
-        index = index + 1
-        while index < len(self.__lfeat_type_list) - 2:
-            lfeat_type = self.__lfeat_type_list[index]
-            sequence = self.__symbols_dict.symbol_to_sequence(
-                lfeat_symbol_separate[index].strip(), lfeat_type,
-                cleaner_names)
-            sequence_array = np.asarray(
-                sequence[:-1],
-                dtype=np.int32)  # sequence length minus 1 to ignore EOS ~
-            inputs_temp = np.eye(
-                self.__inputs_dim[lfeat_type],
-                dtype=np.float32)[sequence_array]
-            inputs = np.concatenate((inputs, inputs_temp), axis=1)
-            index = index + 1
-        seq = inputs
-
-        lfeat_type = 'emo_category'
-        inputs_emotion = multi_label_symbol_to_sequence(
-            self.__emo_category, lfeat_symbol_separate[index].strip())
-        # inputs_emotion = inputs_emotion * 1.5
-        index = index + 1
-
-        lfeat_type = 'speaker'
-        inputs_speaker = multi_label_symbol_to_sequence(
-            self.__speaker, lfeat_symbol_separate[index].strip())
-
-        duration_scale = np.ones((len(seq), ), dtype=np.float32)
-        start_idx = 0
-        for (percent, scale) in duration_cfg_lst:
-            duration_scale[start_idx:start_idx
-                           + int(percent * len(seq))] = scale
-            start_idx += int(percent * len(seq))
-
-        pitch_contours_scale = np.ones((len(seq), ), dtype=np.float32)
-        start_idx = 0
-        for (percent, scale) in pitch_contours_cfg_lst:
-            pitch_contours_scale[start_idx:start_idx
-                                 + int(percent * len(seq))] = scale
-            start_idx += int(percent * len(seq))
-
-        energy_contours_scale = np.ones((len(seq), ), dtype=np.float32)
-        start_idx = 0
-        for (percent, scale) in energy_contours_cfg_lst:
-            energy_contours_scale[start_idx:start_idx
-                                  + int(percent * len(seq))] = scale
-            start_idx += int(percent * len(seq))
-
-        feed_dict = {
-            self.__model.inputs: [np.asarray(seq, dtype=np.float32)],
-            self.__model.inputs_emotion:
-            [np.asarray(inputs_emotion, dtype=np.float32)],
-            self.__model.inputs_speaker:
-            [np.asarray(inputs_speaker, dtype=np.float32)],
-            self.__model.input_lengths:
-            np.asarray([len(seq)], dtype=np.int32),
-            self.__model.duration_scales: [duration_scale],
-            self.__model.pitch_scales: [pitch_contours_scale],
-            self.__model.energy_scales: [energy_contours_scale]
-        }
-
-        result = self.__session.run([
-            self.__mel_spec, self.__duration_outputs, self.__duration_outputs_,
-            self.__pitch_contour_outputs, self.__embedded_inputs_emotion,
-            self.__embedding_fsmn_outputs, self.__encoder_outputs,
-            self.__pitch_embeddings, self.__LR_outputs,
-            self.__postnet_fsmn_outputs, self.__energy_contour_outputs,
-            self.__energy_embeddings, self.__attention_x, self.__attention_h
-        ], feed_dict=feed_dict)  # yapf:disable
-        return result[0]
+    def __am_forward(self, symbol_seq):
+        with torch.no_grad():
+            inputs_feat_lst = self.__ling_unit.encode_symbol_sequence(
+                symbol_seq)
+            inputs_sy = torch.from_numpy(inputs_feat_lst[0]).long().to(
+                self.__device)
+            inputs_tone = torch.from_numpy(inputs_feat_lst[1]).long().to(
+                self.__device)
+            inputs_syllable = torch.from_numpy(inputs_feat_lst[2]).long().to(
+                self.__device)
+            inputs_ws = torch.from_numpy(inputs_feat_lst[3]).long().to(
+                self.__device)
+            inputs_ling = torch.stack(
+                [inputs_sy, inputs_tone, inputs_syllable, inputs_ws],
+                dim=-1).unsqueeze(0)
+            inputs_emo = torch.from_numpy(inputs_feat_lst[4]).long().to(
+                self.__device).unsqueeze(0)
+            inputs_spk = torch.from_numpy(inputs_feat_lst[5]).long().to(
+                self.__device).unsqueeze(0)
+            inputs_len = torch.zeros(1).to(self.__device).long(
+            ) + inputs_emo.size(1) - 1  # minus 1 for "~"
+            res = self.__am_net(inputs_ling[:, :-1, :], inputs_emo[:, :-1],
+                                inputs_spk[:, :-1], inputs_len)
+            postnet_outputs = res['postnet_outputs']
+            LR_length_rounded = res['LR_length_rounded']
+            valid_length = int(LR_length_rounded[0].item())
+            postnet_outputs = postnet_outputs[
+                0, :valid_length, :].cpu().numpy()
+            return postnet_outputs
 
     def __vocoder_forward(self, melspec):
         dim0 = list(melspec.shape)[-1]
         if dim0 != self.__voc_config.num_mels:
             raise TtsVocoderMelspecShapeMismatchException(
-                'input melspec mismatch require {} but {}'.format(
-                    self.__voc_config.num_mels, dim0))
+                'modelscope error: input melspec mismatch require {} but {}'.
+                format(self.__voc_config.num_mels, dim0))
         with torch.no_grad():
             x = melspec.T
             x = torch.FloatTensor(x).to(self.__device)
@@ -292,9 +117,15 @@ class Voice:
             audio = audio.cpu().numpy().astype('int16')
             return audio
 
-    def forward(self, text):
+    def forward(self, symbol_seq):
         if not self.__model_loaded:
+            torch.manual_seed(self.__am_config.seed)
+            if torch.cuda.is_available():
+                torch.manual_seed(self.__am_config.seed)
+                self.__device = torch.device('cuda')
+            else:
+                self.__device = torch.device('cpu')
             self.__load_am()
             self.__load_vocoder()
             self.__model_loaded = True
-        return self.__vocoder_forward(self.__am_forward(text))
+        return self.__vocoder_forward(self.__am_forward(symbol_seq))
diff --git a/modelscope/models/base/__init__.py b/modelscope/models/base/__init__.py
index ab7901af..8c47ecaf 100644
--- a/modelscope/models/base/__init__.py
+++ b/modelscope/models/base/__init__.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from .base_head import *  # noqa F403
 from .base_model import *  # noqa F403
 from .base_torch_head import *  # noqa F403
diff --git a/modelscope/models/base/base_head.py b/modelscope/models/base/base_head.py
index 07a68253..11bda32f 100644
--- a/modelscope/models/base/base_head.py
+++ b/modelscope/models/base/base_head.py
@@ -1,6 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from abc import ABC, abstractmethod
-from typing import Dict, Union
+from typing import Any, Dict, Union
 
 from modelscope.models.base.base_model import Model
 from modelscope.utils.config import ConfigDict
@@ -22,25 +22,20 @@ class Head(ABC):
         self.config = ConfigDict(kwargs)
 
     @abstractmethod
-    def forward(self, input: Input) -> Dict[str, Tensor]:
+    def forward(self, *args, **kwargs) -> Dict[str, Any]:
         """
         This method will use the output from backbone model to do any
-        downstream tasks
-        Args:
-            input: The tensor output or a model from backbone model
-            (text generation need a model as input)
-        Returns: The output from downstream taks
+        downstream tasks. Receive the output from the backbone model.
+
+        Returns (Dict[str, Any]): The output from downstream task.
         """
         pass
 
     @abstractmethod
-    def compute_loss(self, outputs: Dict[str, Tensor],
-                     labels) -> Dict[str, Tensor]:
+    def compute_loss(self, *args, **kwargs) -> Dict[str, Any]:
         """
-        compute loss for head during the finetuning
+        compute loss for head during the finetuning.
 
-        Args:
-            outputs (Dict[str, Tensor]):  the output from the model forward
-        Returns:  the loss(Dict[str, Tensor]):
+        Returns (Dict[str, Any]): The loss dict
         """
         pass
diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py
index 279dbba2..cdc71fcf 100644
--- a/modelscope/models/base/base_model.py
+++ b/modelscope/models/base/base_model.py
@@ -1,18 +1,15 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
+import os
 import os.path as osp
 from abc import ABC, abstractmethod
-from typing import Dict, Optional, Union
-
-import numpy as np
+from typing import Any, Callable, Dict, List, Optional, Union
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models.builder import build_model
+from modelscope.utils.checkpoint import save_pretrained
 from modelscope.utils.config import Config
 from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
 from modelscope.utils.device import device_placement, verify_device
-from modelscope.utils.file_utils import func_receive_dict_inputs
-from modelscope.utils.hub import parse_label_mapping
 from modelscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -28,35 +25,31 @@ class Model(ABC):
         verify_device(device_name)
         self._device_name = device_name
 
-    def __call__(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
-        return self.postprocess(self.forward(input))
+    def __call__(self, *args, **kwargs) -> Dict[str, Any]:
+        return self.postprocess(self.forward(*args, **kwargs))
 
     @abstractmethod
-    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
+    def forward(self, *args, **kwargs) -> Dict[str, Any]:
         """
         Run the forward pass for a model.
 
-        Args:
-            input (Dict[str, Tensor]): the dict of the model inputs for the forward method
-
         Returns:
-            Dict[str, Tensor]: output from the model forward pass
+            Dict[str, Any]: output from the model forward pass
         """
         pass
 
-    def postprocess(self, input: Dict[str, Tensor],
-                    **kwargs) -> Dict[str, Tensor]:
+    def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
         """ Model specific postprocess and convert model output to
         standard model outputs.
 
         Args:
-            input:  input data
+            inputs:  input data
 
         Return:
             dict of results:  a dict containing outputs of model, each
                 output should have the standard output name.
         """
-        return input
+        return inputs
 
     @classmethod
     def _instantiate(cls, **kwargs):
@@ -98,7 +91,6 @@ class Model(ABC):
                 osp.join(local_model_dir, ModelFile.CONFIGURATION))
         task_name = cfg.task
         model_cfg = cfg.model
-        framework = cfg.framework
 
         if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'):
             model_cfg.type = model_cfg.model_type
@@ -108,9 +100,8 @@ class Model(ABC):
             model_cfg[k] = v
         if device is not None:
             model_cfg.device = device
-            with device_placement(framework, device):
-                model = build_model(
-                    model_cfg, task_name=task_name, default_args=kwargs)
+            model = build_model(
+                model_cfg, task_name=task_name, default_args=kwargs)
         else:
             model = build_model(
                 model_cfg, task_name=task_name, default_args=kwargs)
@@ -119,3 +110,28 @@ class Model(ABC):
         if hasattr(cfg, 'pipeline'):
             model.pipeline = cfg.pipeline
         return model
+
+    def save_pretrained(self,
+                        target_folder: Union[str, os.PathLike],
+                        save_checkpoint_names: Union[str, List[str]] = None,
+                        save_function: Callable = None,
+                        config: Optional[dict] = None,
+                        **kwargs):
+        """save the pretrained model, its configuration and other related files to a directory, so that it can be re-loaded
+
+        Args:
+            target_folder (Union[str, os.PathLike]):
+            Directory to which to save. Will be created if it doesn't exist.
+
+            save_checkpoint_names (Union[str, List[str]]):
+            The checkpoint names to be saved in the target_folder
+
+            save_function (Callable, optional):
+            The function to use to save the state dictionary.
+
+            config (Optional[dict], optional):
+            The config for the configuration.json, might not be identical with model.config
+
+        """
+        save_pretrained(self, target_folder, save_checkpoint_names,
+                        save_function, config, **kwargs)
diff --git a/modelscope/models/base/base_torch_head.py b/modelscope/models/base/base_torch_head.py
index c5a78519..faee4296 100644
--- a/modelscope/models/base/base_torch_head.py
+++ b/modelscope/models/base/base_torch_head.py
@@ -1,5 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from typing import Dict
+from typing import Any, Dict
 
 import torch
 
@@ -18,10 +18,8 @@ class TorchHead(Head, torch.nn.Module):
         super().__init__(**kwargs)
         torch.nn.Module.__init__(self)
 
-    def forward(self, inputs: Dict[str,
-                                   torch.Tensor]) -> Dict[str, torch.Tensor]:
+    def forward(self, *args, **kwargs) -> Dict[str, Any]:
         raise NotImplementedError
 
-    def compute_loss(self, outputs: Dict[str, torch.Tensor],
-                     labels) -> Dict[str, torch.Tensor]:
+    def compute_loss(self, *args, **kwargs) -> Dict[str, Any]:
         raise NotImplementedError
diff --git a/modelscope/models/base/base_torch_model.py b/modelscope/models/base/base_torch_model.py
index cfc88721..3c99a1f2 100644
--- a/modelscope/models/base/base_torch_model.py
+++ b/modelscope/models/base/base_torch_model.py
@@ -1,6 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict
 
 import torch
 from torch import nn
@@ -21,15 +21,14 @@ class TorchModel(Model, torch.nn.Module):
         super().__init__(model_dir, *args, **kwargs)
         torch.nn.Module.__init__(self)
 
-    def __call__(self, input: Dict[str,
-                                   torch.Tensor]) -> Dict[str, torch.Tensor]:
+    def __call__(self, *args, **kwargs) -> Dict[str, Any]:
+        # Adapting a model with only one dict arg, and the arg name must be input or inputs
         if func_receive_dict_inputs(self.forward):
-            return self.postprocess(self.forward(input))
+            return self.postprocess(self.forward(args[0], **kwargs))
         else:
-            return self.postprocess(self.forward(**input))
+            return self.postprocess(self.forward(*args, **kwargs))
 
-    def forward(self, inputs: Dict[str,
-                                   torch.Tensor]) -> Dict[str, torch.Tensor]:
+    def forward(self, *args, **kwargs) -> Dict[str, Any]:
         raise NotImplementedError
 
     def post_init(self):
diff --git a/modelscope/models/builder.py b/modelscope/models/builder.py
index 33f111a8..7a8e28f4 100644
--- a/modelscope/models/builder.py
+++ b/modelscope/models/builder.py
@@ -37,13 +37,16 @@ def build_backbone(cfg: ConfigDict,
         cfg, BACKBONES, group_key=field, default_args=default_args)
 
 
-def build_head(cfg: ConfigDict, default_args: dict = None):
+def build_head(cfg: ConfigDict,
+               group_key: str = None,
+               default_args: dict = None):
     """ build head given config dict
 
     Args:
         cfg (:obj:`ConfigDict`): config dict for head object.
         default_args (dict, optional): Default initialization arguments.
     """
-
+    if group_key is None:
+        group_key = cfg[TYPE_NAME]
     return build_from_cfg(
-        cfg, HEADS, group_key=cfg[TYPE_NAME], default_args=default_args)
+        cfg, HEADS, group_key=group_key, default_args=default_args)
diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py
index 168ac96c..f2798b59 100644
--- a/modelscope/models/cv/__init__.py
+++ b/modelscope/models/cv/__init__.py
@@ -1,10 +1,17 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+
+# yapf: disable
 from . import (action_recognition, animal_recognition, body_2d_keypoints,
-               cartoon, cmdssl_video_embedding, crowd_counting, face_detection,
+               body_3d_keypoints, cartoon, cmdssl_video_embedding,
+               crowd_counting, face_2d_keypoints, face_detection,
                face_generation, image_classification, image_color_enhance,
                image_colorization, image_denoise, image_instance_segmentation,
-               image_portrait_enhancement, image_reid_person,
+               image_panoptic_segmentation, image_portrait_enhancement,
+               image_reid_person, image_semantic_segmentation,
                image_to_image_generation, image_to_image_translation,
-               object_detection, product_retrieval_embedding,
-               salient_detection, super_resolution,
+               movie_scene_segmentation, object_detection,
+               product_retrieval_embedding, realtime_object_detection,
+               salient_detection, shop_segmentation, super_resolution,
                video_single_object_tracking, video_summarization, virual_tryon)
+
+# yapf: enable
diff --git a/modelscope/models/cv/action_detection/__init__.py b/modelscope/models/cv/action_detection/__init__.py
new file mode 100644
index 00000000..fedbe19c
--- /dev/null
+++ b/modelscope/models/cv/action_detection/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+
+    from .action_detection_onnx import ActionDetONNX
+
+else:
+    _import_structure = {'action_detection_onnx': ['ActionDetONNX']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/action_detection/action_detection_onnx.py b/modelscope/models/cv/action_detection/action_detection_onnx.py
new file mode 100644
index 00000000..1c8be354
--- /dev/null
+++ b/modelscope/models/cv/action_detection/action_detection_onnx.py
@@ -0,0 +1,238 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+import os.path as osp
+import shutil
+import subprocess
+
+import cv2
+import numpy as np
+import onnxruntime as rt
+
+from modelscope.models import Model
+from modelscope.utils.constant import Devices
+from modelscope.utils.device import verify_device
+
+
+class ActionDetONNX(Model):
+    """Video action detector that runs an ONNX model with ONNX Runtime.
+
+    Frames are sampled from the video with the ``ffmpeg`` command line tool,
+    grouped into windows, and every window is run through the session.
+    """
+
+    def __init__(self, model_dir, config, *args, **kwargs):
+        """Create the inference session and read settings from ``config``.
+
+        Args:
+            model_dir: Model directory, forwarded to ``Model.__init__``.
+            config: Mapping with the keys read below (``model_file``,
+                ``fpn_strides``, ``pre_nms_thresh``, ``size_divisibility``,
+                ``nms_thresh``, ``tmp_dir``, ``step``, ``input_type``,
+                ``action_names`` and ``video_length_limit``).
+        """
+        # ``super()`` already binds ``self``; passing it explicitly shifted
+        # every positional argument by one.
+        super().__init__(model_dir, *args, **kwargs)
+        model_file = osp.join(config['model_file'])
+        device_type, device_id = verify_device(self._device_name)
+        options = rt.SessionOptions()
+        options.intra_op_num_threads = 1
+        options.inter_op_num_threads = 1
+        if device_type == Devices.gpu:
+            sess = rt.InferenceSession(
+                model_file,
+                providers=['CUDAExecutionProvider'],
+                sess_options=options,
+                provider_options=[{
+                    'device_id': device_id
+                }])
+        else:
+            sess = rt.InferenceSession(
+                model_file,
+                providers=['CPUExecutionProvider'],
+                sess_options=options)
+        self.input_name = sess.get_inputs()[0].name
+        self.sess = sess
+        self.num_stride = len(config['fpn_strides'])
+        self.score_thresh = np.asarray(
+            config['pre_nms_thresh'], dtype='float32').reshape((1, -1))
+        self.size_divisibility = config['size_divisibility']
+        self.nms_threshold = config['nms_thresh']
+        self.tmp_dir = config['tmp_dir']
+        self.temporal_stride = config['step']
+        self.input_data_type = config['input_type']
+        self.action_names = config['action_names']
+        self.video_length_limit = config['video_length_limit']
+
+    def resize_box(self, det, height, width, scale_h, scale_w):
+        """Map one detection back to the original frame size.
+
+        ``det`` is a ``(boxes, scores, labels)`` tuple. The boxes are scaled
+        and clipped in place; label indices become action names.
+        """
+        bboxs = det[0]
+        bboxs[:, [0, 2]] *= scale_w
+        bboxs[:, [1, 3]] *= scale_h
+        bboxs[:, [0, 2]] = bboxs[:, [0, 2]].clip(0, width - 1)
+        bboxs[:, [1, 3]] = bboxs[:, [1, 3]].clip(0, height - 1)
+        result = {
+            'boxes': bboxs.round().astype('int32').tolist(),
+            'scores': det[1].tolist(),
+            'labels': [self.action_names[i] for i in det[2].tolist()]
+        }
+        return result
+
+    def parse_frames(self, frame_names):
+        """Load frames into one array laid out as (1, c, t, h, w)."""
+        # ``[:, :, ::-1]`` reverses the channel order returned by imread.
+        imgs = [cv2.imread(name)[:, :, ::-1] for name in frame_names]
+        imgs = np.stack(imgs).astype(self.input_data_type).transpose(
+            (3, 0, 1, 2))  # c,t,h,w
+        imgs = imgs[None]
+        return imgs
+
+    def forward_img(self, imgs, h, w):
+        """Run the session on one window and apply NMS to its output.
+
+        ``h`` and ``w`` are fed to the model as ``height`` and ``width``.
+        """
+        pred = self.sess.run(None, {
+            self.input_name: imgs,
+            'height': np.asarray(h),
+            'width': np.asarray(w)
+        })
+        dets = self.post_nms(
+            pred,
+            score_threshold=self.score_thresh,
+            nms_threshold=self.nms_threshold)
+        return dets
+
+    def forward_video(self, video_name, scale):
+        """Detect actions in a video, window by window.
+
+        Args:
+            video_name: Path of the video file handed to ffmpeg.
+            scale: Size preset passed to ``_get_sizes``.
+
+        Returns:
+            A list of ``{'timestamp': t, 'actions': dets}`` dicts, one per
+            window of sampled frames; ``dets`` comes from ``resize_box``.
+        """
+        min_size, max_size = self._get_sizes(scale)
+
+        # splitext instead of ``[:-4]``: extensions are not always three
+        # characters, and a short name such as ``a.ts`` gave an empty stem,
+        # which made the ``rmtree`` below wipe ``self.tmp_dir`` itself.
+        stem = osp.splitext(osp.basename(video_name))[0]
+        tmp_dir = osp.join(self.tmp_dir, stem)
+        if osp.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+        os.makedirs(tmp_dir)
+        frame_rate = 2
+        # Pass an argument list rather than splitting a formatted string on
+        # spaces, so a path containing spaces stays a single argument.
+        cmd = [
+            'ffmpeg', '-y', '-loglevel', 'quiet', '-ss', '0', '-t',
+            str(self.video_length_limit), '-i', str(video_name), '-r',
+            str(frame_rate), '-f', 'image2', f'{tmp_dir}/%06d.jpg'
+        ]
+        subprocess.call(cmd)
+
+        frame_names = [
+            osp.join(tmp_dir, name) for name in sorted(os.listdir(tmp_dir))
+            if name.endswith('.jpg')
+        ]
+        frame_names = [
+            frame_names[i:i + frame_rate * 2]
+            for i in range(0,
+                           len(frame_names) - frame_rate * 2 + 1, frame_rate
+                           * self.temporal_stride)
+        ]
+        # One timestamp per window. ``range(1, n * stride, stride)`` was one
+        # element short for a stride of 1, so ``zip`` dropped the last window.
+        timestamp = [
+            1 + i * self.temporal_stride for i in range(len(frame_names))
+        ]
+        batch_imgs = [self.parse_frames(names) for names in frame_names]
+
+        _, _, _, H, W = batch_imgs[0].shape
+        scale_min = min_size / min(H, W)
+        h, w = min(int(scale_min * H),
+                   max_size), min(int(scale_min * W), max_size)
+        h = round(h / self.size_divisibility) * self.size_divisibility
+        w = round(w / self.size_divisibility) * self.size_divisibility
+        scale_h, scale_w = H / h, W / w
+
+        results = []
+        for imgs in batch_imgs:
+            det = self.forward_img(imgs, h, w)
+            det = self.resize_box(det[0], H, W, scale_h, scale_w)
+            results.append(det)
+        results = [{
+            'timestamp': t,
+            'actions': res
+        } for t, res in zip(timestamp, results)]
+        shutil.rmtree(tmp_dir)
+        return results
+
+    def forward(self, video_name):
+        """Run detection on ``video_name`` with the scale-1 size preset."""
+        return self.forward_video(video_name, scale=1)
+
+    def post_nms(self, pred, score_threshold, nms_threshold=0.3):
+        """Threshold scores and run NMS for each sample in ``pred``.
+
+        ``pred`` unpacks into ``(boxes, scores)``. Returns one
+        ``(boxes, scores, labels)`` tuple per sample, where a label is the
+        column index of a score that exceeded ``score_threshold``.
+        """
+        pred_bboxes, pred_scores = pred
+        N = len(pred_bboxes)
+        dets = []
+        for i in range(N):
+            bboxes, scores = pred_bboxes[i], pred_scores[i]
+            candidate_inds = scores > score_threshold
+            scores = scores[candidate_inds]
+            candidate_nonzeros = candidate_inds.nonzero()
+            bboxes = bboxes[candidate_nonzeros[0]]
+            labels = candidate_nonzeros[1]
+            keep = self._nms(bboxes, scores, labels, nms_threshold)
+            bbox = bboxes[keep]
+            score = scores[keep]
+            label = labels[keep]
+            dets.append((bbox, score, label))
+        return dets
+
+    def _nms(self, boxes, scores, idxs, nms_threshold):
+        """Per-class NMS; returns the indices of the boxes to keep.
+
+        Boxes are shifted by a class-dependent offset so boxes with
+        different ``idxs`` never overlap, then handed to
+        ``cv2.dnn.NMSBoxes`` with columns 2 and 3 turned into extents.
+        """
+        if len(boxes) == 0:
+            return []
+        max_coordinate = boxes.max()
+        offsets = idxs * (max_coordinate + 1)
+        boxes_for_nms = boxes + offsets[:, None].astype('float32')
+        boxes_for_nms[:, 2] = boxes_for_nms[:, 2] - boxes_for_nms[:, 0]
+        boxes_for_nms[:, 3] = boxes_for_nms[:, 3] - boxes_for_nms[:, 1]
+        keep = cv2.dnn.NMSBoxes(
+            boxes_for_nms.tolist(),
+            scores.tolist(),
+            score_threshold=0,
+            nms_threshold=nms_threshold)
+        if len(keep.shape) == 2:
+            keep = np.squeeze(keep, 1)
+        return keep
+
+    def _get_sizes(self, scale):
+        """Return the ``(min_size, max_size)`` preset for ``scale``."""
+        if scale == 1:
+            min_size, max_size = 512, 896
+        elif scale == 2:
+            min_size, max_size = 768, 1280
+        else:
+            min_size, max_size = 1024, 1792
+        return min_size, max_size
diff --git a/modelscope/models/cv/action_recognition/__init__.py b/modelscope/models/cv/action_recognition/__init__.py
index 7bdee0cd..5e9dc310 100644
--- a/modelscope/models/cv/action_recognition/__init__.py
+++ b/modelscope/models/cv/action_recognition/__init__.py
@@ -7,11 +7,13 @@ if TYPE_CHECKING:
 
     from .models import BaseVideoModel
     from .tada_convnext import TadaConvNeXt
+    from .temporal_patch_shift_transformer import PatchShiftTransformer
 
 else:
     _import_structure = {
         'models': ['BaseVideoModel'],
         'tada_convnext': ['TadaConvNeXt'],
+        'temporal_patch_shift_transformer': ['PatchShiftTransformer']
     }
 
     import sys
diff --git a/modelscope/models/cv/action_recognition/models.py b/modelscope/models/cv/action_recognition/models.py
index 48e75ae1..f16805fb 100644
--- a/modelscope/models/cv/action_recognition/models.py
+++ b/modelscope/models/cv/action_recognition/models.py
@@ -1,5 +1,9 @@
+# The implementation is also open-sourced by the authors,
+# and available at https://github.com/alibaba-mmai-research/TAdaConv
+# Copyright 2021-2022 The Alibaba FVI Team Authors. All rights reserved.
 import torch.nn as nn
 
+from .s3dg import Inception3D
 from .tada_convnext import TadaConvNeXt
 
 
@@ -26,11 +30,25 @@ class BaseVideoModel(nn.Module):
         super(BaseVideoModel, self).__init__()
         # the backbone is created according to meta-architectures
         # defined in models/base/backbone.py
-        self.backbone = TadaConvNeXt(cfg)
+        if cfg.MODEL.NAME == 'ConvNeXt_tiny':
+            self.backbone = TadaConvNeXt(cfg)
+        elif cfg.MODEL.NAME == 'S3DG':
+            self.backbone = Inception3D(cfg)
+        else:
+            error_str = 'backbone {} is not supported, ConvNeXt_tiny or S3DG is supported'.format(
+                cfg.MODEL.NAME)
+            raise NotImplementedError(error_str)
 
         # the head is created according to the heads
         # defined in models/module_zoo/heads
-        self.head = BaseHead(cfg)
+        if cfg.VIDEO.HEAD.NAME == 'BaseHead':
+            self.head = BaseHead(cfg)
+        elif cfg.VIDEO.HEAD.NAME == 'AvgHead':
+            self.head = AvgHead(cfg)
+        else:
+            error_str = 'head {} is not supported, BaseHead or AvgHead is supported'.format(
+                cfg.VIDEO.HEAD.NAME)
+            raise NotImplementedError(error_str)
 
     def forward(self, x):
         x = self.backbone(x)
@@ -88,3 +106,29 @@ class BaseHead(nn.Module):
         out = self.activation(out)
         out = out.view(out.shape[0], -1)
         return out, x.view(x.shape[0], -1)
+
+
+class AvgHead(nn.Module):
+    """
+    Constructs average pooling head.
+    """
+
+    def __init__(
+        self,
+        cfg,
+    ):
+        """
+        Args:
+            cfg (Config): global config object.
+        """
+        super(AvgHead, self).__init__()
+        self.cfg = cfg
+        self.global_avg_pool = nn.AdaptiveAvgPool3d(1)
+
+    def forward(self, x):
+        if len(x.shape) == 5:
+            x = self.global_avg_pool(x)
+            # (N, C, T, H, W) -> (N, T, H, W, C).
+            x = x.permute((0, 2, 3, 4, 1))
+        out = x.view(x.shape[0], -1)
+        return out, x.view(x.shape[0], -1)
diff --git a/modelscope/models/cv/action_recognition/s3dg.py b/modelscope/models/cv/action_recognition/s3dg.py
new file mode 100644
index 00000000..46e76892
--- /dev/null
+++ b/modelscope/models/cv/action_recognition/s3dg.py
@@ -0,0 +1,304 @@
+# The implementation is adopted from https://github.com/TengdaHan/CoCLR,
+# made publicly available under the Apache License, Version 2.0 at https://github.com/TengdaHan/CoCLR
+# Copyright 2021-2022 The Alibaba FVI Team Authors. All rights reserved.
+import torch
+import torch.nn as nn
+
+
+class InceptionBaseConv3D(nn.Module):
+    """
+    Constructs basic inception 3D conv.
+    Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py.
+    """
+
+    def __init__(self,
+                 cfg,
+                 in_planes,
+                 out_planes,
+                 kernel_size,
+                 stride,
+                 padding=0):
+        super(InceptionBaseConv3D, self).__init__()
+        self.conv = nn.Conv3d(
+            in_planes,
+            out_planes,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            bias=False)
+        self.bn = nn.BatchNorm3d(out_planes)
+        self.relu = nn.ReLU(inplace=True)
+
+        # init
+        self.conv.weight.data.normal_(
+            mean=0, std=0.01)  # original s3d is truncated normal within 2 std
+        self.bn.weight.data.fill_(1)
+        self.bn.bias.data.zero_()
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        x = self.relu(x)
+        return x
+
+
+class InceptionBlock3D(nn.Module):
+    """
+    Element constructing the S3D/S3DG.
+    See models/base/backbone.py L99-186.
+
+    Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py.
+    """
+
+    def __init__(self, cfg, in_planes, out_planes):
+        super(InceptionBlock3D, self).__init__()
+
+        _gating = cfg.VIDEO.BACKBONE.BRANCH.GATING
+
+        assert len(out_planes) == 6
+        assert isinstance(out_planes, list)
+
+        [
+            num_out_0_0a, num_out_1_0a, num_out_1_0b, num_out_2_0a,
+            num_out_2_0b, num_out_3_0b
+        ] = out_planes
+
+        self.branch0 = nn.Sequential(
+            InceptionBaseConv3D(
+                cfg, in_planes, num_out_0_0a, kernel_size=1, stride=1), )
+        self.branch1 = nn.Sequential(
+            InceptionBaseConv3D(
+                cfg, in_planes, num_out_1_0a, kernel_size=1, stride=1),
+            STConv3d(
+                cfg,
+                num_out_1_0a,
+                num_out_1_0b,
+                kernel_size=3,
+                stride=1,
+                padding=1),
+        )
+        self.branch2 = nn.Sequential(
+            InceptionBaseConv3D(
+                cfg, in_planes, num_out_2_0a, kernel_size=1, stride=1),
+            STConv3d(
+                cfg,
+                num_out_2_0a,
+                num_out_2_0b,
+                kernel_size=3,
+                stride=1,
+                padding=1),
+        )
+        self.branch3 = nn.Sequential(
+            nn.MaxPool3d(kernel_size=(3, 3, 3), stride=1, padding=1),
+            InceptionBaseConv3D(
+                cfg, in_planes, num_out_3_0b, kernel_size=1, stride=1),
+        )
+
+        self.out_channels = sum(
+            [num_out_0_0a, num_out_1_0b, num_out_2_0b, num_out_3_0b])
+
+        self.gating = _gating
+        if _gating:
+            self.gating_b0 = SelfGating(num_out_0_0a)
+            self.gating_b1 = SelfGating(num_out_1_0b)
+            self.gating_b2 = SelfGating(num_out_2_0b)
+            self.gating_b3 = SelfGating(num_out_3_0b)
+
+    def forward(self, x):
+        x0 = self.branch0(x)
+        x1 = self.branch1(x)
+        x2 = self.branch2(x)
+        x3 = self.branch3(x)
+        if self.gating:
+            x0 = self.gating_b0(x0)
+            x1 = self.gating_b1(x1)
+            x2 = self.gating_b2(x2)
+            x3 = self.gating_b3(x3)
+
+        out = torch.cat((x0, x1, x2, x3), 1)
+
+        return out
+
+
+class SelfGating(nn.Module):
+
+    def __init__(self, input_dim):
+        super(SelfGating, self).__init__()
+        self.fc = nn.Linear(input_dim, input_dim)
+
+    def forward(self, input_tensor):
+        """Feature gating as used in S3D-G"""
+        spatiotemporal_average = torch.mean(input_tensor, dim=[2, 3, 4])
+        weights = self.fc(spatiotemporal_average)
+        weights = torch.sigmoid(weights)
+        return weights[:, :, None, None, None] * input_tensor
+
+
+class STConv3d(nn.Module):
+    """
+    Element constructing the S3D/S3DG.
+    See models/base/backbone.py L99-186.
+
+    Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py.
+    """
+
+    def __init__(self,
+                 cfg,
+                 in_planes,
+                 out_planes,
+                 kernel_size,
+                 stride,
+                 padding=0):
+        super(STConv3d, self).__init__()
+        if isinstance(stride, tuple):
+            t_stride = stride[0]
+            stride = stride[-1]
+        else:  # int
+            t_stride = stride
+
+        self.bn_mmt = cfg.BN.MOMENTUM
+        self.bn_eps = float(cfg.BN.EPS)
+        self._construct_branch(cfg, in_planes, out_planes, kernel_size, stride,
+                               t_stride, padding)
+
+    def _construct_branch(self,
+                          cfg,
+                          in_planes,
+                          out_planes,
+                          kernel_size,
+                          stride,
+                          t_stride,
+                          padding=0):
+        self.conv1 = nn.Conv3d(
+            in_planes,
+            out_planes,
+            kernel_size=(1, kernel_size, kernel_size),
+            stride=(1, stride, stride),
+            padding=(0, padding, padding),
+            bias=False)
+        self.conv2 = nn.Conv3d(
+            out_planes,
+            out_planes,
+            kernel_size=(kernel_size, 1, 1),
+            stride=(t_stride, 1, 1),
+            padding=(padding, 0, 0),
+            bias=False)
+
+        self.bn1 = nn.BatchNorm3d(
+            out_planes, eps=self.bn_eps, momentum=self.bn_mmt)
+        self.bn2 = nn.BatchNorm3d(
+            out_planes, eps=self.bn_eps, momentum=self.bn_mmt)
+        self.relu = nn.ReLU(inplace=True)
+
+        # init
+        self.conv1.weight.data.normal_(
+            mean=0, std=0.01)  # original s3d is truncated normal within 2 std
+        self.conv2.weight.data.normal_(
+            mean=0, std=0.01)  # original s3d is truncated normal within 2 std
+        self.bn1.weight.data.fill_(1)
+        self.bn1.bias.data.zero_()
+        self.bn2.weight.data.fill_(1)
+        self.bn2.bias.data.zero_()
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = self.relu(x)
+        return x
+
+
+class Inception3D(nn.Module):
+    """
+    Backbone architecture for I3D/S3DG.
+    Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py.
+    """
+
+    def __init__(self, cfg):
+        """
+        Args:
+            cfg (Config): global config object.
+        """
+        super(Inception3D, self).__init__()
+        _input_channel = cfg.DATA.NUM_INPUT_CHANNELS
+        self._construct_backbone(cfg, _input_channel)
+
+    def _construct_backbone(self, cfg, input_channel):
+        # ------------------- Block 1 -------------------
+        self.Conv_1a = STConv3d(
+            cfg, input_channel, 64, kernel_size=7, stride=2, padding=3)
+
+        self.block1 = nn.Sequential(self.Conv_1a)  # (64, 32, 112, 112)
+
+        # ------------------- Block 2 -------------------
+        self.MaxPool_2a = nn.MaxPool3d(
+            kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))
+        self.Conv_2b = InceptionBaseConv3D(
+            cfg, 64, 64, kernel_size=1, stride=1)
+        self.Conv_2c = STConv3d(
+            cfg, 64, 192, kernel_size=3, stride=1, padding=1)
+
+        self.block2 = nn.Sequential(
+            self.MaxPool_2a,  # (64, 32, 56, 56)
+            self.Conv_2b,  # (64, 32, 56, 56)
+            self.Conv_2c)  # (192, 32, 56, 56)
+
+        # ------------------- Block 3 -------------------
+        self.MaxPool_3a = nn.MaxPool3d(
+            kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))
+        self.Mixed_3b = InceptionBlock3D(
+            cfg, in_planes=192, out_planes=[64, 96, 128, 16, 32, 32])
+        self.Mixed_3c = InceptionBlock3D(
+            cfg, in_planes=256, out_planes=[128, 128, 192, 32, 96, 64])
+
+        self.block3 = nn.Sequential(
+            self.MaxPool_3a,  # (192, 32, 28, 28)
+            self.Mixed_3b,  # (256, 32, 28, 28)
+            self.Mixed_3c)  # (480, 32, 28, 28)
+
+        # ------------------- Block 4 -------------------
+        self.MaxPool_4a = nn.MaxPool3d(
+            kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1))
+        self.Mixed_4b = InceptionBlock3D(
+            cfg, in_planes=480, out_planes=[192, 96, 208, 16, 48, 64])
+        self.Mixed_4c = InceptionBlock3D(
+            cfg, in_planes=512, out_planes=[160, 112, 224, 24, 64, 64])
+        self.Mixed_4d = InceptionBlock3D(
+            cfg, in_planes=512, out_planes=[128, 128, 256, 24, 64, 64])
+        self.Mixed_4e = InceptionBlock3D(
+            cfg, in_planes=512, out_planes=[112, 144, 288, 32, 64, 64])
+        self.Mixed_4f = InceptionBlock3D(
+            cfg, in_planes=528, out_planes=[256, 160, 320, 32, 128, 128])
+
+        self.block4 = nn.Sequential(
+            self.MaxPool_4a,  # (480, 16, 14, 14)
+            self.Mixed_4b,  # (512, 16, 14, 14)
+            self.Mixed_4c,  # (512, 16, 14, 14)
+            self.Mixed_4d,  # (512, 16, 14, 14)
+            self.Mixed_4e,  # (528, 16, 14, 14)
+            self.Mixed_4f)  # (832, 16, 14, 14)
+
+        # ------------------- Block 5 -------------------
+        self.MaxPool_5a = nn.MaxPool3d(
+            kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 0, 0))
+        self.Mixed_5b = InceptionBlock3D(
+            cfg, in_planes=832, out_planes=[256, 160, 320, 32, 128, 128])
+        self.Mixed_5c = InceptionBlock3D(
+            cfg, in_planes=832, out_planes=[384, 192, 384, 48, 128, 128])
+
+        self.block5 = nn.Sequential(
+            self.MaxPool_5a,  # (832, 8, 7, 7)
+            self.Mixed_5b,  # (832, 8, 7, 7)
+            self.Mixed_5c)  # (1024, 8, 7, 7)
+
+    def forward(self, x):
+        if isinstance(x, dict):
+            x = x['video']
+        x = self.block1(x)
+        x = self.block2(x)
+        x = self.block3(x)
+        x = self.block4(x)
+        x = self.block5(x)
+        return x
diff --git a/modelscope/models/cv/action_recognition/tada_convnext.py b/modelscope/models/cv/action_recognition/tada_convnext.py
index 379b5271..b1de7af8 100644
--- a/modelscope/models/cv/action_recognition/tada_convnext.py
+++ b/modelscope/models/cv/action_recognition/tada_convnext.py
@@ -1,3 +1,7 @@
+# The implementation is adopted from https://github.com/facebookresearch/ConvNeXt,
+# made publicly available under the MIT License at https://github.com/facebookresearch/ConvNeXt
+# Copyright 2021-2022 The Alibaba FVI Team Authors. All rights reserved.
+
 import math
 
 import torch
diff --git a/modelscope/models/cv/action_recognition/temporal_patch_shift_transformer.py b/modelscope/models/cv/action_recognition/temporal_patch_shift_transformer.py
new file mode 100644
index 00000000..46596afd
--- /dev/null
+++ b/modelscope/models/cv/action_recognition/temporal_patch_shift_transformer.py
@@ -0,0 +1,1198 @@
+# Part of the implementation is borrowed and modified from Video Swin Transformer,
+# publicly available at https://github.com/SwinTransformer/Video-Swin-Transformer
+
+from abc import ABCMeta, abstractmethod
+from functools import lru_cache, reduce
+from operator import mul
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+import torchvision.transforms as T
+from einops import rearrange
+from timm.models.layers import DropPath, Mlp, trunc_normal_
+
+from modelscope.models import TorchModel
+
+
+def normal_init(module, mean=0., std=1., bias=0.):
+    if hasattr(module, 'weight') and module.weight is not None:
+        nn.init.normal_(module.weight, mean, std)  # weight ~ N(mean, std)
+    if hasattr(module, 'bias') and module.bias is not None:
+        nn.init.constant_(module.bias, bias)  # bias filled with `bias`
+
+
+def window_partition(x, window_size):
+    """ Split a (B, D, H, W, C) tensor into non-overlapping 3D windows.
+    Args:
+        x: (B, D, H, W, C); the view below needs D, H and W to be exact
+            multiples of the matching window_size entry
+        window_size (tuple[int]): window size (Wd, Wh, Ww)
+    Returns:
+        windows: (B*num_windows, Wd*Wh*Ww, C)
+    """
+    B, D, H, W, C = x.shape
+    x = x.view(B, D // window_size[0], window_size[0], H // window_size[1],
+               window_size[1], W // window_size[2], window_size[2], C)
+    windows = x.permute(0, 1, 3, 5, 2, 4, 6,
+                        7).contiguous().view(-1, reduce(mul, window_size), C)
+    return windows
+
+
+def window_reverse(windows, window_size, B, D, H, W):
+    """ Inverse of window_partition: merge windows back into one tensor.
+    Args:
+        windows: viewed below as (B, D//Wd, H//Wh, W//Ww, Wd, Wh, Ww, -1)
+        window_size (tuple[int]): Window size (Wd, Wh, Ww)
+        B (int): Batch size
+        D (int), H (int), W (int): Depth, height and width of the output
+
+    Returns:
+        x: (B, D, H, W, C)
+    """
+    x = windows.view(B, D // window_size[0], H // window_size[1],
+                     W // window_size[2], window_size[0], window_size[1],
+                     window_size[2], -1)
+    x = x.permute(0, 1, 4, 2, 5, 3, 6, 7).contiguous().view(B, D, H, W, -1)
+    return x
+
+
+def get_window_size(x_size, window_size, shift_size=None):
+    use_window_size = list(window_size)  # mutable copies of the inputs
+    if shift_size is not None:
+        use_shift_size = list(shift_size)
+    for i in range(len(x_size)):
+        if x_size[i] <= window_size[i]:
+            use_window_size[i] = x_size[i]  # clamp window to the input size
+            if shift_size is not None:
+                use_shift_size[i] = 0  # no shift on a clamped axis
+    # Returns the window tuple alone, or (window, shift) if shift_size given.
+    if shift_size is None:
+        return tuple(use_window_size)
+    else:
+        return tuple(use_window_size), tuple(use_shift_size)
+
+
+class WindowAttention3D(nn.Module):
+    """ This is PyTorch impl of TPS
+
+    Window based multi-head self attention (W-MSA) module with relative position bias.
+    The coordinates of patches and patches are shifted together using Pattern C.
+    It supports both of shifted and non-shifted window.
+
+    Args:
+        dim (int): Number of input channels.
+        window_size (tuple[int]): Ignored: __init__ overrides it with (16, 7, 7).
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: False
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
+        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+        shift (bool, optional): If True, conduct shift operation. Default: False
+        shift_type (str, optional): shift operation type, either using 'psm' or 'tsm'. Default: 'psm'
+    """
+
+    def __init__(self,
+                 dim,
+                 window_size,
+                 num_heads,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 shift=False,
+                 shift_type='psm'):
+
+        super().__init__()
+        self.dim = dim
+        window_size = (16, 7, 7)  # NOTE(review): overrides the argument
+        self.window_size = window_size  # Wd, Wh, Ww
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+        self.shift = shift
+        self.shift_type = shift_type
+
+        # define a parameter table of relative position bias
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros(
+                np.prod([2 * ws - 1 for ws in window_size]),
+                num_heads))  # 2*Wd-1 * 2*Wh-1 * 2*Ww-1, nH
+
+        # get pair-wise relative position index for each token inside the window
+        coords_d = torch.arange(self.window_size[0])
+        coords_h = torch.arange(self.window_size[1])
+        coords_w = torch.arange(self.window_size[2])
+        coords = torch.stack(
+            torch.meshgrid(coords_d, coords_h, coords_w,
+                           indexing='ij'))  # 3, Wd, Wh, Ww
+        # Keep an unshifted copy; `coords` itself gets the Pattern C rolls
+        coords_old = coords.clone()
+
+        # Pattern C: roll 8 of the 9 (h % 3, w % 3) groups along dim 1 (Wd)
+        coords[:, :, 0::3, 0::3] = torch.roll(
+            coords[:, :, 0::3, 0::3], shifts=-4, dims=1)
+        coords[:, :, 0::3, 1::3] = torch.roll(
+            coords[:, :, 0::3, 1::3], shifts=1, dims=1)
+        coords[:, :, 0::3, 2::3] = torch.roll(
+            coords[:, :, 0::3, 2::3], shifts=2, dims=1)
+        coords[:, :, 1::3, 2::3] = torch.roll(
+            coords[:, :, 1::3, 2::3], shifts=3, dims=1)
+        coords[:, :, 1::3, 0::3] = torch.roll(
+            coords[:, :, 1::3, 0::3], shifts=-1, dims=1)
+        coords[:, :, 2::3, 0::3] = torch.roll(
+            coords[:, :, 2::3, 0::3], shifts=-2, dims=1)
+        coords[:, :, 2::3, 1::3] = torch.roll(
+            coords[:, :, 2::3, 1::3], shifts=-3, dims=1)
+        coords[:, :, 2::3, 2::3] = torch.roll(
+            coords[:, :, 2::3, 2::3], shifts=4, dims=1)
+
+        coords_flatten = torch.flatten(coords, 1)  # 3, Wd*Wh*Ww
+        coords_old_flatten = torch.flatten(coords_old, 1)
+        relative_coords = coords_flatten[:, :,
+                                         None] - coords_flatten[:,
+                                                                None, :]  # 3, Wd*Wh*Ww, Wd*Wh*Ww
+        relative_coords_old = coords_old_flatten[:, :,
+                                                 None] - coords_old_flatten[:,
+                                                                            None, :]  # 3, Wd*Wh*Ww, Wd*Wh*Ww
+
+        relative_coords = relative_coords.permute(
+            1, 2, 0).contiguous()  # Wd*Wh*Ww, Wd*Wh*Ww, 3
+        relative_coords_old = relative_coords_old.permute(
+            1, 2, 0).contiguous()  # Wd*Wh*Ww, Wd*Wh*Ww, 3
+
+        relative_coords[:, :,
+                        0] += self.window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 2] += self.window_size[2] - 1
+
+        relative_coords_old[:, :, 0] += self.window_size[
+            0] - 1  # shift to start from 0
+        relative_coords_old[:, :, 1] += self.window_size[1] - 1
+        relative_coords_old[:, :, 2] += self.window_size[2] - 1
+
+        relative_coords[:, :, 0] *= (2 * self.window_size[1]
+                                     - 1) * (2 * self.window_size[2] - 1)
+        relative_coords[:, :, 1] *= (2 * self.window_size[2] - 1)
+
+        relative_coords_old[:, :, 0] *= (2 * self.window_size[1]
+                                         - 1) * (2 * self.window_size[2] - 1)
+        relative_coords_old[:, :, 1] *= (2 * self.window_size[2] - 1)
+
+        relative_position_index = relative_coords.sum(-1)  # Wd*Wh*Ww, Wd*Wh*Ww
+        # The [::Wd] slices below keep only the (frame d, frame 0) index blocks.
+        relative_position_index_old = relative_coords_old.sum(-1)
+        relative_position_index = relative_position_index.view(
+            window_size[0], window_size[1] * window_size[2], window_size[0],
+            window_size[1] * window_size[2]).permute(0, 2, 1, 3).reshape(
+                window_size[0] * window_size[0],
+                window_size[1] * window_size[2],
+                window_size[1] * window_size[2])[::window_size[0], :, :]
+
+        relative_position_index_old = relative_position_index_old.view(
+            window_size[0], window_size[1] * window_size[2], window_size[0],
+            window_size[1] * window_size[2]).permute(0, 2, 1, 3).reshape(
+                window_size[0] * window_size[0],
+                window_size[1] * window_size[2],
+                window_size[1] * window_size[2])[::window_size[0], :, :]
+
+        self.register_buffer('relative_position_index',
+                             relative_position_index)
+        self.register_buffer('relative_position_index_old',
+                             relative_position_index_old)
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        trunc_normal_(self.relative_position_bias_table, std=.02)
+        self.softmax = nn.Softmax(dim=-1)
+        # 'psm' gets a forward and an inverse shift op; 'tsm' a forward one only.
+        if self.shift and self.shift_type == 'psm':
+            self.shift_op = PatchShift(False, 1)
+            self.shift_op_back = PatchShift(True, 1)
+        elif self.shift and self.shift_type == 'tsm':
+            self.shift_op = TemporalShift(8)
+
+    def forward(self, x, mask=None, batch_size=8, frame_len=8):
+        """ Forward function; batch_size/frame_len set the per-frame views.
+        Args:
+            x: input features with shape of (num_windows*B, N, C)
+            mask: additive mask with shape of (num_windows, N, N) or None
+        """
+        B_, N, C = x.shape
+        if self.shift:
+            x = x.view(B_, N, self.num_heads,
+                       C // self.num_heads).permute(0, 2, 1, 3)
+            # shift in (B_, nH, N, C // nH) layout, then restore (B_, N, C)
+            x = self.shift_op(x, batch_size, frame_len)
+            x = x.permute(0, 2, 1, 3).reshape(B_, N, C)
+        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads,
+                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]  # each B_, nH, N, C // nH
+
+        q = q * self.scale
+        attn = q @ k.transpose(-2, -1)
+        # 'psm' uses the Pattern C index table, everything else the plain one
+        if self.shift and self.shift_type == 'psm':
+            relative_position_bias = self.relative_position_bias_table[
+                self.relative_position_index[:].reshape(-1), :].reshape(
+                    frame_len, N, N, -1)  # Frames, N, N, nH
+        else:
+            relative_position_bias = self.relative_position_bias_table[
+                self.relative_position_index_old[:].reshape(-1), :].reshape(
+                    frame_len, N, N, -1)  # Frames, N, N, nH
+
+        relative_position_bias = relative_position_bias.permute(
+            0, 3, 1, 2).contiguous()  # Frames, nH, Wd*Wh*Ww, Wd*Wh*Ww
+
+        attn = attn.view(
+            batch_size, frame_len, -1, self.num_heads, N, N).permute(
+                0,
+                2, 1, 3, 4, 5) + relative_position_bias.unsqueeze(0).unsqueeze(
+                    1)  # batch, -1, Frames, nH, N, N
+        attn = attn.permute(0, 2, 1, 3, 4, 5).view(-1, self.num_heads, N, N)
+
+        if mask is not None:
+            nW = mask.shape[0]
+            attn = attn.view(B_ // nW, nW, self.num_heads, N,
+                             N) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, N, N)
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+
+        attn = self.attn_drop(attn)
+        # Shift back for psm
+        if self.shift and self.shift_type == 'psm':
+            x = self.shift_op_back(attn @ v, batch_size,
+                                   frame_len).transpose(1,
+                                                        2).reshape(B_, N, C)
+        else:
+            x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class PatchShift(nn.Module):
+    """ This is PyTorch impl of TPS
+
+    The patches are shifted using Pattern C.
+
+    It supports both of shifted and shift back.
+
+    Args:
+        inv (bool): whether using inverse shifted (shift back)
+        ratio (float): fraction of heads to be shifted, patch shift using 1.0
+    """
+
+    def __init__(self, inv=False, ratio=1):
+        super(PatchShift, self).__init__()
+        self.inv = inv
+        self.ratio = ratio
+        # Disabled debug output, kept for reference:
+        # print('=> Using inverse PatchShift, ratio {}, tps'.format(ratio))
+        # else:
+        # print('=> Using bayershift, ratio {}, tps'.format(ratio))
+
+    def forward(self, x, batch_size, frame_len):
+        x = self.shift(
+            x,
+            inv=self.inv,
+            ratio=self.ratio,
+            batch_size=batch_size,
+            frame_len=frame_len)
+        return x
+
+    @staticmethod
+    def shift(x, inv=False, ratio=0.5, batch_size=8, frame_len=8):
+        B, num_heads, N, c = x.size()
+        fold = int(num_heads * ratio)  # number of leading heads to shift
+        feat = x
+        feat = feat.view(batch_size, frame_len, -1, num_heads, 7, 7, c)  # NOTE: hard-codes 7x7, i.e. assumes N == 49
+        out = feat.clone()
+        multiplier = 1
+        stride = 1
+        if inv:
+            multiplier = -1  # inverse shift: negate every roll below
+
+        # Pattern C: roll 8 of the 9 (row % 3, col % 3) groups along dim 1
+        out[:, :, :, :fold, 0::3, 0::3, :] = torch.roll(
+            feat[:, :, :, :fold, 0::3, 0::3, :],
+            shifts=-4 * multiplier * stride,
+            dims=1)
+        out[:, :, :, :fold, 0::3, 1::3, :] = torch.roll(
+            feat[:, :, :, :fold, 0::3, 1::3, :],
+            shifts=multiplier * stride,
+            dims=1)
+        out[:, :, :, :fold, 1::3, 0::3, :] = torch.roll(
+            feat[:, :, :, :fold, 1::3, 0::3, :],
+            shifts=-multiplier * stride,
+            dims=1)
+        out[:, :, :, :fold, 0::3, 2::3, :] = torch.roll(
+            feat[:, :, :, :fold, 0::3, 2::3, :],
+            shifts=2 * multiplier * stride,
+            dims=1)
+        out[:, :, :, :fold, 2::3, 0::3, :] = torch.roll(
+            feat[:, :, :, :fold, 2::3, 0::3, :],
+            shifts=-2 * multiplier * stride,
+            dims=1)
+        out[:, :, :, :fold, 1::3, 2::3, :] = torch.roll(
+            feat[:, :, :, :fold, 1::3, 2::3, :],
+            shifts=3 * multiplier * stride,
+            dims=1)
+        out[:, :, :, :fold, 2::3, 1::3, :] = torch.roll(
+            feat[:, :, :, :fold, 2::3, 1::3, :],
+            shifts=-3 * multiplier * stride,
+            dims=1)
+        out[:, :, :, :fold, 2::3, 2::3, :] = torch.roll(
+            feat[:, :, :, :fold, 2::3, 2::3, :],
+            shifts=4 * multiplier * stride,
+            dims=1)
+        # the (1::3, 1::3) group is never rolled and keeps the cloned values
+        out = out.view(B, num_heads, N, c)
+        return out
+
+
+class TemporalShift(nn.Module):
+    """ This is PyTorch impl of TPS
+
+    The temporal channel shift.
+
+    The code is adopted from TSM: Temporal Shift Module for Efficient Video Understanding. ICCV19
+
+    https://github.com/mit-han-lab/temporal-shift-module/blob/master/ops/temporal_shift.py
+
+    Args:
+        n_div (int): each shifted fold holds c // n_div channels; two folds shift.
+    """
+
+    def __init__(self, n_div=8):
+        super(TemporalShift, self).__init__()
+        self.fold_div = n_div
+
+    def forward(self, x, batch_size, frame_len):
+        x = self.shift(
+            x,
+            fold_div=self.fold_div,
+            batch_size=batch_size,
+            frame_len=frame_len)
+        return x
+
+    @staticmethod
+    def shift(x, fold_div=8, batch_size=8, frame_len=8):
+        B, num_heads, N, c = x.size()
+        fold = c // fold_div
+        feat = x
+        feat = feat.view(batch_size, frame_len, -1, num_heads, N, c)
+        out = feat.clone()
+        # out is a clone, so boundary frames keep their own values (no zeros)
+        out[:, 1:, :, :, :, :fold] = feat[:, :-1, :, :, :, :fold]  # frame t <- frame t-1
+        out[:, :-1, :, :, :,
+            fold:2 * fold] = feat[:, 1:, :, :, :, fold:2 * fold]  # frame t <- frame t+1
+
+        out = out.view(B, num_heads, N, c)
+
+        return out
+
+
+class SwinTransformerBlock3D(nn.Module):
+    """ Swin Transformer Block from Video Swin Transformer.
+
+    Args:
+        dim (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (tuple[int]): Window size.
+        shift_size (tuple[int]): Shift size for SW-MSA.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 window_size=(2, 7, 7),
+                 shift_size=(0, 0, 0),
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm,
+                 use_checkpoint=False,  # checkpoint both forward parts
+                 shift=False,  # forwarded to WindowAttention3D
+                 shift_type='psm'):  # forwarded to WindowAttention3D
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        self.use_checkpoint = use_checkpoint
+        self.shift = shift
+        self.shift_type = shift_type
+        # every shift component must satisfy 0 <= shift < window
+        assert 0 <= self.shift_size[0] < self.window_size[
+            0], 'shift_size must in 0-window_size'
+        assert 0 <= self.shift_size[1] < self.window_size[
+            1], 'shift_size must in 0-window_size'
+        assert 0 <= self.shift_size[2] < self.window_size[
+            2], 'shift_size must in 0-window_size'
+
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention3D(
+            dim,
+            window_size=self.window_size,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            shift=self.shift,
+            shift_type=self.shift_type)
+
+        self.drop_path = DropPath(
+            drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop)
+
+    def forward_part1(self, x, mask_matrix):
+        B, D, H, W, C = x.shape
+        window_size, shift_size = get_window_size((D, H, W), self.window_size,
+                                                  self.shift_size)
+        # Attention half of the block: norm -> pad -> roll -> attn -> undo.
+        x = self.norm1(x)
+        # pad feature maps to multiples of window size
+        pad_l = pad_t = pad_d0 = 0
+        pad_d1 = (window_size[0] - D % window_size[0]) % window_size[0]
+        pad_b = (window_size[1] - H % window_size[1]) % window_size[1]
+        pad_r = (window_size[2] - W % window_size[2]) % window_size[2]
+        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b, pad_d0, pad_d1))
+        _, Dp, Hp, Wp, _ = x.shape
+        # cyclic shift
+        if any(i > 0 for i in shift_size):
+            shifted_x = torch.roll(
+                x,
+                shifts=(-shift_size[0], -shift_size[1], -shift_size[2]),
+                dims=(1, 2, 3))
+            attn_mask = mask_matrix
+        else:
+            shifted_x = x
+            attn_mask = None
+        # partition windows
+        x_windows = window_partition(shifted_x,
+                                     window_size)  # B*nW, Wd*Wh*Ww, C
+        # W-MSA/SW-MSA
+        attn_windows = self.attn(
+            x_windows, mask=attn_mask, batch_size=B,
+            frame_len=D)  # B*nW, Wd*Wh*Ww, C
+        # merge windows
+        attn_windows = attn_windows.view(-1, *(window_size + (C, )))
+        shifted_x = window_reverse(attn_windows, window_size, B, Dp, Hp,
+                                   Wp)  # B D' H' W' C
+        # reverse cyclic shift
+        if any(i > 0 for i in shift_size):
+            x = torch.roll(
+                shifted_x,
+                shifts=(shift_size[0], shift_size[1], shift_size[2]),
+                dims=(1, 2, 3))
+        else:
+            x = shifted_x
+        # crop away any padding added above
+        if pad_d1 > 0 or pad_r > 0 or pad_b > 0:
+            x = x[:, :D, :H, :W, :].contiguous()
+        return x
+
+    def forward_part2(self, x):
+        return self.drop_path(self.mlp(self.norm2(x)))
+
+    def forward(self, x, mask_matrix):
+        """ Forward function.
+
+        Args:
+            x: Input feature, tensor size (B, D, H, W, C).
+            mask_matrix: Attention mask for cyclic shift.
+        """
+
+        shortcut = x
+        if self.use_checkpoint:
+            x = checkpoint.checkpoint(self.forward_part1, x, mask_matrix)
+        else:
+            x = self.forward_part1(x, mask_matrix)
+        x = shortcut + self.drop_path(x)
+        # MLP half; forward_part2 already applies drop_path to its output
+        if self.use_checkpoint:
+            x = x + checkpoint.checkpoint(self.forward_part2, x)
+        else:
+            x = x + self.forward_part2(x)
+
+        return x
+
+
+class PatchMerging(nn.Module):
+    """ Patch Merging Layer from Video Swin Transformer.
+    Args:
+        dim (int): Number of input channels.
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)  # 4C -> 2C
+        self.norm = norm_layer(4 * dim)
+
+    def forward(self, x):
+        """ Forward function.
+
+        Args:
+            x: Input feature, tensor size (B, D, H, W, C).
+        """
+        B, D, H, W, C = x.shape
+        # D is untouched; only H and W are halved by the 2x2 gather below.
+        # padding
+        pad_input = (H % 2 == 1) or (W % 2 == 1)
+        if pad_input:
+            x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))  # pad odd H/W by one
+
+        x0 = x[:, :, 0::2, 0::2, :]  # B D H/2 W/2 C
+        x1 = x[:, :, 1::2, 0::2, :]  # B D H/2 W/2 C
+        x2 = x[:, :, 0::2, 1::2, :]  # B D H/2 W/2 C
+        x3 = x[:, :, 1::2, 1::2, :]  # B D H/2 W/2 C
+        x = torch.cat([x0, x1, x2, x3], -1)  # B D H/2 W/2 4*C
+
+        x = self.norm(x)
+        x = self.reduction(x)
+
+        return x
+
+
+@lru_cache()  # memoised on the full argument tuple
+def compute_mask(D, H, W, window_size, shift_size, device):
+    img_mask = torch.zeros((1, D, H, W, 1), device=device)  # 1 Dp Hp Wp 1
+    cnt = 0
+    for d in slice(-window_size[0]), slice(-window_size[0],
+                                           -shift_size[0]), slice(
+                                               -shift_size[0], None):
+        for h in slice(-window_size[1]), slice(-window_size[1],
+                                               -shift_size[1]), slice(
+                                                   -shift_size[1], None):
+            for w in slice(-window_size[2]), slice(-window_size[2],
+                                                   -shift_size[2]), slice(
+                                                       -shift_size[2], None):
+                img_mask[:, d, h, w, :] = cnt  # one label per 3x3x3 region
+                cnt += 1
+    mask_windows = window_partition(img_mask,
+                                    window_size)  # nW, ws[0]*ws[1]*ws[2], 1
+    mask_windows = mask_windows.squeeze(-1)  # nW, ws[0]*ws[1]*ws[2]
+    attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)  # nW, N, N
+    attn_mask = attn_mask.masked_fill(attn_mask != 0,
+                                      float(-100.0)).masked_fill(
+                                          attn_mask == 0, float(0.0))  # -100 across regions, else 0
+    return attn_mask
+
+
+class BasicLayer(nn.Module):
+    """ A basic Swin Transformer layer for one stage from Video Swin Transformer.
+
+    Args:
+        dim (int): Number of feature channels
+        depth (int): Depths of this stage.
+        num_heads (int): Number of attention head.
+        window_size (tuple[int]): Local window size. Default: (1,7,7).
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: False
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float | list[float], optional): Stochastic depth rate. Default: 0.0
+        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
+    """
+
+    def __init__(self,
+                 dim,
+                 depth,
+                 num_heads,
+                 window_size=(1, 7, 7),
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 norm_layer=nn.LayerNorm,
+                 downsample=None,
+                 use_checkpoint=False,  # passed to every block
+                 shift_type='psm'):  # 'psm' or 'tsm'
+        super().__init__()
+        self.window_size = window_size
+        self.shift_size = tuple(i // 2 for i in window_size)
+        self.depth = depth
+        self.use_checkpoint = use_checkpoint
+        self.shift_type = shift_type
+        # With shift_type 'psm', even blocks use 'tsm' and odd blocks 'psm'.
+        # build blocks
+        self.blocks = nn.ModuleList([
+            SwinTransformerBlock3D(
+                dim=dim,
+                num_heads=num_heads,
+                window_size=window_size,
+                shift_size=(0, 0, 0) if (i % 2 == 0) else self.shift_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop,
+                attn_drop=attn_drop,
+                drop_path=drop_path[i]
+                if isinstance(drop_path, list) else drop_path,
+                norm_layer=norm_layer,
+                use_checkpoint=use_checkpoint,
+                shift=True,
+                shift_type='tsm' if (i % 2 == 0 and self.shift_type == 'psm')
+                or self.shift_type == 'tsm' else 'psm',
+            ) for i in range(depth)
+        ])
+
+        self.downsample = downsample
+        if self.downsample is not None:
+            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
+
+    def forward(self, x):
+        """ Forward function.
+
+        Args:
+            x: Input feature, tensor size (B, C, D, H, W).
+        """
+        # calculate attention mask for SW-MSA
+        B, C, D, H, W = x.shape
+        window_size, shift_size = get_window_size((D, H, W), self.window_size,
+                                                  self.shift_size)
+        x = rearrange(x, 'b c d h w -> b d h w c')
+        Dp = int(np.ceil(D / window_size[0])) * window_size[0]
+        Hp = int(np.ceil(H / window_size[1])) * window_size[1]
+        Wp = int(np.ceil(W / window_size[2])) * window_size[2]
+        attn_mask = compute_mask(Dp, Hp, Wp, window_size, shift_size, x.device)
+        for blk in self.blocks:
+            x = blk(x, attn_mask)
+        x = x.view(B, D, H, W, -1)
+        # optional downsample, then back to channels-first (b c d h w)
+        if self.downsample is not None:
+            x = self.downsample(x)
+        x = rearrange(x, 'b d h w c -> b c d h w')
+        return x
+
+
+class PatchEmbed3D(nn.Module):
+    """ Video to Patch Embedding from Video Swin Transformer.
+
+    Args:
+        patch_size (tuple[int]): Patch token size. Default: (2,4,4).
+        in_chans (int): Number of input video channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Module, optional): Normalization layer. Default: None
+    """
+
+    def __init__(self,
+                 patch_size=(2, 4, 4),
+                 in_chans=3,
+                 embed_dim=96,
+                 norm_layer=None):
+        super().__init__()
+        self.patch_size = patch_size
+
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        # non-overlapping patches: kernel size and stride both equal patch_size
+        self.proj = nn.Conv3d(
+            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+        if norm_layer is not None:
+            self.norm = norm_layer(embed_dim)
+        else:
+            self.norm = None
+
+    def forward(self, x):
+        """Pad a 5-D input to patch multiples, project, optionally norm."""
+        # padding
+        _, _, D, H, W = x.size()
+        if W % self.patch_size[2] != 0:
+            x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2]))
+        if H % self.patch_size[1] != 0:
+            x = F.pad(x,
+                      (0, 0, 0, self.patch_size[1] - H % self.patch_size[1]))
+        if D % self.patch_size[0] != 0:
+            x = F.pad(
+                x,
+                (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0]))
+
+        x = self.proj(x)  # B C D Wh Ww
+        if self.norm is not None:
+            D, Wh, Ww = x.size(2), x.size(3), x.size(4)
+            x = x.flatten(2).transpose(1, 2)  # B, D*Wh*Ww, C for the norm
+            x = self.norm(x)
+            x = x.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww)
+
+        return x
+
+
+class SwinTransformer2D_TPS(nn.Module):
+    """
+        Code is adopted from Video Swin Transformer.
+
+    Args:
+        patch_size (int | tuple(int)): Patch size. Default: (4,4,4).
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        depths (tuple[int]): Depths of each Swin Transformer stage.
+        num_heads (tuple[int]): Number of attention head of each stage.
+        window_size (tuple[int]): Window size. Default: (2, 7, 7).
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
+        drop_rate (float): Dropout rate.
+        attn_drop_rate (float): Attention dropout rate. Default: 0.
+        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
+        norm_layer: Normalization layer. Default: nn.LayerNorm.
+        patch_norm (bool): If True, add normalization after patch embedding. Default: False.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+    """
+
+    def __init__(self,
+                 pretrained=None,
+                 pretrained2d=True,
+                 patch_size=(4, 4, 4),
+                 in_chans=3,
+                 embed_dim=96,
+                 depths=[2, 2, 6, 2],
+                 num_heads=[3, 6, 12, 24],
+                 window_size=(2, 7, 7),
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.2,
+                 norm_layer=nn.LayerNorm,
+                 patch_norm=False,
+                 frozen_stages=-1,
+                 use_checkpoint=False):
+        super().__init__()
+        # pretrained / pretrained2d are only stored here, nothing is loaded.
+        self.pretrained = pretrained
+        self.pretrained2d = pretrained2d
+        self.num_layers = len(depths)
+        self.embed_dim = embed_dim
+        self.patch_norm = patch_norm
+        self.frozen_stages = frozen_stages
+        self.window_size = window_size
+        self.patch_size = patch_size
+
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed3D(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None)
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # stochastic depth
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
+        ]  # stochastic depth decay rule
+
+        # build layers
+        self.layers = nn.ModuleList()
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                dim=int(embed_dim * 2**i_layer),
+                depth=depths[i_layer],
+                num_heads=num_heads[i_layer],
+                window_size=window_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                norm_layer=norm_layer,
+                downsample=PatchMerging
+                if i_layer < self.num_layers - 1 else None,
+                use_checkpoint=use_checkpoint,
+                shift_type='psm')
+            self.layers.append(layer)
+        # channel width of the final stage (dim doubles with each stage index)
+        self.num_features = int(embed_dim * 2**(self.num_layers - 1))
+
+        # add a norm layer for each output
+        self.norm = norm_layer(self.num_features)
+        # apply the frozen_stages setting once, at construction time
+        self._freeze_stages()
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.requires_grad = False
+
+        if self.frozen_stages >= 1:
+            self.pos_drop.eval()
+            for i in range(0, self.frozen_stages):
+                m = self.layers[i]
+                m.eval()
+                for param in m.parameters():
+                    param.requires_grad = False
+
+    def inflate_weights(self):
+        """Inflate the swin2d parameters to swin3d.
+
+        The differences between swin3d and swin2d mainly lie in an extra
+        axis. To utilize the pretrained parameters in 2d model,
+        the weight of swin2d models should be inflated to fit in the shapes of
+        the 3d counterpart.
+
+        Takes no arguments: the checkpoint is read from ``self.pretrained``
+        (its ``'model'`` entry) and the inflated weights are loaded into
+        this module with ``strict=False``.
+        """
+        checkpoint = torch.load(self.pretrained, map_location='cpu')
+        state_dict = checkpoint['model']
+
+        # delete relative_position_index since we always re-init it
+        relative_position_index_keys = [
+            k for k in state_dict.keys() if 'relative_position_index' in k
+        ]
+        for k in relative_position_index_keys:
+            del state_dict[k]
+
+        # delete attn_mask since we always re-init it
+        attn_mask_keys = [k for k in state_dict.keys() if 'attn_mask' in k]
+        for k in attn_mask_keys:
+            del state_dict[k]
+
+        state_dict['patch_embed.proj.weight'] = state_dict[
+            'patch_embed.proj.weight'].unsqueeze(2).repeat(
+                1, 1, self.patch_size[0], 1, 1) / self.patch_size[0]
+
+        # bicubic interpolate relative_position_bias_table if not match
+        relative_position_bias_table_keys = [
+            k for k in state_dict.keys() if 'relative_position_bias_table' in k
+        ]
+        for k in relative_position_bias_table_keys:
+            relative_position_bias_table_pretrained = state_dict[k]
+            relative_position_bias_table_current = self.state_dict()[k]
+            L1, nH1 = relative_position_bias_table_pretrained.size()
+            L2, nH2 = relative_position_bias_table_current.size()
+            L2 = (2 * self.window_size[1] - 1) * (2 * self.window_size[2] - 1)
+            # NOTE(review): the table is tiled for a hard-coded temporal extent
+            # of 16 (2 * wd - 1 copies), not self.window_size[0] - confirm.
+            wd = 16
+            if nH1 != nH2:
+                print(f'Error in loading {k}, passing')
+            else:
+                if L1 != L2:
+                    S1 = int(L1**0.5)
+                    relative_position_bias_table_pretrained_resized = torch.nn.functional.interpolate(
+                        relative_position_bias_table_pretrained.permute(
+                            1, 0).view(1, nH1, S1, S1),
+                        size=(2 * self.window_size[1] - 1,
+                              2 * self.window_size[2] - 1),
+                        mode='bicubic')
+                    relative_position_bias_table_pretrained = relative_position_bias_table_pretrained_resized.view(
+                        nH2, L2).permute(1, 0)
+            state_dict[k] = relative_position_bias_table_pretrained.repeat(
+                2 * wd - 1, 1)
+
+        msg = self.load_state_dict(state_dict, strict=False)
+        print(msg)
+        print(f"=> loaded successfully '{self.pretrained}'")
+        del checkpoint
+        torch.cuda.empty_cache()
+
+    def init_weights(self, pretrained=None):
+        """Initialize the weights in backbone.
+
+        Args:
+            pretrained (str, optional): Path to pre-trained weights; when
+                given it overrides ``self.pretrained``. Defaults to None.
+        """
+
+        def _init_weights(m):
+            if isinstance(m, nn.Linear):
+                trunc_normal_(m.weight, std=.02)
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.LayerNorm):
+                nn.init.constant_(m.bias, 0)
+                nn.init.constant_(m.weight, 1.0)
+
+        if pretrained:
+            self.pretrained = pretrained
+        if isinstance(self.pretrained, str):
+            self.apply(_init_weights)
+            print(f'load model from: {self.pretrained}')
+
+            if self.pretrained2d:
+                self.inflate_weights()  # inflate 2D weights into the 3D model
+            else:
+                # torch has no load_checkpoint(); load the 3D checkpoint here.
+                ckpt = torch.load(self.pretrained, map_location='cpu')
+                ckpt = ckpt.get('state_dict', ckpt.get('model', ckpt))
+                self.load_state_dict(ckpt, strict=False)
+        elif self.pretrained is None:
+            self.apply(_init_weights)
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+    def forward(self, x):
+        """Forward function."""
+        x = self.patch_embed(x)
+
+        x = self.pos_drop(x)
+
+        for layer in self.layers:
+            x = layer(x.contiguous())
+
+        x = rearrange(x, 'n c d h w -> n d h w c')
+        x = self.norm(x)
+        x = rearrange(x, 'n d h w c -> n c d h w')
+
+        return x
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keep layers freezed."""
+        super(SwinTransformer2D_TPS, self).train(mode)
+        self._freeze_stages()
+
+
+def top_k_accuracy(scores, labels, topk=(1, )):
+    """Calculate top k accuracy score from mmaction.
+
+    Args:
+        scores (list[np.ndarray]): Prediction scores for each class.
+        labels (list[int]): Ground truth labels.
+        topk (tuple[int]): K value for top_k_accuracy. Default: (1, ).
+
+    Returns:
+        list[float]: Top k accuracy score for each k.
+    """
+    res = []
+    labels = np.array(labels)[:, np.newaxis]
+    for k in topk:
+        max_k_preds = np.argsort(scores, axis=1)[:, -k:][:, ::-1]
+        match_array = np.logical_or.reduce(max_k_preds == labels, axis=1)
+        topk_acc_score = match_array.sum() / match_array.shape[0]
+        res.append(topk_acc_score)
+
+    return res
+
+
+class BaseHead(nn.Module, metaclass=ABCMeta):
+    """Base class for head from mmaction.
+
+    All Head should subclass it.
+    All subclass should overwrite:
+    - Methods:``init_weights``, initializing weights in some modules.
+    - Methods:``forward``, supporting to forward both for training and testing.
+
+    Args:
+        num_classes (int): Number of classes to be classified.
+        in_channels (int): Number of channels in input feature.
+        loss_cls (dict): Config for building loss.
+            Default: dict(type='CrossEntropyLoss', loss_weight=1.0).
+        multi_class (bool): Determines whether it is a multi-class
+            recognition task. Default: False.
+        label_smooth_eps (float): Epsilon used in label smooth.
+            Reference: arxiv.org/abs/1906.02629. Default: 0.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 loss_cls=dict(type='CrossEntropyLoss', loss_weight=1.0),
+                 multi_class=False,
+                 label_smooth_eps=0.0):
+        super().__init__()
+        self.num_classes = num_classes
+        self.in_channels = in_channels
+        self.loss_cls = torch.nn.CrossEntropyLoss()
+        self.multi_class = multi_class
+        self.label_smooth_eps = label_smooth_eps
+
+    @abstractmethod
+    def init_weights(self):
+        """Initiate the parameters either from existing checkpoint or from
+        scratch."""
+
+    @abstractmethod
+    def forward(self, x):
+        """Defines the computation performed at every call."""
+
+    def loss(self, cls_score, labels, **kwargs):
+        """Calculate the loss given output ``cls_score``, target ``labels``.
+
+        Args:
+            cls_score (torch.Tensor): The output of the model.
+            labels (torch.Tensor): The target output of the model.
+
+        Returns:
+            dict: A dict containing field 'loss_cls'(mandatory)
+            and 'top1_acc', 'top5_acc'(optional).
+        """
+        losses = dict()
+        if labels.shape == torch.Size([]):
+            labels = labels.unsqueeze(0)
+        elif labels.dim() == 1 and labels.size()[0] == self.num_classes \
+                and cls_score.size()[0] == 1:
+            # Fix a bug when training with soft labels and batch size is 1.
+            # When using soft labels, `labels` and `cls_socre` share the same
+            # shape.
+            labels = labels.unsqueeze(0)
+
+        if not self.multi_class and cls_score.size() != labels.size():
+            top_k_acc = top_k_accuracy(cls_score.detach().cpu().numpy(),
+                                       labels.detach().cpu().numpy(), (1, 5))
+            losses['top1_acc'] = torch.tensor(
+                top_k_acc[0], device=cls_score.device)
+            losses['top5_acc'] = torch.tensor(
+                top_k_acc[1], device=cls_score.device)
+
+        elif self.multi_class and self.label_smooth_eps != 0:
+            labels = ((1 - self.label_smooth_eps) * labels
+                      + self.label_smooth_eps / self.num_classes)
+
+        loss_cls = self.loss_cls(cls_score, labels, **kwargs)
+        # loss_cls may be dictionary or single tensor
+        if isinstance(loss_cls, dict):
+            losses.update(loss_cls)
+        else:
+            losses['loss_cls'] = loss_cls
+
+        return losses
+
+
+class I3DHead(BaseHead):
+    """Classification head for I3D from mmaction.
+
+    Args:
+        num_classes (int): Number of classes to be classified.
+        in_channels (int): Number of channels in input feature.
+        loss_cls (dict): Config for building loss.
+            Default: dict(type='CrossEntropyLoss')
+        spatial_type (str): Pooling type in spatial dimension. Default: 'avg'.
+        dropout_ratio (float): Probability of dropout layer. Default: 0.5.
+        init_std (float): Std value for Initiation. Default: 0.01.
+        kwargs (dict, optional): Any keyword argument to be used to initialize
+            the head.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 loss_cls=dict(type='CrossEntropyLoss'),
+                 spatial_type='avg',
+                 dropout_ratio=0.5,
+                 init_std=0.01,
+                 **kwargs):
+        super().__init__(num_classes, in_channels, loss_cls, **kwargs)
+
+        self.spatial_type = spatial_type
+        self.dropout_ratio = dropout_ratio
+        self.init_std = init_std
+        if self.dropout_ratio != 0:
+            self.dropout = nn.Dropout(p=self.dropout_ratio)
+        else:
+            self.dropout = None
+        self.fc_cls = nn.Linear(self.in_channels, self.num_classes)
+
+        if self.spatial_type == 'avg':
+            # use `nn.AdaptiveAvgPool3d` to adaptively match the in_channels.
+            self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
+        else:
+            self.avg_pool = None
+
+    def init_weights(self):
+        """Initiate the parameters from scratch."""
+        normal_init(self.fc_cls, std=self.init_std)
+
+    def forward(self, x):
+        """Defines the computation performed at every call.
+
+        Args:
+            x (torch.Tensor): The input data.
+
+        Returns:
+            torch.Tensor: The classification scores for input samples.
+        """
+        # [N, in_channels, 4, 7, 7]
+        if self.avg_pool is not None:
+            x = self.avg_pool(x)
+        # [N, in_channels, 1, 1, 1]
+        if self.dropout is not None:
+            x = self.dropout(x)
+        # [N, in_channels, 1, 1, 1]
+        x = x.view(x.shape[0], -1)
+        # [N, in_channels]
+        cls_score = self.fc_cls(x)
+        # [N, num_classes]
+        return cls_score
+
+
+class PatchShiftTransformer(TorchModel):
+    """  This is PyTorch impl of PST:
+    Spatiotemporal Self-attention Modeling with Temporal Patch Shift for Action Recognition, ECCV22.
+    """
+
+    def __init__(self,
+                 model_dir=None,
+                 num_classes=400,
+                 depths=[2, 2, 6, 2],
+                 num_heads=[3, 6, 12, 24],
+                 embed_dim=96,
+                 in_channels=768,
+                 pretrained=None):
+        super().__init__(model_dir)
+        self.backbone = SwinTransformer2D_TPS(
+            pretrained=pretrained,
+            pretrained2d=True,
+            patch_size=(2, 4, 4),
+            in_chans=3,
+            embed_dim=embed_dim,
+            depths=depths,
+            num_heads=num_heads,
+            window_size=(1, 7, 7),
+            mlp_ratio=4.,
+            qkv_bias=True,
+            qk_scale=None,
+            drop_rate=0.,
+            attn_drop_rate=0.,
+            drop_path_rate=0.2,
+            norm_layer=nn.LayerNorm,
+            patch_norm=True,
+            frozen_stages=-1,
+            use_checkpoint=False)
+        self.cls_head = I3DHead(
+            num_classes=num_classes, in_channels=in_channels)
+
+    def forward(self, x):
+        feature = self.backbone(x)
+        output = self.cls_head(feature)
+        return output
diff --git a/modelscope/models/cv/animal_recognition/resnet.py b/modelscope/models/cv/animal_recognition/resnet.py
index 73953de4..d7c03c29 100644
--- a/modelscope/models/cv/animal_recognition/resnet.py
+++ b/modelscope/models/cv/animal_recognition/resnet.py
@@ -1,3 +1,6 @@
+# The implementation is adopted from Split-Attention Network, A New ResNet Variant,
+# made publicly available under the Apache License 2.0
+# at https://github.com/zhanghang1989/ResNeSt/blob/master/resnest/torch/models/resnet.py
 import math
 
 import torch
diff --git a/modelscope/models/cv/animal_recognition/splat.py b/modelscope/models/cv/animal_recognition/splat.py
index 0aab555e..a10d0abe 100644
--- a/modelscope/models/cv/animal_recognition/splat.py
+++ b/modelscope/models/cv/animal_recognition/splat.py
@@ -1,3 +1,6 @@
+# The implementation is adopted from Split-Attention Network, A New ResNet Variant,
+# made publicly available under the Apache License 2.0
+# at https://github.com/zhanghang1989/ResNeSt/blob/master/resnest/torch/models/splat.py
 """Split-Attention"""
 
 import torch
diff --git a/modelscope/models/cv/body_2d_keypoints/hrnet_v2.py b/modelscope/models/cv/body_2d_keypoints/hrnet_v2.py
index 1570c8cc..ebd69adb 100644
--- a/modelscope/models/cv/body_2d_keypoints/hrnet_v2.py
+++ b/modelscope/models/cv/body_2d_keypoints/hrnet_v2.py
@@ -1,3 +1,5 @@
+# The implementation is based on HRNET, available at https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation.
+
 import os
 
 import numpy as np
diff --git a/modelscope/models/cv/body_2d_keypoints/w48.py b/modelscope/models/cv/body_2d_keypoints/w48.py
index 7140f8fe..e0317991 100644
--- a/modelscope/models/cv/body_2d_keypoints/w48.py
+++ b/modelscope/models/cv/body_2d_keypoints/w48.py
@@ -1,3 +1,5 @@
+# The implementation is based on HRNET, available at https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation.
+
 cfg_128x128_15 = {
     'DATASET': {
         'TYPE': 'DAMO',
diff --git a/modelscope/models/cv/body_3d_keypoints/__init__.py b/modelscope/models/cv/body_3d_keypoints/__init__.py
new file mode 100644
index 00000000..4bb83936
--- /dev/null
+++ b/modelscope/models/cv/body_3d_keypoints/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+
+    from .body_3d_pose import BodyKeypointsDetection3D
+
+else:
+    _import_structure = {
+        'body_3d_pose': ['BodyKeypointsDetection3D'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py b/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py
new file mode 100644
index 00000000..3e920d12
--- /dev/null
+++ b/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py
@@ -0,0 +1,248 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import logging
+import os.path as osp
+from typing import Any, Dict, List, Union
+
+import numpy as np
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.body_3d_keypoints.canonical_pose_modules import (
+    TemporalModel, TransCan3Dkeys)
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['BodyKeypointsDetection3D']
+
+
+class KeypointsTypes(object):
+    POSES_CAMERA = 'poses_camera'
+    POSES_TRAJ = 'poses_traj'
+
+
+@MODELS.register_module(
+    Tasks.body_3d_keypoints, module_name=Models.body_3d_keypoints)
+class BodyKeypointsDetection3D(TorchModel):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+
+        super().__init__(model_dir, *args, **kwargs)
+
+        self.model_dir = model_dir
+        model_path = osp.join(self.model_dir, ModelFile.TORCH_MODEL_FILE)
+        cfg_path = osp.join(self.model_dir, ModelFile.CONFIGURATION)
+        self.cfg = Config.from_file(cfg_path)
+        self._create_model()
+
+        if not osp.exists(model_path):
+            raise IOError(f'{model_path} is not exists.')
+
+        if torch.cuda.is_available():
+            self._device = torch.device('cuda')
+        else:
+            self._device = torch.device('cpu')
+        self.pretrained_state_dict = torch.load(
+            model_path, map_location=self._device)
+
+        self.load_pretrained()
+        self.to_device(self._device)
+        self.eval()
+
+    def _create_model(self):
+        self.model_pos = TemporalModel(
+            self.cfg.model.MODEL.IN_NUM_JOINTS,
+            self.cfg.model.MODEL.IN_2D_FEATURE,
+            self.cfg.model.MODEL.OUT_NUM_JOINTS,
+            filter_widths=self.cfg.model.MODEL.FILTER_WIDTHS,
+            causal=self.cfg.model.MODEL.CAUSAL,
+            dropout=self.cfg.model.MODEL.DROPOUT,
+            channels=self.cfg.model.MODEL.CHANNELS,
+            dense=self.cfg.model.MODEL.DENSE)
+
+        receptive_field = self.model_pos.receptive_field()
+        self.pad = (receptive_field - 1) // 2
+        if self.cfg.model.MODEL.CAUSAL:
+            self.causal_shift = self.pad
+        else:
+            self.causal_shift = 0
+
+        self.model_traj = TransCan3Dkeys(
+            in_channels=self.cfg.model.MODEL.IN_NUM_JOINTS
+            * self.cfg.model.MODEL.IN_2D_FEATURE,
+            num_features=1024,
+            out_channels=self.cfg.model.MODEL.OUT_3D_FEATURE,
+            num_blocks=4,
+            time_window=receptive_field)
+
+    def eval(self):  # returns self so ``model = model.eval()`` keeps working
+        return self.train(False)
+
+    def train(self, mode=True):  # same signature as torch.nn.Module.train
+        self.model_pos.train(mode)
+        self.model_traj.train(mode)
+        return self
+
+    def to_device(self, device):
+        self.model_pos = self.model_pos.to(device)
+        self.model_traj = self.model_traj.to(device)
+
+    def load_pretrained(self):
+        if 'model_pos' in self.pretrained_state_dict:
+            self.model_pos.load_state_dict(
+                self.pretrained_state_dict['model_pos'], strict=False)
+        else:
+            logging.error(
+                'Not load model pos from pretrained_state_dict, not in pretrained_state_dict'
+            )
+
+        if 'model_traj' in self.pretrained_state_dict:
+            self.model_traj.load_state_dict(
+                self.pretrained_state_dict['model_traj'], strict=False)
+        else:
+            logging.error(
+                'Not load model traj from pretrained_state_dict, not in pretrained_state_dict'
+            )
+        logging.info('Load pretrained model done.')
+
+    def preprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Preprocess 2D input joints.
+
+        Args:
+            input: [NUM_FRAME, NUM_JOINTS, 2] 2d body keypoints; used as a tensor here.
+
+        Returns:
+            Dict[str, Any]: 'inputs_2d', 'pose2d_rr' and 'pose2d_canonical' tensors.
+        """
+        if 'cuda' == input.device.type:
+            input = input.data.cpu().numpy()
+        elif 'cpu' == input.device.type:
+            input = input.data.numpy()
+        pose2d = input
+
+        pose2d_canonical = self.canonicalize_2Ds(
+            pose2d, self.cfg.model.INPUT.FOCAL_LENGTH,
+            self.cfg.model.INPUT.CENTER)
+        pose2d_normalized = self.normalize_screen_coordinates(
+            pose2d, self.cfg.model.INPUT.RES_W, self.cfg.model.INPUT.RES_H)
+        pose2d_rr = pose2d_normalized
+        pose2d_rr[:, 1:] -= pose2d_rr[:, :1]
+
+        # expand [NUM_FRAME, NUM_JOINTS, 2] to [1, NUM_FRAME, NUM_JOINTS, 2]
+        pose2d_rr = np.expand_dims(
+            np.pad(
+                pose2d_rr,
+                ((self.pad + self.causal_shift, self.pad - self.causal_shift),
+                 (0, 0), (0, 0)), 'edge'),
+            axis=0)
+        pose2d_canonical = np.expand_dims(
+            np.pad(
+                pose2d_canonical,
+                ((self.pad + self.causal_shift, self.pad - self.causal_shift),
+                 (0, 0), (0, 0)), 'edge'),
+            axis=0)
+        pose2d_rr = torch.from_numpy(pose2d_rr.astype(np.float32))
+        pose2d_canonical = torch.from_numpy(
+            pose2d_canonical.astype(np.float32))
+
+        inputs_2d = pose2d_rr.clone()
+        if torch.cuda.is_available():
+            inputs_2d = inputs_2d.cuda(non_blocking=True)
+
+        # Positional model
+        if self.cfg.model.MODEL.USE_2D_OFFSETS:
+            inputs_2d[:, :, 0] = 0
+        else:
+            inputs_2d[:, :, 1:] += inputs_2d[:, :, :1]
+
+        return {
+            'inputs_2d': inputs_2d,
+            'pose2d_rr': pose2d_rr,
+            'pose2d_canonical': pose2d_canonical
+        }
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """3D human pose estimation.
+
+        Args:
+            input (Dict):
+                inputs_2d:  [1, NUM_FRAME, NUM_JOINTS, 2]
+                pose2d_rr:  [1, NUM_FRAME, NUM_JOINTS, 2]
+                pose2d_canonical: [1, NUM_FRAME, NUM_JOINTS, 2]
+                NUM_FRAME = max(receptive_field + video_frame_number, video_frame_number)
+
+        Returns:
+            Dict[str, Any]:
+                "poses_camera": Tensor, [1, NUM_FRAME, OUT_NUM_JOINTS, OUT_3D_FEATURE_DIM],
+                    3D human pose keypoints in camera frame.
+                "poses_traj": Tensor, [1, NUM_FRAME, 1, 3],
+                    root keypoints coordinates in camera frame.
+        """
+        inputs_2d = input['inputs_2d']
+        pose2d_rr = input['pose2d_rr']
+        pose2d_canonical = input['pose2d_canonical']
+        with torch.no_grad():
+            # predict 3D pose keypoints
+            predicted_3d_pos = self.model_pos(inputs_2d)
+
+            # predict global trajectory
+            b1, w1, n1, d1 = inputs_2d.shape
+
+            input_pose2d_abs = self.get_abs_2d_pts(w1, pose2d_rr,
+                                                   pose2d_canonical)
+            b1, w1, n1, d1 = input_pose2d_abs.size()
+            b2, w2, n2, d2 = predicted_3d_pos.size()
+
+            if torch.cuda.is_available():
+                input_pose2d_abs = input_pose2d_abs.cuda(non_blocking=True)
+
+            predicted_3d_traj = self.model_traj(
+                input_pose2d_abs.view(b1, w1, n1 * d1),
+                predicted_3d_pos.view(b2 * w2, n2 * d2)).view(b2, w2, -1, 3)
+
+            predict_dict = {
+                KeypointsTypes.POSES_CAMERA: predicted_3d_pos,
+                KeypointsTypes.POSES_TRAJ: predicted_3d_traj
+            }
+
+        return predict_dict
+
+    def get_abs_2d_pts(self, input_video_frame_num, pose2d_rr,
+                       pose2d_canonical):
+        """Stack sliding windows of ``2 * self.pad + 1`` frames along dim 0.
+
+        One window is cut per centre frame in ``[pad, frame_num - pad)``;
+        canonical or root-relative input is chosen by USE_CANONICAL_COORDS.
+        """
+        pad = self.pad
+        w = input_video_frame_num - pad * 2
+        spans = [slice(i - pad, i + pad + 1) for i in range(pad, w + pad)]
+
+        # Bug fix: the root-relative stack was built from the canonical list.
+        input_pose2d_rr = torch.cat([pose2d_rr[:, s] for s in spans], dim=0)
+        input_pose2d_cannoical = torch.cat(
+            [pose2d_canonical[:, s] for s in spans], dim=0)
+
+        if self.cfg.model.MODEL.USE_CANONICAL_COORDS:
+            input_pose2d_abs = input_pose2d_cannoical.clone()
+        else:
+            input_pose2d_abs = input_pose2d_rr.clone()
+            input_pose2d_abs[:, :, 1:] += input_pose2d_abs[:, :, :1]
+        return input_pose2d_abs
+
+    def canonicalize_2Ds(self, pos2d, f, c):
+        cs = np.array([c[0], c[1]]).reshape(1, 1, 2)
+        fs = np.array([f[0], f[1]]).reshape(1, 1, 2)
+        canoical_2Ds = (pos2d - cs) / fs
+        return canoical_2Ds
+
+    def normalize_screen_coordinates(self, X, w, h):
+        assert X.shape[-1] == 2
+
+        # Normalize so that [0, w] is mapped to [-1, 1], while preserving the aspect ratio
+        return X / w * 2 - [1, h / w]
diff --git a/modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py b/modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py
new file mode 100644
index 00000000..b7f0c4a3
--- /dev/null
+++ b/modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py
@@ -0,0 +1,233 @@
+# The implementation is based on VideoPose3D, available at https://github.com/facebookresearch/VideoPose3D
+import torch
+import torch.nn as nn
+
+
+class TemporalModelBase(nn.Module):
+    """
+    Do not instantiate this class.
+    """
+
+    def __init__(self, num_joints_in, in_features, num_joints_out,
+                 filter_widths, causal, dropout, channels):
+        super().__init__()
+
+        # Validate input
+        for fw in filter_widths:
+            assert fw % 2 != 0, 'Only odd filter widths are supported'
+
+        self.num_joints_in = num_joints_in
+        self.in_features = in_features
+        self.num_joints_out = num_joints_out
+        self.filter_widths = filter_widths
+
+        self.drop = nn.Dropout(dropout)
+        self.relu = nn.ReLU(inplace=True)
+
+        self.pad = [filter_widths[0] // 2]
+        self.expand_bn = nn.BatchNorm1d(channels, momentum=0.1)
+        self.shrink = nn.Conv1d(channels, num_joints_out * 3, 1)
+
+    def set_bn_momentum(self, momentum):
+        self.expand_bn.momentum = momentum
+        for bn in self.layers_bn:
+            bn.momentum = momentum
+
+    def receptive_field(self):
+        """
+        Return the total receptive field of this model as # of frames.
+        """
+        frames = 0
+        for f in self.pad:
+            frames += f
+        return 1 + 2 * frames
+
+    def total_causal_shift(self):
+        """
+        Return the asymmetric offset for sequence padding.
+        The returned value is typically 0 if causal convolutions are disabled,
+        otherwise it is half the receptive field.
+        """
+        frames = self.causal_shift[0]
+        next_dilation = self.filter_widths[0]
+        for i in range(1, len(self.filter_widths)):
+            frames += self.causal_shift[i] * next_dilation
+            next_dilation *= self.filter_widths[i]
+        return frames
+
+    def forward(self, x):
+        assert len(x.shape) == 4
+        assert x.shape[-2] == self.num_joints_in
+        assert x.shape[-1] == self.in_features
+
+        sz = x.shape[:3]
+        x = x.view(x.shape[0], x.shape[1], -1)
+        x = x.permute(0, 2, 1)
+
+        x = self._forward_blocks(x)
+
+        x = x.permute(0, 2, 1)
+        x = x.view(sz[0], -1, self.num_joints_out, 3)
+
+        return x
+
+
+class TemporalModel(TemporalModelBase):
+    """
+    Reference 3D pose estimation model with temporal convolutions.
+    This implementation can be used for all use-cases.
+    """
+
+    def __init__(self,
+                 num_joints_in,
+                 in_features,
+                 num_joints_out,
+                 filter_widths,
+                 causal=False,
+                 dropout=0.25,
+                 channels=1024,
+                 dense=False):
+        """
+        Initialize this model.
+
+        Arguments:
+        num_joints_in -- number of input joints (e.g. 17 for Human3.6M)
+        in_features -- number of input features for each joint (typically 2 for 2D input)
+        num_joints_out -- number of output joints (can be different than input)
+        filter_widths -- list of convolution widths, which also determines the # of blocks and receptive field
+        causal -- use causal convolutions instead of symmetric convolutions (for real-time applications)
+        dropout -- dropout probability
+        channels -- number of convolution channels
+        dense -- use regular dense convolutions instead of dilated convolutions (ablation experiment)
+        """
+        super().__init__(num_joints_in, in_features, num_joints_out,
+                         filter_widths, causal, dropout, channels)
+
+        self.expand_conv = nn.Conv1d(
+            num_joints_in * in_features,
+            channels,
+            filter_widths[0],
+            bias=False)
+
+        layers_conv = []
+        layers_bn = []
+
+        self.causal_shift = [(filter_widths[0]) // 2 if causal else 0]
+        next_dilation = filter_widths[0]
+        for i in range(1, len(filter_widths)):
+            self.pad.append((filter_widths[i] - 1) * next_dilation // 2)
+            self.causal_shift.append((filter_widths[i] // 2
+                                      * next_dilation) if causal else 0)
+
+            layers_conv.append(
+                nn.Conv1d(
+                    channels,
+                    channels,
+                    filter_widths[i] if not dense else (2 * self.pad[-1] + 1),
+                    dilation=next_dilation if not dense else 1,
+                    bias=False))
+            layers_bn.append(nn.BatchNorm1d(channels, momentum=0.1))
+            layers_conv.append(
+                nn.Conv1d(channels, channels, 1, dilation=1, bias=False))
+            layers_bn.append(nn.BatchNorm1d(channels, momentum=0.1))
+
+            next_dilation *= filter_widths[i]
+
+        self.layers_conv = nn.ModuleList(layers_conv)
+        self.layers_bn = nn.ModuleList(layers_bn)
+
+    def _forward_blocks(self, x):
+        x = self.drop(self.relu(self.expand_bn(self.expand_conv(x))))
+        for i in range(len(self.pad) - 1):
+            pad = self.pad[i + 1]
+            shift = self.causal_shift[i + 1]
+            res = x[:, :, pad + shift:x.shape[2] - pad + shift]
+            x = self.drop(
+                self.relu(self.layers_bn[2 * i](self.layers_conv[2 * i](x))))
+            x = res + self.drop(
+                self.relu(self.layers_bn[2 * i + 1](
+                    self.layers_conv[2 * i + 1](x))))
+
+        x = self.shrink(x)
+        return x
+
+
+# regression of the trajectory
+class TransCan3Dkeys(nn.Module):
+
+    def __init__(self,
+                 in_channels=74,
+                 num_features=256,
+                 out_channels=44,
+                 time_window=10,
+                 num_blocks=2):
+        super().__init__()
+        self.in_channels = in_channels
+        self.num_features = num_features
+        self.out_channels = out_channels
+        self.num_blocks = num_blocks
+        self.time_window = time_window
+
+        self.expand_bn = nn.BatchNorm1d(self.num_features, momentum=0.1)
+        self.conv1 = nn.Sequential(
+            nn.ReplicationPad1d(1),
+            nn.Conv1d(
+                self.in_channels, self.num_features, kernel_size=3,
+                bias=False), self.expand_bn, nn.ReLU(inplace=True),
+            nn.Dropout(p=0.25))
+        self._make_blocks()
+        self.pad = nn.ReplicationPad1d(4)
+        self.relu = nn.ReLU(inplace=True)
+        self.drop = nn.Dropout(p=0.25)
+        self.reduce = nn.Conv1d(
+            self.num_features, self.num_features, kernel_size=self.time_window)
+        self.embedding_3d_1 = nn.Linear(in_channels // 2 * 3, 500)
+        self.embedding_3d_2 = nn.Linear(500, 500)
+        self.LReLU1 = nn.LeakyReLU()
+        self.LReLU2 = nn.LeakyReLU()
+        self.LReLU3 = nn.LeakyReLU()
+        self.out1 = nn.Linear(self.num_features + 500, self.num_features)
+        self.out2 = nn.Linear(self.num_features, self.out_channels)
+
+    def _make_blocks(self):
+        layers_conv = []
+        layers_bn = []
+        for i in range(self.num_blocks):
+            layers_conv.append(
+                nn.Conv1d(
+                    self.num_features,
+                    self.num_features,
+                    kernel_size=5,
+                    bias=False,
+                    dilation=2))
+            layers_bn.append(nn.BatchNorm1d(self.num_features))
+        self.layers_conv = nn.ModuleList(layers_conv)
+        self.layers_bn = nn.ModuleList(layers_bn)
+
+    def set_bn_momentum(self, momentum):
+        self.expand_bn.momentum = momentum
+        for bn in self.layers_bn:
+            bn.momentum = momentum
+
+    def forward(self, p2ds, p3d):
+        """
+        Args: p2ds - (B x T x C), unpacked below as ``B, T, C``;
+        p3d - fed to ``embedding_3d_1`` and concatenated with the 2D features.
+        """
+        B, T, C = p2ds.shape
+        x = p2ds.permute((0, 2, 1))
+        x = self.conv1(x)
+        for i in range(self.num_blocks):
+            pre = x
+            x = self.pad(x)
+            x = self.layers_conv[i](x)
+            x = self.layers_bn[i](x)
+            x = self.drop(self.relu(x))
+            x = pre + x
+        x_2d = self.relu(self.reduce(x))
+        x_2d = x_2d.view(B, -1)
+        x_3d = self.LReLU1(self.embedding_3d_1(p3d))
+        x = torch.cat((x_2d, x_3d), 1)
+        x = self.LReLU3(self.out1(x))
+        x = self.out2(x)
+        return x
diff --git a/modelscope/models/cv/cartoon/facelib/LK/lk.py b/modelscope/models/cv/cartoon/facelib/LK/lk.py
index df05e3f9..6fd95ad6 100644
--- a/modelscope/models/cv/cartoon/facelib/LK/lk.py
+++ b/modelscope/models/cv/cartoon/facelib/LK/lk.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine
+
 import numpy as np
 
 from modelscope.models.cv.cartoon.facelib.config import config as cfg
diff --git a/modelscope/models/cv/cartoon/facelib/config.py b/modelscope/models/cv/cartoon/facelib/config.py
index d795fdde..92b39db0 100644
--- a/modelscope/models/cv/cartoon/facelib/config.py
+++ b/modelscope/models/cv/cartoon/facelib/config.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine
+
 import os
 
 import numpy as np
diff --git a/modelscope/models/cv/cartoon/facelib/face_detector.py b/modelscope/models/cv/cartoon/facelib/face_detector.py
index e5589719..fa36d662 100644
--- a/modelscope/models/cv/cartoon/facelib/face_detector.py
+++ b/modelscope/models/cv/cartoon/facelib/face_detector.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine
+
 import time
 
 import cv2
diff --git a/modelscope/models/cv/cartoon/facelib/face_landmark.py b/modelscope/models/cv/cartoon/facelib/face_landmark.py
index 063d40c3..3b7cc1b9 100644
--- a/modelscope/models/cv/cartoon/facelib/face_landmark.py
+++ b/modelscope/models/cv/cartoon/facelib/face_landmark.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine
+
 import cv2
 import numpy as np
 import tensorflow as tf
diff --git a/modelscope/models/cv/cartoon/facelib/facer.py b/modelscope/models/cv/cartoon/facelib/facer.py
index 62388ab9..c6f34e9c 100644
--- a/modelscope/models/cv/cartoon/facelib/facer.py
+++ b/modelscope/models/cv/cartoon/facelib/facer.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine
+
 import time
 
 import cv2
diff --git a/modelscope/models/cv/cartoon/mtcnn_pytorch/src/align_trans.py b/modelscope/models/cv/cartoon/mtcnn_pytorch/src/align_trans.py
index baa3ba73..eb542042 100644
--- a/modelscope/models/cv/cartoon/mtcnn_pytorch/src/align_trans.py
+++ b/modelscope/models/cv/cartoon/mtcnn_pytorch/src/align_trans.py
@@ -1,7 +1,5 @@
-"""
-Created on Mon Apr 24 15:43:29 2017
-@author: zhaoy
-"""
+# The implementation is adopted from https://github.com/TreB1eN/InsightFace_Pytorch/tree/master/mtcnn_pytorch
+
 import cv2
 import numpy as np
 
diff --git a/modelscope/models/cv/cartoon/mtcnn_pytorch/src/matlab_cp2tform.py b/modelscope/models/cv/cartoon/mtcnn_pytorch/src/matlab_cp2tform.py
index 96a5f965..ea9fbacf 100644
--- a/modelscope/models/cv/cartoon/mtcnn_pytorch/src/matlab_cp2tform.py
+++ b/modelscope/models/cv/cartoon/mtcnn_pytorch/src/matlab_cp2tform.py
@@ -1,8 +1,4 @@
-"""
-Created on Tue Jul 11 06:54:28 2017
-
-@author: zhaoyafei
-"""
+# The implementation is adopted from https://github.com/TreB1eN/InsightFace_Pytorch/tree/master/mtcnn_pytorch
 
 import numpy as np
 from numpy.linalg import inv, lstsq
diff --git a/modelscope/models/cv/cartoon/utils.py b/modelscope/models/cv/cartoon/utils.py
index 39712653..59b4e879 100644
--- a/modelscope/models/cv/cartoon/utils.py
+++ b/modelscope/models/cv/cartoon/utils.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 
 import cv2
diff --git a/modelscope/models/cv/cmdssl_video_embedding/__init__.py b/modelscope/models/cv/cmdssl_video_embedding/__init__.py
index e7e156a5..5bc67b63 100644
--- a/modelscope/models/cv/cmdssl_video_embedding/__init__.py
+++ b/modelscope/models/cv/cmdssl_video_embedding/__init__.py
@@ -1,4 +1,5 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
 from typing import TYPE_CHECKING
 
 from modelscope.utils.import_utils import LazyImportModule
diff --git a/modelscope/models/cv/cmdssl_video_embedding/c3d.py b/modelscope/models/cv/cmdssl_video_embedding/c3d.py
index 62f0e0b9..53dd05a1 100644
--- a/modelscope/models/cv/cmdssl_video_embedding/c3d.py
+++ b/modelscope/models/cv/cmdssl_video_embedding/c3d.py
@@ -1,3 +1,11 @@
+# Copyright 2022 Davide Abati.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+# The implementation here is modified based on c3d-pytorch,
+# originally MIT License, Copyright (c) 2022 Davide Abati,
+# and publicly available at https://github.com/DavideA/c3d-pytorch
+""" C3D Model Architecture."""
+
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/models/cv/cmdssl_video_embedding/resnet2p1d.py b/modelscope/models/cv/cmdssl_video_embedding/resnet2p1d.py
index 3b03cc74..b49069d1 100644
--- a/modelscope/models/cv/cmdssl_video_embedding/resnet2p1d.py
+++ b/modelscope/models/cv/cmdssl_video_embedding/resnet2p1d.py
@@ -1,3 +1,11 @@
+# Copyright (c) 2022 Kensho Hara.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+# The implementation here is modified based on 3D-ResNets-PyTorch,
+# originally MIT License, Copyright (c) 2022 Kensho Hara,
+# and publicly available at https://github.com/kenshohara/3D-ResNets-PyTorch/blob/master/models/resnet2p1d.py
+""" ResNet2plus1d Model Architecture."""
+
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/models/cv/cmdssl_video_embedding/resnet3d.py b/modelscope/models/cv/cmdssl_video_embedding/resnet3d.py
index 24d50a8e..dddba06f 100644
--- a/modelscope/models/cv/cmdssl_video_embedding/resnet3d.py
+++ b/modelscope/models/cv/cmdssl_video_embedding/resnet3d.py
@@ -1,3 +1,11 @@
+# Copyright (c) 2022 Kensho Hara.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+# The implementation here is modified based on 3D-ResNets-PyTorch,
+# originally MIT License, Copyright (c) 2022 Kensho Hara,
+# and publicly available at https://github.com/kenshohara/3D-ResNets-PyTorch/blob/master/models/resnet.py
+""" ResNet3D Model Architecture."""
+
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/models/cv/easycv_base.py b/modelscope/models/cv/easycv_base.py
new file mode 100644
index 00000000..7bc35e84
--- /dev/null
+++ b/modelscope/models/cv/easycv_base.py
@@ -0,0 +1,25 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.models.base import BaseModel
+from easycv.utils.ms_utils import EasyCVMeta
+
+from modelscope.models.base import TorchModel
+
+
+class EasyCVBaseModel(BaseModel, TorchModel):
+    """Base class combining EasyCV's BaseModel with modelscope's TorchModel."""
+
+    def __init__(self, model_dir=None, args=(), kwargs={}):
+        kwargs.pop(EasyCVMeta.ARCH, None)  # drop the arch key in place
+        BaseModel.__init__(self)
+        TorchModel.__init__(self, model_dir=model_dir)
+
+    def forward(self, img, mode='train', **kwargs):  # mode is unused
+        if self.training:
+            losses = self.forward_train(img, **kwargs)
+            loss, log_vars = self._parse_losses(losses)
+            return dict(loss=loss, log_vars=log_vars)
+        else:
+            return self.forward_test(img, **kwargs)
+
+    def __call__(self, *args, **kwargs):  # delegates straight to forward
+        return self.forward(*args, **kwargs)
diff --git a/modelscope/models/cv/face_2d_keypoints/__init__.py b/modelscope/models/cv/face_2d_keypoints/__init__.py
new file mode 100644
index 00000000..636ba0f4
--- /dev/null
+++ b/modelscope/models/cv/face_2d_keypoints/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:  # real import only for static type checkers
+    from .face_2d_keypoints_align import Face2DKeypoints
+
+else:  # at runtime, swap this module for a LazyImportModule
+    _import_structure = {'face_2d_keypoints_align': ['Face2DKeypoints']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py b/modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py
new file mode 100644
index 00000000..468662a0
--- /dev/null
+++ b/modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py
@@ -0,0 +1,16 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.models.face.face_keypoint import FaceKeypoint
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.easycv_base import EasyCVBaseModel
+from modelscope.utils.constant import Tasks
+
+
+@MODELS.register_module(
+    group_key=Tasks.face_2d_keypoints, module_name=Models.face_2d_keypoints)
+class Face2DKeypoints(EasyCVBaseModel, FaceKeypoint):
+
+    def __init__(self, model_dir=None, *args, **kwargs):  # init both bases
+        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)  # not unpacked
+        FaceKeypoint.__init__(self, *args, **kwargs)
diff --git a/modelscope/models/cv/face_detection/__init__.py b/modelscope/models/cv/face_detection/__init__.py
index e69de29b..a2a845d2 100644
--- a/modelscope/models/cv/face_detection/__init__.py
+++ b/modelscope/models/cv/face_detection/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:  # real imports only for static type checkers
+    from .mogface import MogFaceDetector
+    from .mtcnn import MtcnnFaceDetector
+    from .retinaface import RetinaFaceDetection
+    from .ulfd_slim import UlfdFaceDetector
+else:  # at runtime, swap this module for a LazyImportModule
+    _import_structure = {  # submodule name -> names it exports
+        'ulfd_slim': ['UlfdFaceDetector'],
+        'retinaface': ['RetinaFaceDetection'],
+        'mtcnn': ['MtcnnFaceDetector'],
+        'mogface': ['MogFaceDetector']
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/__init__.py
index 921bdc08..5a895582 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/__init__.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/__init__.py
@@ -1,5 +1,4 @@
 """
-mmdet_patch is based on
-https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet,
-all duplicate functions from official mmdetection are removed.
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet
 """
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/__init__.py
index 8375649c..cf1b7313 100644
--- a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/__init__.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/__init__.py
@@ -1,3 +1,7 @@
+"""
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/bbox
+"""
 from .transforms import bbox2result, distance2kps, kps2distance
 
 __all__ = ['bbox2result', 'distance2kps', 'kps2distance']
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py b/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py
index 26278837..d65480eb 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py
@@ -1,5 +1,6 @@
 """
-based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/bbox/transforms.py
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/bbox/transforms.py
 """
 import numpy as np
 import torch
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/__init__.py
index 8cd31348..61602fd3 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/__init__.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/__init__.py
@@ -1,3 +1,7 @@
+"""
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/post_processing/bbox_nms.py
+"""
 from .bbox_nms import multiclass_nms
 
 __all__ = ['multiclass_nms']
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py b/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py
index efe8813f..7a4f5b3a 100644
--- a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py
@@ -1,5 +1,6 @@
 """
-based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/post_processing/bbox_nms.py
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/post_processing/bbox_nms.py
 """
 import torch
 
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/datasets/__init__.py
index 07a45208..cea179b0 100644
--- a/modelscope/models/cv/face_detection/mmdet_patch/datasets/__init__.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/datasets/__init__.py
@@ -1,3 +1,7 @@
+"""
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets
+"""
 from .retinaface import RetinaFaceDataset
 
 __all__ = ['RetinaFaceDataset']
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py
index 979212a3..85288910 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py
@@ -1,3 +1,7 @@
+"""
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines
+"""
 from .transforms import RandomSquareCrop
 
 __all__ = ['RandomSquareCrop']
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py b/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py
index 3048cefa..241f2c0e 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py
@@ -1,5 +1,6 @@
 """
-based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py
 """
 import numpy as np
 from mmdet.datasets.builder import PIPELINES
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py b/modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py
index bf20764b..bbacd9be 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py
@@ -1,5 +1,6 @@
 """
-based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/retinaface.py
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/retinaface.py
 """
 import numpy as np
 from mmdet.datasets.builder import DATASETS
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/models/__init__.py
index 38c8ff5b..bd5d5f5f 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/models/__init__.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/models/__init__.py
@@ -1,2 +1,6 @@
+"""
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models
+"""
 from .dense_heads import *  # noqa: F401,F403
 from .detectors import *  # noqa: F401,F403
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/__init__.py
index 2d930bf4..5c3b190e 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/__init__.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/__init__.py
@@ -1,3 +1,7 @@
+"""
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones
+"""
 from .resnet import ResNetV1e
 
 __all__ = ['ResNetV1e']
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/resnet.py b/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/resnet.py
index 54bcb127..a5862a58 100644
--- a/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/resnet.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/resnet.py
@@ -1,5 +1,6 @@
 """
-based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones/resnet.py
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones/resnet.py
 """
 import torch.nn as nn
 import torch.utils.checkpoint as cp
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/__init__.py
index e67031bc..9ba63b68 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/__init__.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/__init__.py
@@ -1,3 +1,7 @@
+"""
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/dense_heads
+"""
 from .scrfd_head import SCRFDHead
 
 __all__ = ['SCRFDHead']
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py b/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py
index 1667f29f..acc45670 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py
@@ -1,5 +1,6 @@
 """
-based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py
 """
 import numpy as np
 import torch
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/__init__.py b/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/__init__.py
index 1c16028f..7935606a 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/__init__.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/__init__.py
@@ -1,3 +1,7 @@
+"""
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors
+"""
 from .scrfd import SCRFD
 
 __all__ = ['SCRFD']
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py b/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py
index 98b6702c..a5f5cac2 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py
+++ b/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py
@@ -1,5 +1,6 @@
 """
-based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors/scrfd.py
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors/scrfd.py
 """
 import torch
 from mmdet.models.builder import DETECTORS
diff --git a/modelscope/models/cv/face_detection/mogface/__init__.py b/modelscope/models/cv/face_detection/mogface/__init__.py
new file mode 100644
index 00000000..a58268d0
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mogface/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .models.detectors import MogFaceDetector
diff --git a/modelscope/models/cv/face_detection/mogface/models/__init__.py b/modelscope/models/cv/face_detection/mogface/models/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/face_detection/mogface/models/detectors.py b/modelscope/models/cv/face_detection/mogface/models/detectors.py
new file mode 100644
index 00000000..5ae67104
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mogface/models/detectors.py
@@ -0,0 +1,96 @@
+import os
+
+import cv2
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import Tasks
+from .mogface import MogFace
+from .utils import MogPriorBox, mogdecode, py_cpu_nms
+
+
+@MODELS.register_module(Tasks.face_detection, module_name=Models.mogface)
+class MogFaceDetector(TorchModel):
+    """Face detector that wraps a MogFace network and its post-processing."""
+
+    def __init__(self, model_path, device='cuda'):
+        super().__init__(model_path)
+        # NOTE: the next two settings are process-wide, not per-instance.
+        torch.set_grad_enabled(False)
+        cudnn.benchmark = True
+        self.model_path = model_path
+        self.device = device
+        self.net = MogFace()
+        self.load_model()
+        self.net = self.net.to(device)
+
+        self.mean = np.array([[104, 117, 123]])
+
+    def load_model(self, load_to_cpu=False):
+        """Load weights onto CPU and set eval mode (load_to_cpu is unused)."""
+        pretrained_dict = torch.load(
+            self.model_path, map_location=torch.device('cpu'))
+        self.net.load_state_dict(pretrained_dict, strict=False)
+        self.net.eval()
+
+    def forward(self, input):
+        """Return [box..., score] rows for the HxWx3 tensor input['img']."""
+        img_raw = input['img']
+        img = np.array(img_raw.cpu().detach())
+        img = img[:, :, ::-1]
+
+        im_height, im_width = img.shape[:2]
+        ss = 1.0
+        # shrink very large images so the longer side becomes about 1000 px
+        if max(im_height, im_width) > 1500:
+            ss = 1000.0 / max(im_height, im_width)
+            img = cv2.resize(img, (0, 0), fx=ss, fy=ss)
+            im_height, im_width = img.shape[:2]
+
+        img -= np.array([[103.53, 116.28, 123.675]])
+        img /= np.array([[57.375, 57.120003, 58.395]])
+        img /= 255
+        img = img[:, :, ::-1].copy()
+        img = img.transpose(2, 0, 1)
+        img = torch.from_numpy(img).unsqueeze(0).to(self.device)
+
+        conf, loc = self.net(img)  # forward pass
+
+        confidence_threshold = 0.82
+        nms_threshold = 0.4
+        top_k = 5000
+        keep_top_k = 750
+
+        priorbox = MogPriorBox(scale_list=[0.68])
+        priors = priorbox(im_height, im_width)
+        priors = torch.tensor(priors).to(self.device)
+
+        boxes = mogdecode(loc.data.squeeze(0), priors.data).cpu().numpy()
+        scores = conf.squeeze(0).data.cpu().numpy()[:, 0]
+
+        # ignore low scores
+        inds = np.where(scores > confidence_threshold)[0]
+        boxes = boxes[inds]
+        scores = scores[inds]
+
+        # keep top-K before NMS
+        order = scores.argsort()[::-1][:top_k]
+        boxes = boxes[order]
+        scores = scores[order]
+
+        # do NMS
+        dets = np.hstack((boxes, scores[:, np.newaxis])).astype(
+            np.float32, copy=False)
+        keep = py_cpu_nms(dets, nms_threshold)
+        dets = dets[keep, :]
+
+        # keep top-K after NMS
+        dets = dets[:keep_top_k, :]
+
+        # rescale box coordinates only; the last column is the score
+        dets[:, :-1] /= ss
+        return dets
diff --git a/modelscope/models/cv/face_detection/mogface/models/mogface.py b/modelscope/models/cv/face_detection/mogface/models/mogface.py
new file mode 100644
index 00000000..294c2c6b
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mogface/models/mogface.py
@@ -0,0 +1,135 @@
+# --------------------------------------------------------
+# The implementation is also open-sourced by the authors (Yang Liu) and is publicly available at
+# https://github.com/damo-cv/MogFace
+# --------------------------------------------------------
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .mogprednet import MogPredNet
+from .resnet import ResNet
+
+
+class MogFace(nn.Module):  # ResNet(depth=101) -> LFPN -> MogPredNet
+
+    def __init__(self):
+        super(MogFace, self).__init__()
+        self.backbone = ResNet(depth=101)
+        self.fpn = LFPN()
+        self.pred_net = MogPredNet()
+
+    def forward(self, x):
+        feature_list = self.backbone(x)
+        fpn_list = self.fpn(feature_list)
+        pyramid_feature_list = fpn_list[0]  # fpn_list[1] is unused
+        conf, loc = self.pred_net(pyramid_feature_list)
+        return conf, loc
+
+
+class FeatureFusion(nn.Module):  # adds a 2x-upsampled map to a lateral map
+
+    def __init__(self, lat_ch=256, **channels):  # needs keyword main=<in_ch>
+        super(FeatureFusion, self).__init__()
+        self.main_conv = nn.Conv2d(channels['main'], lat_ch, kernel_size=1)
+
+    def forward(self, up, main):
+        main = self.main_conv(main)
+        _, _, H, W = main.size()
+        res = F.interpolate(up, scale_factor=2, mode='bilinear')
+        if res.size(2) != main.size(2) or res.size(3) != main.size(3):
+            res = res[:, :, 0:H, 0:W]  # crop to main's spatial size
+        res = res + main
+        return res
+
+
+class LFPN(nn.Module):  # fuses backbone maps c2-c5; adds c6 and c7
+
+    def __init__(self,
+                 c2_out_ch=256,
+                 c3_out_ch=512,
+                 c4_out_ch=1024,
+                 c5_out_ch=2048,
+                 c6_mid_ch=512,
+                 c6_out_ch=512,
+                 c7_mid_ch=128,
+                 c7_out_ch=256,
+                 out_dsfd_ft=True):
+        super(LFPN, self).__init__()
+        self.out_dsfd_ft = out_dsfd_ft
+        if self.out_dsfd_ft:
+            dsfd_module = []  # in-channels below are hard-coded, not c*_out_ch
+            dsfd_module.append(nn.Conv2d(256, 256, kernel_size=3, padding=1))
+            dsfd_module.append(nn.Conv2d(512, 256, kernel_size=3, padding=1))
+            dsfd_module.append(nn.Conv2d(1024, 256, kernel_size=3, padding=1))
+            dsfd_module.append(nn.Conv2d(2048, 256, kernel_size=3, padding=1))
+            dsfd_module.append(nn.Conv2d(256, 256, kernel_size=3, padding=1))
+            dsfd_module.append(nn.Conv2d(256, 256, kernel_size=3, padding=1))
+            self.dsfd_modules = nn.ModuleList(dsfd_module)
+
+        c6_input_ch = c5_out_ch
+        self.c6 = nn.Sequential(*[
+            nn.Conv2d(
+                c6_input_ch,
+                c6_mid_ch,
+                kernel_size=1,
+            ),
+            nn.BatchNorm2d(c6_mid_ch),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(
+                c6_mid_ch, c6_out_ch, kernel_size=3, padding=1, stride=2),
+            nn.BatchNorm2d(c6_out_ch),
+            nn.ReLU(inplace=True)
+        ])
+        self.c7 = nn.Sequential(*[
+            nn.Conv2d(
+                c6_out_ch,
+                c7_mid_ch,
+                kernel_size=1,
+            ),
+            nn.BatchNorm2d(c7_mid_ch),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(
+                c7_mid_ch, c7_out_ch, kernel_size=3, padding=1, stride=2),
+            nn.BatchNorm2d(c7_out_ch),
+            nn.ReLU(inplace=True)
+        ])
+
+        self.p2_lat = nn.Conv2d(256, 256, kernel_size=3, padding=1)
+        self.p3_lat = nn.Conv2d(256, 256, kernel_size=3, padding=1)
+        self.p4_lat = nn.Conv2d(256, 256, kernel_size=3, padding=1)
+
+        self.c5_lat = nn.Conv2d(c6_input_ch, 256, kernel_size=3, padding=1)
+        self.c6_lat = nn.Conv2d(c6_out_ch, 256, kernel_size=3, padding=1)
+        self.c7_lat = nn.Conv2d(c7_out_ch, 256, kernel_size=3, padding=1)
+
+        self.ff_c5_c4 = FeatureFusion(main=c4_out_ch)
+        self.ff_c4_c3 = FeatureFusion(main=c3_out_ch)
+        self.ff_c3_c2 = FeatureFusion(main=c2_out_ch)
+
+    def forward(self, feature_list):
+        c2, c3, c4, c5 = feature_list
+        c6 = self.c6(c5)  # stride-2 conv stack on raw c5
+        c7 = self.c7(c6)  # stride-2 conv stack on c6
+
+        c5 = self.c5_lat(c5)
+        c6 = self.c6_lat(c6)
+        c7 = self.c7_lat(c7)
+
+        if self.out_dsfd_ft:
+            dsfd_fts = []
+            dsfd_fts.append(self.dsfd_modules[0](c2))
+            dsfd_fts.append(self.dsfd_modules[1](c3))
+            dsfd_fts.append(self.dsfd_modules[2](c4))
+            dsfd_fts.append(self.dsfd_modules[3](feature_list[-1]))  # raw c5
+            dsfd_fts.append(self.dsfd_modules[4](c6))
+            dsfd_fts.append(self.dsfd_modules[5](c7))
+
+        p4 = self.ff_c5_c4(c5, c4)  # upsample c5 and add projected c4
+        p3 = self.ff_c4_c3(p4, c3)
+        p2 = self.ff_c3_c2(p3, c2)
+
+        p2 = self.p2_lat(p2)
+        p3 = self.p3_lat(p3)
+        p4 = self.p4_lat(p4)
+
+        if self.out_dsfd_ft:  # NOTE(review): returns None otherwise
+            return ([p2, p3, p4, c5, c6, c7], dsfd_fts)
diff --git a/modelscope/models/cv/face_detection/mogface/models/mogprednet.py b/modelscope/models/cv/face_detection/mogface/models/mogprednet.py
new file mode 100644
index 00000000..31384976
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mogface/models/mogprednet.py
@@ -0,0 +1,164 @@
+# --------------------------------------------------------
+# The implementation is also open-sourced by its author, Yang Liu, and is publicly available at
+# https://github.com/damo-cv/MogFace
+# --------------------------------------------------------
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class conv_bn(nn.Module):
+    """Conv2d followed by BatchNorm2d, with no activation."""
+
+    def __init__(self, in_plane, out_plane, kernel_size, stride, padding):
+        super(conv_bn, self).__init__()
+        self.conv1 = nn.Conv2d(
+            in_plane,
+            out_plane,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding)
+        self.bn1 = nn.BatchNorm2d(out_plane)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        return self.bn1(x)
+
+
+class SSHContext(nn.Module):
+    """Context block: concatenates three conv branches along dim 1."""
+    def __init__(self, channels, Xchannels=256):
+        super(SSHContext, self).__init__()
+
+        self.conv1 = nn.Conv2d(
+            channels, Xchannels, kernel_size=3, stride=1, padding=1)
+        self.conv2 = nn.Conv2d(
+            channels,
+            Xchannels // 2,
+            kernel_size=3,
+            dilation=2,
+            stride=1,
+            padding=2)
+        self.conv2_1 = nn.Conv2d(
+            Xchannels // 2, Xchannels // 2, kernel_size=3, stride=1, padding=1)
+        self.conv2_2 = nn.Conv2d(
+            Xchannels // 2,
+            Xchannels // 2,
+            kernel_size=3,
+            dilation=2,
+            stride=1,
+            padding=2)
+        self.conv2_2_1 = nn.Conv2d(
+            Xchannels // 2, Xchannels // 2, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, x):
+        x1 = F.relu(self.conv1(x), inplace=True)
+        x2 = F.relu(self.conv2(x), inplace=True)
+        x2_1 = F.relu(self.conv2_1(x2), inplace=True)
+        x2_2 = F.relu(self.conv2_2(x2), inplace=True)
+        x2_2 = F.relu(self.conv2_2_1(x2_2), inplace=True)
+        # x1: plain 3x3 branch; x2_1 / x2_2: branches built on the dilated x2
+        return torch.cat([x1, x2_1, x2_2], 1)
+
+
+class DeepHead(nn.Module):
+    """Head applying conv -> (optional GroupNorm) -> ReLU repeatedly."""
+
+    def __init__(self,
+                 in_channel=256,
+                 out_channel=256,
+                 use_gn=False,
+                 num_conv=4):
+        super(DeepHead, self).__init__()
+        self.use_gn = use_gn
+        self.num_conv = num_conv
+        self.conv1 = nn.Conv2d(in_channel, out_channel, 3, 1, 1)
+        self.conv2 = nn.Conv2d(out_channel, out_channel, 3, 1, 1)
+        self.conv3 = nn.Conv2d(out_channel, out_channel, 3, 1, 1)
+        self.conv4 = nn.Conv2d(out_channel, out_channel, 3, 1, 1)
+        if self.use_gn:
+            self.gn1 = nn.GroupNorm(16, out_channel)
+            self.gn2 = nn.GroupNorm(16, out_channel)
+            self.gn3 = nn.GroupNorm(16, out_channel)
+            self.gn4 = nn.GroupNorm(16, out_channel)
+
+    def forward(self, x):
+        # NOTE(review): every stage calls self.conv1, conv2-conv4 are never
+        # applied; kept as is, confirm against the released weights.
+        if self.use_gn:
+            norms = (self.gn1, self.gn2, self.gn3, self.gn4)
+            for norm in norms:
+                x = F.relu(norm(self.conv1(x)), inplace=True)
+            return x
+
+        for stage in range(1, 5):
+            x = F.relu(self.conv1(x), inplace=True)
+            if stage == 2 and self.num_conv == 2:
+                return x
+        return x
+
+
+class MogPredNet(nn.Module):
+    """Prediction head shared by all pyramid levels (scores + offsets)."""
+
+    def __init__(self,
+                 num_anchor_per_pixel=1,
+                 num_classes=1,
+                 input_ch_list=[256, 256, 256, 256, 256, 256],
+                 use_deep_head=True,
+                 deep_head_with_gn=True,
+                 use_ssh=True,
+                 deep_head_ch=512):
+        super(MogPredNet, self).__init__()
+        self.num_classes = num_classes
+        self.use_deep_head = use_deep_head
+        self.deep_head_with_gn = deep_head_with_gn
+        self.use_ssh = use_ssh
+        self.deep_head_ch = deep_head_ch
+
+        if self.use_ssh:
+            self.conv_SSH = SSHContext(input_ch_list[0],
+                                       self.deep_head_ch // 2)
+
+        if self.use_deep_head:
+            # Build both heads for either norm setting; forward() needs them.
+            self.deep_loc_head = DeepHead(
+                deep_head_ch, deep_head_ch, use_gn=deep_head_with_gn)
+            self.deep_cls_head = DeepHead(
+                deep_head_ch, deep_head_ch, use_gn=deep_head_with_gn)
+
+            self.pred_cls = nn.Conv2d(self.deep_head_ch,
+                                      1 * num_anchor_per_pixel, 3, 1, 1)
+            self.pred_loc = nn.Conv2d(self.deep_head_ch,
+                                      4 * num_anchor_per_pixel, 3, 1, 1)
+
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, pyramid_feature_list, dsfd_ft_list=None):
+        """Return (scores [B, -1, num_classes], offsets [B, -1, 4])."""
+        loc = []
+        conf = []
+
+        if self.use_deep_head:
+            for x in pyramid_feature_list:
+                if self.use_ssh:
+                    x = self.conv_SSH(x)
+                x_cls = self.deep_cls_head(x)
+                x_loc = self.deep_loc_head(x)
+
+                conf.append(
+                    self.pred_cls(x_cls).permute(0, 2, 3, 1).contiguous())
+                loc.append(
+                    self.pred_loc(x_loc).permute(0, 2, 3, 1).contiguous())
+
+        loc = torch.cat([o.view(o.size(0), -1, 4) for o in loc], 1)
+        conf = torch.cat(
+            [o.view(o.size(0), -1, self.num_classes) for o in conf], 1)
+        output = (
+            self.sigmoid(conf.view(conf.size(0), -1, self.num_classes)),
+            loc.view(loc.size(0), -1, 4),
+        )
+
+        return output
diff --git a/modelscope/models/cv/face_detection/mogface/models/resnet.py b/modelscope/models/cv/face_detection/mogface/models/resnet.py
new file mode 100644
index 00000000..045f6fa3
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mogface/models/resnet.py
@@ -0,0 +1,193 @@
+# The implementation is modified from the original ResNet implementation, which
+#  is also open-sourced by its author, Yang Liu,
+#  and is publicly available at https://github.com/damo-cv/MogFace
+
+import torch.nn as nn
+
+
+def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
+    """3x3 bias-free convolution whose padding equals its dilation."""
+    return nn.Conv2d(
+        in_planes,
+        out_planes,
+        kernel_size=3,
+        stride=stride,
+        padding=dilation,
+        groups=groups,
+        bias=False,
+        dilation=dilation)
+
+
+def conv1x1(in_planes, out_planes, stride=1):
+    """1x1 convolution without bias."""
+    return nn.Conv2d(
+        in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
+
+
+class Bottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 downsample=None,
+                 groups=1,
+                 base_width=64,
+                 dilation=1,
+                 norm_layer=None):
+        super(Bottleneck, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        width = int(planes * (base_width / 64.)) * groups
+        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
+        self.conv1 = conv1x1(inplanes, width)
+        self.bn1 = norm_layer(width)
+        self.conv2 = conv3x3(width, width, stride, groups, dilation)
+        self.bn2 = norm_layer(width)
+        self.conv3 = conv1x1(width, planes * self.expansion)
+        self.bn3 = norm_layer(planes * self.expansion)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        identity = x
+        # residual branch: 1x1 -> 3x3 -> 1x1 convs, each followed by a norm
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+        # use the projected shortcut when a downsample module was supplied
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu(out)
+
+        return out
+
+
+class ResNet(nn.Module):
+    """Bottleneck ResNet backbone (depth 50, 101 or 152); forward() returns
+    the outputs of layer1 to layer4 as a list.
+    """
+
+    def __init__(self,
+                 depth=50,
+                 groups=1,
+                 width_per_group=64,
+                 replace_stride_with_dilation=None,
+                 norm_layer=None,
+                 inplanes=64,
+                 shrink_ch_ratio=1):
+        super(ResNet, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        self._norm_layer = norm_layer
+
+        if depth == 50:
+            block = Bottleneck
+            layers = [3, 4, 6, 3]
+        elif depth == 101:
+            block = Bottleneck
+            layers = [3, 4, 23, 3]
+        elif depth == 152:
+            block = Bottleneck
+            layers = [3, 4, 36, 3]
+        else:
+            raise ValueError('only support depth in [50, 101, 152]')
+
+        shrink_input_ch = int(inplanes * shrink_ch_ratio)
+        self.inplanes = int(inplanes * shrink_ch_ratio)
+        if shrink_ch_ratio == 0.125:
+            layers = [2, 3, 3, 3]
+
+        self.dilation = 1
+        if replace_stride_with_dilation is None:
+            # each element in the tuple indicates if we should replace
+            # the 2x2 stride with a dilated convolution instead
+            replace_stride_with_dilation = [False, False, False]
+        if len(replace_stride_with_dilation) != 3:
+            raise ValueError('replace_stride_with_dilation should be None '
+                             'or a 3-element tuple, got {}'.format(
+                                 replace_stride_with_dilation))
+        self.groups = groups
+        self.base_width = width_per_group
+        self.conv1 = nn.Conv2d(
+            3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
+        self.bn1 = norm_layer(self.inplanes)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, shrink_input_ch, layers[0])
+        self.layer2 = self._make_layer(
+            block,
+            shrink_input_ch * 2,
+            layers[1],
+            stride=2,
+            dilate=replace_stride_with_dilation[0])
+        self.layer3 = self._make_layer(
+            block,
+            shrink_input_ch * 4,
+            layers[2],
+            stride=2,
+            dilate=replace_stride_with_dilation[1])
+        self.layer4 = self._make_layer(
+            block,
+            shrink_input_ch * 8,
+            layers[3],
+            stride=2,
+            dilate=replace_stride_with_dilation[2])
+
+    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
+        norm_layer = self._norm_layer
+        downsample = None
+        previous_dilation = self.dilation
+        if dilate:
+            self.dilation *= stride
+            stride = 1
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                conv1x1(self.inplanes, planes * block.expansion, stride),
+                norm_layer(planes * block.expansion),
+            )
+
+        layers = []
+        layers.append(
+            block(self.inplanes, planes, stride, downsample, self.groups,
+                  self.base_width, previous_dilation, norm_layer))
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(
+                block(
+                    self.inplanes,
+                    planes,
+                    groups=self.groups,
+                    base_width=self.base_width,
+                    dilation=self.dilation,
+                    norm_layer=norm_layer))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+        four_conv_layer = []
+        x = self.layer1(x)
+        four_conv_layer.append(x)
+        x = self.layer2(x)
+        four_conv_layer.append(x)
+        x = self.layer3(x)
+        four_conv_layer.append(x)
+        x = self.layer4(x)
+        four_conv_layer.append(x)
+
+        return four_conv_layer
diff --git a/modelscope/models/cv/face_detection/mogface/models/utils.py b/modelscope/models/cv/face_detection/mogface/models/utils.py
new file mode 100755
index 00000000..377ceb3d
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mogface/models/utils.py
@@ -0,0 +1,212 @@
+# Modified from https://github.com/biubug6/Pytorch_Retinaface
+
+import math
+from itertools import product as product
+from math import ceil
+
+import numpy as np
+import torch
+
+
+def transform_anchor(anchors):
+    """
+    from [x0, y0, x1, y1] to [c_x, c_y, w, h]
+    x1 = x0 + w - 1
+    c_x = (x0 + x1) / 2 = (2x0 + w - 1) / 2 = x0 + (w - 1) / 2
+    """
+    return np.concatenate(((anchors[:, :2] + anchors[:, 2:]) / 2,
+                           anchors[:, 2:] - anchors[:, :2] + 1),
+                          axis=1)
+
+
+def normalize_anchor(anchors):
+    """
+    from [c_x, c_y, w, h] to [x0, y0, x1, y1]
+    """
+    item_1 = anchors[:, :2] - (anchors[:, 2:] - 1) / 2
+    item_2 = anchors[:, :2] + (anchors[:, 2:] - 1) / 2
+    return np.concatenate((item_1, item_2), axis=1)
+
+
+class MogPriorBox(object):
+    """
+    anchor generator for fpn and single layer (single layer still to be tested)
+    __call__ returns (np.array) [num_anchors, 4] as [cx, cy, w, h]
+    """
+
+    def __init__(self,
+                 scale_list=[1.],
+                 aspect_ratio_list=[1.0],
+                 stride_list=[4, 8, 16, 32, 64, 128],
+                 anchor_size_list=[16, 32, 64, 128, 256, 512]):
+        self.scale_list = scale_list
+        self.aspect_ratio_list = aspect_ratio_list
+        self.stride_list = stride_list
+        self.anchor_size_list = anchor_size_list
+
+    def __call__(self, img_height, img_width):
+        final_anchor_list = []
+
+        for idx, stride in enumerate(self.stride_list):
+            anchor_list = []
+            cur_img_height = img_height
+            cur_img_width = img_width
+            tmp_stride = stride
+            # halve the map size (rounding up) once per factor of 2 in stride
+            while tmp_stride != 1:
+                tmp_stride = tmp_stride // 2
+                cur_img_height = (cur_img_height + 1) // 2
+                cur_img_width = (cur_img_width + 1) // 2
+
+            for i in range(cur_img_height):
+                for j in range(cur_img_width):
+                    for scale in self.scale_list:
+                        cx = (j + 0.5) * stride
+                        cy = (i + 0.5) * stride
+                        side_x = self.anchor_size_list[idx] * scale
+                        side_y = self.anchor_size_list[idx] * scale
+                        for ratio in self.aspect_ratio_list:
+                            anchor_list.append([
+                                cx, cy, side_x / math.sqrt(ratio),
+                                side_y * math.sqrt(ratio)
+                            ])
+
+            final_anchor_list.append(anchor_list)
+        final_anchor_arr = np.concatenate(final_anchor_list, axis=0)
+        normalized_anchor_arr = normalize_anchor(final_anchor_arr).astype(
+            'float32')
+        transformed_anchor = transform_anchor(normalized_anchor_arr)
+
+        return transformed_anchor
+
+
+class PriorBox(object):
+    """Builds anchors as (cx, cy, w, h) normalised by the image size."""
+    def __init__(self, cfg, image_size=None, phase='train'):
+        super(PriorBox, self).__init__()
+        self.min_sizes = cfg['min_sizes']
+        self.steps = cfg['steps']
+        self.clip = cfg['clip']
+        self.image_size = image_size
+        self.feature_maps = [[
+            ceil(self.image_size[0] / step),
+            ceil(self.image_size[1] / step)
+        ] for step in self.steps]
+        self.name = 's'
+
+    def forward(self):
+        anchors = []
+        for k, f in enumerate(self.feature_maps):
+            min_sizes = self.min_sizes[k]
+            for i, j in product(range(f[0]), range(f[1])):
+                for min_size in min_sizes:
+                    s_kx = min_size / self.image_size[1]
+                    s_ky = min_size / self.image_size[0]
+                    dense_cx = [
+                        x * self.steps[k] / self.image_size[1]
+                        for x in [j + 0.5]
+                    ]
+                    dense_cy = [
+                        y * self.steps[k] / self.image_size[0]
+                        for y in [i + 0.5]
+                    ]
+                    for cy, cx in product(dense_cy, dense_cx):
+                        anchors += [cx, cy, s_kx, s_ky]
+
+        # back to torch land
+        output = torch.Tensor(anchors).view(-1, 4)
+        if self.clip:
+            output.clamp_(max=1, min=0)
+        return output
+
+
+def py_cpu_nms(dets, thresh):
+    """Greedy NMS on [x1, y1, x2, y2, score] rows; returns kept indices."""
+    left, top = dets[:, 0], dets[:, 1]
+    right, bottom = dets[:, 2], dets[:, 3]
+    # Pixel-inclusive box areas (hence the +1).
+    box_area = (right - left + 1) * (bottom - top + 1)
+    # Candidate indices, best score first.
+    remaining = dets[:, 4].argsort()[::-1]
+
+    kept = []
+    while remaining.size > 0:
+        best, rest = remaining[0], remaining[1:]
+        kept.append(best)
+
+        # Intersection of the best box with every other candidate.
+        inter_w = np.maximum(
+            0.0,
+            np.minimum(right[best], right[rest])
+            - np.maximum(left[best], left[rest]) + 1)
+        inter_h = np.maximum(
+            0.0,
+            np.minimum(bottom[best], bottom[rest])
+            - np.maximum(top[best], top[rest]) + 1)
+        overlap_area = inter_w * inter_h
+        iou = overlap_area / (box_area[best] + box_area[rest] - overlap_area)
+
+        # Keep only candidates that do not overlap the best box too much.
+        remaining = rest[np.where(iou <= thresh)[0]]
+    return kept
+
+
+def mogdecode(loc, anchors):
+    """
+    loc: 2-d, torch.Tensor (tx, ty, tw, th) offsets relative to anchors
+    anchors: 2-d, torch.Tensor (cx, cy, w, h)
+    boxes: 2-d, torch.Tensor (x0, y0, x1, y1)
+    """
+    # decode to (cx, cy, w, h): shift centres by loc * size, scale by exp
+    boxes = torch.cat((anchors[:, :2] + loc[:, :2] * anchors[:, 2:],
+                       anchors[:, 2:] * torch.exp(loc[:, 2:])), 1)
+    # centre/size -> corners with the inclusive convention w = x1 - x0 + 1
+    boxes[:, 0] -= (boxes[:, 2] - 1) / 2
+    boxes[:, 1] -= (boxes[:, 3] - 1) / 2
+    boxes[:, 2] += boxes[:, 0] - 1
+    boxes[:, 3] += boxes[:, 1] - 1
+
+    return boxes
+
+
+# Adapted from https://github.com/Hakuyume/chainer-ssd
+def decode(loc, priors, variances):
+    """Turn regression outputs back into corner-form boxes, inverting the
+    offset encoding that was applied against the priors at train time.
+    Args:
+        loc (tensor): predicted offsets, shape [num_priors, 4]
+        priors (tensor): prior boxes as (cx, cy, w, h),
+            shape [num_priors, 4]
+        variances (list[float]): scale for the centre offsets
+            (index 0) and for the log-size offsets (index 1)
+    Return:
+        tensor [num_priors, 4] of boxes as (x_min, y_min, x_max, y_max)
+    """
+    prior_xy, prior_wh = priors[:, :2], priors[:, 2:]
+    centres = prior_xy + loc[:, :2] * variances[0] * prior_wh
+    sizes = prior_wh * torch.exp(loc[:, 2:] * variances[1])
+    # centre/size form -> corner form
+    top_left = centres - sizes / 2
+    bottom_right = sizes + top_left
+    return torch.cat((top_left, bottom_right), 1)
+
+
+def decode_landm(pre, priors, variances):
+    """Turn landmark regression outputs back into point coordinates,
+    inverting the offset encoding applied against the priors at train time.
+    Args:
+        pre (tensor): predicted landmark offsets,
+            shape [num_priors, 10]
+        priors (tensor): prior boxes as (cx, cy, w, h),
+            shape [num_priors, 4]
+        variances (list[float]): only index 0 is used, as offset scale
+    Return:
+        tensor [num_priors, 10] holding five decoded (x, y) pairs
+    """
+    anchor_xy, anchor_wh = priors[:, :2], priors[:, 2:]
+    # each consecutive (x, y) pair in `pre` is one landmark offset
+    points = [
+        anchor_xy + pre[:, k:k + 2] * variances[0] * anchor_wh
+        for k in range(0, 10, 2)
+    ]
+    return torch.cat(points, dim=1)
diff --git a/modelscope/models/cv/face_detection/mtcnn/__init__.py b/modelscope/models/cv/face_detection/mtcnn/__init__.py
new file mode 100644
index 00000000..9fddab9c
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mtcnn/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .models.detector import MtcnnFaceDetector
diff --git a/modelscope/models/cv/face_detection/mtcnn/models/__init__.py b/modelscope/models/cv/face_detection/mtcnn/models/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/face_detection/mtcnn/models/box_utils.py b/modelscope/models/cv/face_detection/mtcnn/models/box_utils.py
new file mode 100644
index 00000000..f6a27b05
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mtcnn/models/box_utils.py
@@ -0,0 +1,240 @@
+# The implementation is based on mtcnn, available at https://github.com/TropComplique/mtcnn-pytorch
+import numpy as np
+from PIL import Image
+
+
+def nms(boxes, overlap_threshold=0.5, mode='union'):
+    """Non-maximum suppression.
+
+    Arguments:
+        boxes: a float numpy array of shape [n, 5],
+            where each row is (xmin, ymin, xmax, ymax, score).
+        overlap_threshold: a float number; boxes overlapping a picked box
+            by more than this are discarded.
+        mode: 'union' (IoU) or 'min' (intersection / smaller area).
+
+    Returns:
+        list with indices of the selected boxes, highest score first
+
+    Raises:
+        ValueError: if mode is neither 'union' nor 'min'.
+    """
+    if mode not in ('union', 'min'):
+        raise ValueError("mode must be 'union' or 'min', got %r" % (mode, ))
+
+    # if there are no boxes, return the empty list
+    if len(boxes) == 0:
+        return []
+
+    # list of picked indices
+    pick = []
+
+    # grab the coordinates of the bounding boxes
+    x1, y1, x2, y2, score = [boxes[:, i] for i in range(5)]
+
+    area = (x2 - x1 + 1.0) * (y2 - y1 + 1.0)
+    ids = np.argsort(score)  # in increasing order
+
+    while len(ids) > 0:
+        # grab index of the largest value
+        last = len(ids) - 1
+        i = ids[last]
+        pick.append(i)
+        rest = ids[:last]
+
+        # corners of the intersection of box i with each remaining box
+        ix1 = np.maximum(x1[i], x1[rest])
+        iy1 = np.maximum(y1[i], y1[rest])
+        ix2 = np.minimum(x2[i], x2[rest])
+        iy2 = np.minimum(y2[i], y2[rest])
+
+        # width and height of intersection boxes
+        w = np.maximum(0.0, ix2 - ix1 + 1.0)
+        h = np.maximum(0.0, iy2 - iy1 + 1.0)
+
+        # intersections' areas
+        inter = w * h
+        if mode == 'min':
+            overlap = inter / np.minimum(area[i], area[rest])
+        else:
+            # intersection over union (IoU)
+            overlap = inter / (area[i] + area[rest] - inter)
+
+        # delete all boxes where overlap is too big
+        ids = np.delete(
+            ids,
+            np.concatenate([[last],
+                            np.where(overlap > overlap_threshold)[0]]))
+
+    return pick
+
+
+def convert_to_square(bboxes):
+    """Convert bounding boxes to a square form.
+
+    Arguments:
+        bboxes: a float numpy array of shape [n, 5]; it is not modified.
+
+    Returns:
+        a new float numpy array of shape [n, 5]: squared bounding boxes,
+            with the score column carried over unchanged.
+    """
+    # start from a copy so columns beyond the coordinates (score) survive
+    square_bboxes = bboxes.copy()
+    x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)]
+    h = y2 - y1 + 1.0
+    w = x2 - x1 + 1.0
+    max_side = np.maximum(h, w)
+    square_bboxes[:, 0] = x1 + w * 0.5 - max_side * 0.5
+    square_bboxes[:, 1] = y1 + h * 0.5 - max_side * 0.5
+    square_bboxes[:, 2] = square_bboxes[:, 0] + max_side - 1.0
+    square_bboxes[:, 3] = square_bboxes[:, 1] + max_side - 1.0
+    return square_bboxes
+
+
+def calibrate_box(bboxes, offsets):
+    """Transform bounding boxes to be more like true bounding boxes.
+    'offsets' is one of the outputs of the nets.
+
+    Arguments:
+        bboxes: a float numpy array of shape [n, 5]; updated in place.
+        offsets: a float numpy array of shape [n, 4].
+
+    Returns:
+        the same `bboxes` array, shape [n, 5], with shifted coordinates.
+    """
+    x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)]
+    w = x2 - x1 + 1.0
+    h = y2 - y1 + 1.0
+    w = np.expand_dims(w, 1)
+    h = np.expand_dims(h, 1)
+
+    # this is what happening here:
+    # tx1, ty1, tx2, ty2 = [offsets[:, i] for i in range(4)]
+    # x1_true = x1 + tx1*w
+    # y1_true = y1 + ty1*h
+    # x2_true = x2 + tx2*w
+    # y2_true = y2 + ty2*h
+    # below is just more compact form of this
+
+    # are offsets always such that
+    # x1 < x2 and y1 < y2 ?
+
+    translation = np.hstack([w, h, w, h]) * offsets
+    bboxes[:, 0:4] = bboxes[:, 0:4] + translation
+    return bboxes
+
+
+def get_image_boxes(bounding_boxes, img, size=24):
+    """Cut out boxes from the image.
+
+    Arguments:
+        bounding_boxes: a float numpy array of shape [n, 5].
+        img: an instance of PIL.Image.
+        size: an integer, size of cutouts.
+
+    Returns:
+        a float numpy array of shape [n, 3, size, size].
+    """
+
+    num_boxes = len(bounding_boxes)
+    width, height = img.size
+
+    [dy, edy, dx, edx, y, ey, x, ex, w,
+     h] = correct_bboxes(bounding_boxes, width, height)
+    img_boxes = np.zeros((num_boxes, 3, size, size), 'float32')
+    # convert the image once; the conversion does not depend on the box
+    img_array = np.asarray(img, 'uint8')
+
+    for i in range(num_boxes):
+        img_box = np.zeros((h[i], w[i], 3), 'uint8')
+        img_box[dy[i]:(edy[i] + 1), dx[i]:(edx[i] + 1), :] =\
+            img_array[y[i]:(ey[i] + 1), x[i]:(ex[i] + 1), :]
+
+        # resize
+        img_box = Image.fromarray(img_box)
+        img_box = img_box.resize((size, size), Image.BILINEAR)
+        img_box = np.asarray(img_box, 'float32')
+
+        img_boxes[i, :, :, :] = _preprocess(img_box)
+
+    return img_boxes
+
+
+def correct_bboxes(bboxes, width, height):
+    """Crop boxes that are too big and get coordinates
+    with respect to cutouts.
+
+    Arguments:
+        bboxes: a float numpy array of shape [n, 5],
+            where each row is (xmin, ymin, xmax, ymax, score).
+        width: a float number.
+        height: a float number.
+
+    Returns:
+        dy, dx, edy, edx: int numpy arrays of shape [n],
+            coordinates of the boxes with respect to the cutouts.
+        y, x, ey, ex: int numpy arrays of shape [n],
+            corrected ymin, xmin, ymax, xmax.
+        h, w: int numpy arrays of shape [n],
+            just heights and widths of boxes.
+
+        in the following order:
+            [dy, edy, dx, edx, y, ey, x, ex, w, h].
+    """
+
+    x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)]
+    w, h = x2 - x1 + 1.0, y2 - y1 + 1.0
+    num_boxes = bboxes.shape[0]
+
+    # 'e' stands for end: (x, y) -> (ex, ey). These are views into `bboxes`,
+    # so the clipping below also rewrites the caller's array in place.
+    x, y, ex, ey = x1, y1, x2, y2
+
+    # we need to cut out a box from the image.
+    # (x, y, ex, ey) are corrected coordinates of the box
+    # in the image.
+    # (dx, dy, edx, edy) are coordinates of the box in the cutout
+    # from the image.
+    dx, dy = np.zeros((num_boxes, )), np.zeros((num_boxes, ))
+    edx, edy = w.copy() - 1.0, h.copy() - 1.0
+
+    # if box's bottom right corner is too far right
+    ind = np.where(ex > width - 1.0)[0]
+    edx[ind] = w[ind] + width - 2.0 - ex[ind]
+    ex[ind] = width - 1.0
+
+    # if box's bottom right corner is too low
+    ind = np.where(ey > height - 1.0)[0]
+    edy[ind] = h[ind] + height - 2.0 - ey[ind]
+    ey[ind] = height - 1.0
+
+    # if box's top left corner is too far left
+    ind = np.where(x < 0.0)[0]
+    dx[ind] = 0.0 - x[ind]
+    x[ind] = 0.0
+
+    # if box's top left corner is too high
+    ind = np.where(y < 0.0)[0]
+    dy[ind] = 0.0 - y[ind]
+    y[ind] = 0.0
+
+    return_list = [dy, edy, dx, edx, y, ey, x, ex, w, h]
+    return_list = [i.astype('int32') for i in return_list]
+
+    return return_list
+
+
+def _preprocess(img):
+    """Normalise an image and lay it out as a one-element NCHW batch.
+
+    Arguments:
+        img: a float numpy array of shape [h, w, c].
+
+    Returns:
+        a float numpy array of shape [1, c, h, w].
+    """
+    # HWC -> CHW, then prepend the batch axis
+    batch = np.transpose(img, (2, 0, 1))[np.newaxis, ...]
+    # (x - 127.5) / 128
+    return (batch - 127.5) * 0.0078125
diff --git a/modelscope/models/cv/face_detection/mtcnn/models/detector.py b/modelscope/models/cv/face_detection/mtcnn/models/detector.py
new file mode 100644
index 00000000..9c3aca3a
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mtcnn/models/detector.py
@@ -0,0 +1,149 @@
+# The implementation is based on mtcnn, available at https://github.com/TropComplique/mtcnn-pytorch
+import os
+
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+from PIL import Image
+from torch.autograd import Variable
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import Tasks
+from .box_utils import calibrate_box, convert_to_square, get_image_boxes, nms
+from .first_stage import run_first_stage
+from .get_nets import ONet, PNet, RNet
+
+
+@MODELS.register_module(Tasks.face_detection, module_name=Models.mtcnn)
+class MtcnnFaceDetector(TorchModel):
+    """MTCNN face detector: P-Net proposals refined by R-Net and O-Net."""
+
+    def __init__(self, model_path, device='cuda'):
+        super().__init__(model_path)
+        torch.set_grad_enabled(False)
+        cudnn.benchmark = True
+        self.model_path = model_path
+        self.device = device
+
+        self.pnet = PNet(model_path=os.path.join(self.model_path, 'pnet.npy'))
+        self.rnet = RNet(model_path=os.path.join(self.model_path, 'rnet.npy'))
+        self.onet = ONet(model_path=os.path.join(self.model_path, 'onet.npy'))
+
+        self.pnet = self.pnet.to(device)
+        self.rnet = self.rnet.to(device)
+        self.onet = self.onet.to(device)
+
+    def forward(self, input):
+        """Detect faces in input['img'] and return (bounding_boxes [n, 5],
+        landmarks [n, 10]); both are empty when nothing is detected.
+        """
+        image = Image.fromarray(np.uint8(input['img'].cpu().numpy()))
+        self.onet.eval()
+
+        min_face_size = 20.0
+        thresholds = [0.7, 0.8, 0.9]
+        nms_thresholds = [0.7, 0.7, 0.7]
+
+        # BUILD AN IMAGE PYRAMID
+        width, height = image.size
+        min_length = min(height, width)
+
+        min_detection_size = 12
+        factor = 0.707  # sqrt(0.5)
+
+        # scales for scaling the image
+        scales = []
+        m = min_detection_size / min_face_size
+        min_length *= m
+
+        factor_count = 0
+        while min_length > min_detection_size:
+            scales.append(m * factor**factor_count)
+            min_length *= factor
+            factor_count += 1
+
+        # STAGE 1: run P-Net on different scales
+
+        bounding_boxes = []
+        for s in scales:
+            boxes = run_first_stage(
+                image,
+                self.pnet,
+                scale=s,
+                threshold=thresholds[0],
+                device=self.device)
+            bounding_boxes.append(boxes)
+
+        # collect boxes (and offsets, and scores) from different scales
+        bounding_boxes = [i for i in bounding_boxes if i is not None]
+        if len(bounding_boxes) == 0:
+            # no scale produced a candidate; np.vstack([]) would raise
+            return [], []
+        bounding_boxes = np.vstack(bounding_boxes)
+
+        keep = nms(bounding_boxes[:, 0:5], nms_thresholds[0])
+        bounding_boxes = bounding_boxes[keep]
+
+        # use offsets predicted by pnet to transform bounding boxes
+        bounding_boxes = calibrate_box(bounding_boxes[:, 0:5],
+                                       bounding_boxes[:, 5:])
+        # shape [n_boxes, 5]
+
+        bounding_boxes = convert_to_square(bounding_boxes)
+        bounding_boxes[:, 0:4] = np.round(bounding_boxes[:, 0:4])
+
+        # STAGE 2
+        # (plain tensors: Variable(volatile=True) is a no-op since torch 0.4)
+        img_boxes = get_image_boxes(bounding_boxes, image, size=24)
+        img_boxes = torch.FloatTensor(img_boxes)
+        output = self.rnet(img_boxes.to(self.device))
+        offsets = output[0].cpu().data.numpy()  # shape [n_boxes, 4]
+        probs = output[1].cpu().data.numpy()  # shape [n_boxes, 2]
+
+        keep = np.where(probs[:, 1] > thresholds[1])[0]
+        bounding_boxes = bounding_boxes[keep]
+        bounding_boxes[:, 4] = probs[keep, 1].reshape((-1, ))
+        offsets = offsets[keep]
+
+        keep = nms(bounding_boxes, nms_thresholds[1])
+        bounding_boxes = bounding_boxes[keep]
+        bounding_boxes = calibrate_box(bounding_boxes, offsets[keep])
+        bounding_boxes = convert_to_square(bounding_boxes)
+        bounding_boxes[:, 0:4] = np.round(bounding_boxes[:, 0:4])
+
+        # STAGE 3
+
+        img_boxes = get_image_boxes(bounding_boxes, image, size=48)
+        if len(img_boxes) == 0:
+            return [], []
+        img_boxes = torch.FloatTensor(img_boxes)
+        output = self.onet(img_boxes.to(self.device))
+        landmarks = output[0].cpu().data.numpy()  # shape [n_boxes, 10]
+        offsets = output[1].cpu().data.numpy()  # shape [n_boxes, 4]
+        probs = output[2].cpu().data.numpy()  # shape [n_boxes, 2]
+
+        keep = np.where(probs[:, 1] > thresholds[2])[0]
+        bounding_boxes = bounding_boxes[keep]
+        bounding_boxes[:, 4] = probs[keep, 1].reshape((-1, ))
+        offsets = offsets[keep]
+        landmarks = landmarks[keep]
+
+        # compute landmark points
+        width = bounding_boxes[:, 2] - bounding_boxes[:, 0] + 1.0
+        height = bounding_boxes[:, 3] - bounding_boxes[:, 1] + 1.0
+        xmin, ymin = bounding_boxes[:, 0], bounding_boxes[:, 1]
+        landmarks[:, 0:5] = np.expand_dims(
+            xmin, 1) + np.expand_dims(width, 1) * landmarks[:, 0:5]
+        landmarks[:, 5:10] = np.expand_dims(
+            ymin, 1) + np.expand_dims(height, 1) * landmarks[:, 5:10]
+
+        bounding_boxes = calibrate_box(bounding_boxes, offsets)
+        keep = nms(bounding_boxes, nms_thresholds[2], mode='min')
+        bounding_boxes = bounding_boxes[keep]
+        landmarks = landmarks[keep]
+        landmarks = landmarks.reshape(-1, 2, 5).transpose(
+            (0, 2, 1)).reshape(-1, 10)
+
+        return bounding_boxes, landmarks
diff --git a/modelscope/models/cv/face_detection/mtcnn/models/first_stage.py b/modelscope/models/cv/face_detection/mtcnn/models/first_stage.py
new file mode 100644
index 00000000..e2aba47e
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mtcnn/models/first_stage.py
@@ -0,0 +1,100 @@
+# The implementation is based on mtcnn, available at https://github.com/TropComplique/mtcnn-pytorch
+import math
+
+import numpy as np
+import torch
+from PIL import Image
+from torch.autograd import Variable
+
+from .box_utils import _preprocess, nms
+
+
+def run_first_stage(image, net, scale, threshold, device='cuda'):
+    """Run P-Net on a rescaled copy of the image and NMS its proposals.
+
+    Arguments:
+        image: an instance of PIL.Image.
+        net: an instance of pytorch's nn.Module, P-Net.
+        scale: a float number,
+            scale width and height of the image by this number.
+        threshold: a float number,
+            threshold on the probability of a face when generating
+            bounding boxes from predictions of the net.
+        device: torch device on which the scaled image is fed to `net`.
+
+    Returns:
+        a float numpy array of shape [n_boxes, 9],
+            bounding boxes with scores and offsets (4 + 1 + 4),
+            or None when no window scores above `threshold`.
+    """
+    # scale the image and convert it to a float array
+    width, height = image.size
+    sw, sh = math.ceil(width * scale), math.ceil(height * scale)
+    img = image.resize((sw, sh), Image.BILINEAR)
+    img = np.asarray(img, 'float32')
+
+    # `Variable(..., volatile=True)` is a no-op since PyTorch 0.4 (it only
+    # warns), so inference is wrapped in `torch.no_grad()` instead.
+    with torch.no_grad():
+        output = net(torch.FloatTensor(_preprocess(img)).to(device))
+    # probs: probability of a face at each sliding window
+    # offsets: transformations to true bounding boxes
+    probs = output[1].cpu().numpy()[0, 1, :, :]
+    offsets = output[0].cpu().numpy()
+
+    boxes = _generate_bboxes(probs, offsets, scale, threshold)
+    if len(boxes) == 0:
+        return None
+    return boxes[nms(boxes[:, 0:5], overlap_threshold=0.5)]
+
+
+def _generate_bboxes(probs, offsets, scale, threshold):
+    """Turn the P-Net score map into candidate face boxes.
+
+    Arguments:
+        probs: a float numpy array of shape [n, m].
+        offsets: a float numpy array of shape [1, 4, n, m].
+        scale: a float number,
+            width and height of the image were scaled by this number.
+        threshold: a float number.
+
+    Returns:
+        a float numpy array of shape [n_boxes, 9]: box corners
+        (x1, y1, x2, y2), score and the four offsets of each kept cell;
+        an empty array when no cell scores above `threshold`.
+    """
+
+    # applying P-Net is equivalent, in some sense, to
+    # moving 12x12 window with stride 2
+    stride = 2
+    cell_size = 12
+
+    # grid positions (row, column) where there is probably a face
+    rows, cols = np.where(probs > threshold)
+    if rows.size == 0:
+        return np.array([])
+
+    # transformations of bounding boxes, one row per offset channel;
+    # they are defined as:
+    # w = x2 - x1 + 1
+    # h = y2 - y1 + 1
+    # x1_true = x1 + tx1*w
+    # x2_true = x2 + tx2*w
+    # y1_true = y1 + ty1*h
+    # y2_true = y2 + ty2*h
+    deltas = np.array([offsets[0, i, rows, cols] for i in range(4)])
+    score = probs[rows, cols]
+
+    # P-Net is applied to scaled images
+    # so we need to rescale bounding boxes back
+    # NOTE(review): why the extra +1.0 is added is not explained here
+    left = np.round((stride * cols + 1.0) / scale)
+    top = np.round((stride * rows + 1.0) / scale)
+    right = np.round((stride * cols + 1.0 + cell_size) / scale)
+    bottom = np.round((stride * rows + 1.0 + cell_size) / scale)
+
+    # rows of the stack: 4 corners, score, 4 offsets -> transpose to
+    # one box per row
+    stacked = np.vstack([left, top, right, bottom, score, deltas])
+
+    return stacked.T
diff --git a/modelscope/models/cv/face_detection/mtcnn/models/get_nets.py b/modelscope/models/cv/face_detection/mtcnn/models/get_nets.py
new file mode 100644
index 00000000..5fbbd33b
--- /dev/null
+++ b/modelscope/models/cv/face_detection/mtcnn/models/get_nets.py
@@ -0,0 +1,160 @@
+# The implementation is based on mtcnn, available at https://github.com/TropComplique/mtcnn-pytorch
+from collections import OrderedDict
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Flatten(nn.Module):
+    """Collapse every non-batch dimension into a single feature axis."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        """
+        Arguments:
+            x: a float tensor with shape [batch_size, c, h, w].
+        Returns:
+            a float tensor with shape [batch_size, c*h*w].
+        """
+        batch_size = x.size(0)
+        # the last two axes are swapped before flattening; per the original
+        # note, the pretrained model does not work without this
+        return x.transpose(2, 3).contiguous().view(batch_size, -1)
+
+
+class PNet(nn.Module):
+    """MTCNN P-Net; its weights are loaded from a numpy file."""
+
+    def __init__(self, model_path=None):
+        super(PNet, self).__init__()
+
+        # suppose we have input with size HxW, then
+        # after first layer: H - 2,
+        # after pool: ceil((H - 2)/2),
+        # after second conv: ceil((H - 2)/2) - 2,
+        # after last conv: ceil((H - 2)/2) - 4,
+        # and the same for W
+        self.features = nn.Sequential(
+            OrderedDict([('conv1', nn.Conv2d(3, 10, 3, 1)),
+                         ('prelu1', nn.PReLU(10)),
+                         ('pool1', nn.MaxPool2d(2, 2, ceil_mode=True)),
+                         ('conv2', nn.Conv2d(10, 16, 3, 1)),
+                         ('prelu2', nn.PReLU(16)),
+                         ('conv3', nn.Conv2d(16, 32, 3, 1)),
+                         ('prelu3', nn.PReLU(32))]))
+
+        self.conv4_1 = nn.Conv2d(32, 2, 1, 1)
+        self.conv4_2 = nn.Conv2d(32, 4, 1, 1)
+
+        # NOTE(review): allow_pickle=True unpickles the file, so model_path
+        # must only ever point at a trusted weights file.
+        weights = np.load(model_path, allow_pickle=True)[()]
+        for n, p in self.named_parameters():
+            p.data = torch.FloatTensor(weights[n])
+
+    def forward(self, x):
+        """
+        Arguments:
+            x: a float tensor with shape [batch_size, 3, h, w].
+        Returns:
+            b: a float tensor with shape [batch_size, 4, h', w'].
+            a: a float tensor with shape [batch_size, 2, h', w'].
+        """
+        x = self.features(x)
+        # explicit dim=1: the axis the deprecated implicit-dim call used
+        a = F.softmax(self.conv4_1(x), dim=1)
+        return self.conv4_2(x), a
+
+
+class RNet(nn.Module):
+    """MTCNN RNet; its weights are loaded from a numpy file."""
+
+    def __init__(self, model_path=None):
+        super(RNet, self).__init__()
+        self.features = nn.Sequential(
+            OrderedDict([('conv1', nn.Conv2d(3, 28, 3, 1)),
+                         ('prelu1', nn.PReLU(28)),
+                         ('pool1', nn.MaxPool2d(3, 2, ceil_mode=True)),
+                         ('conv2', nn.Conv2d(28, 48, 3, 1)),
+                         ('prelu2', nn.PReLU(48)),
+                         ('pool2', nn.MaxPool2d(3, 2, ceil_mode=True)),
+                         ('conv3', nn.Conv2d(48, 64, 2, 1)),
+                         ('prelu3', nn.PReLU(64)), ('flatten', Flatten()),
+                         ('conv4', nn.Linear(576, 128)),
+                         ('prelu4', nn.PReLU(128))]))
+
+        self.conv5_1 = nn.Linear(128, 2)
+        self.conv5_2 = nn.Linear(128, 4)
+
+        # NOTE(review): allow_pickle=True unpickles the file, so model_path
+        # must only ever point at a trusted weights file.
+        weights = np.load(model_path, allow_pickle=True)[()]
+        for n, p in self.named_parameters():
+            p.data = torch.FloatTensor(weights[n])
+
+    def forward(self, x):
+        """
+        Arguments:
+            x: a float tensor with shape [batch_size, 3, h, w].
+        Returns:
+            b: a float tensor with shape [batch_size, 4].
+            a: a float tensor with shape [batch_size, 2].
+        """
+        x = self.features(x)
+        # explicit dim=1: the axis the deprecated implicit-dim call used
+        a = F.softmax(self.conv5_1(x), dim=1)
+        return self.conv5_2(x), a
+
+
+class ONet(nn.Module):
+    """MTCNN ONet; its weights are loaded from a numpy file."""
+
+    def __init__(self, model_path=None):
+        super(ONet, self).__init__()
+
+        self.features = nn.Sequential(
+            OrderedDict([
+                ('conv1', nn.Conv2d(3, 32, 3, 1)),
+                ('prelu1', nn.PReLU(32)),
+                ('pool1', nn.MaxPool2d(3, 2, ceil_mode=True)),
+                ('conv2', nn.Conv2d(32, 64, 3, 1)),
+                ('prelu2', nn.PReLU(64)),
+                ('pool2', nn.MaxPool2d(3, 2, ceil_mode=True)),
+                ('conv3', nn.Conv2d(64, 64, 3, 1)),
+                ('prelu3', nn.PReLU(64)),
+                ('pool3', nn.MaxPool2d(2, 2, ceil_mode=True)),
+                ('conv4', nn.Conv2d(64, 128, 2, 1)),
+                ('prelu4', nn.PReLU(128)),
+                ('flatten', Flatten()),
+                ('conv5', nn.Linear(1152, 256)),
+                ('drop5', nn.Dropout(0.25)),
+                ('prelu5', nn.PReLU(256)),
+            ]))
+
+        self.conv6_1 = nn.Linear(256, 2)
+        self.conv6_2 = nn.Linear(256, 4)
+        self.conv6_3 = nn.Linear(256, 10)
+
+        # NOTE(review): allow_pickle=True unpickles the file, so model_path
+        # must only ever point at a trusted weights file.
+        weights = np.load(model_path, allow_pickle=True)[()]
+        for n, p in self.named_parameters():
+            p.data = torch.FloatTensor(weights[n])
+
+    def forward(self, x):
+        """
+        Arguments:
+            x: a float tensor with shape [batch_size, 3, h, w].
+        Returns:
+            c: a float tensor with shape [batch_size, 10].
+            b: a float tensor with shape [batch_size, 4].
+            a: a float tensor with shape [batch_size, 2].
+        """
+        x = self.features(x)
+        # explicit dim=1: the axis the deprecated implicit-dim call used
+        a = F.softmax(self.conv6_1(x), dim=1)
+        return self.conv6_3(x), self.conv6_2(x), a
diff --git a/modelscope/models/cv/face_detection/retinaface/__init__.py b/modelscope/models/cv/face_detection/retinaface/__init__.py
new file mode 100644
index 00000000..e7b589a1
--- /dev/null
+++ b/modelscope/models/cv/face_detection/retinaface/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .detection import RetinaFaceDetection
diff --git a/modelscope/models/cv/face_detection/retinaface/detection.py b/modelscope/models/cv/face_detection/retinaface/detection.py
new file mode 100755
index 00000000..3dd31659
--- /dev/null
+++ b/modelscope/models/cv/face_detection/retinaface/detection.py
@@ -0,0 +1,137 @@
+# The implementation is based on RetinaFace, available at https://github.com/biubug6/Pytorch_Retinaface
+import cv2
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from .models.retinaface import RetinaFace
+from .utils import PriorBox, decode, decode_landm, py_cpu_nms
+
+
+@MODELS.register_module(Tasks.face_detection, module_name=Models.retinaface)
+class RetinaFaceDetection(TorchModel):
+    """Face detector that wraps the RetinaFace network for inference."""
+    def __init__(self, model_path, device='cuda'):
+        super().__init__(model_path)
+        torch.set_grad_enabled(False)  # not scoped: autograd stays off
+        cudnn.benchmark = True
+        self.model_path = model_path
+        self.cfg = Config.from_file(
+            model_path.replace(ModelFile.TORCH_MODEL_FILE,
+                               ModelFile.CONFIGURATION))['models']
+        self.net = RetinaFace(cfg=self.cfg)
+        self.load_model()
+        self.device = device
+        self.net = self.net.to(self.device)
+        # NOTE(review): self.mean is not read anywhere in this class
+        self.mean = torch.tensor([[[[104]], [[117]], [[123]]]]).to(device)
+
+    def check_keys(self, pretrained_state_dict):
+        ckpt_keys = set(pretrained_state_dict.keys())
+        model_keys = set(self.net.state_dict().keys())
+        used_pretrained_keys = model_keys & ckpt_keys
+        assert len(
+            used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint'
+        return True
+
+    def remove_prefix(self, state_dict, prefix):
+        new_state_dict = dict()
+        for k, v in state_dict.items():
+            if k.startswith(prefix):
+                new_state_dict[k[len(prefix):]] = v
+            else:
+                new_state_dict[k] = v
+        return new_state_dict
+
+    def load_model(self, load_to_cpu=False):  # load_to_cpu is unused
+        pretrained_dict = torch.load(
+            self.model_path, map_location=torch.device('cpu'))
+        if 'state_dict' in pretrained_dict.keys():
+            pretrained_dict = self.remove_prefix(pretrained_dict['state_dict'],
+                                                 'module.')
+        else:
+            pretrained_dict = self.remove_prefix(pretrained_dict, 'module.')
+        self.check_keys(pretrained_dict)
+        self.net.load_state_dict(pretrained_dict, strict=False)
+        self.net.eval()
+
+    def forward(self, input):
+        img_raw = input['img'].cpu().numpy()
+        img = np.float32(img_raw)
+
+        im_height, im_width = img.shape[:2]
+        ss = 1.0
+        # images whose longer side exceeds 1500 px are shrunk to 1000 px
+        if max(im_height, im_width) > 1500:
+            ss = 1000.0 / max(im_height, im_width)
+            img = cv2.resize(img, (0, 0), fx=ss, fy=ss)
+            im_height, im_width = img.shape[:2]
+        # [w, h, w, h] of the (possibly resized) image; multiplies the boxes
+        scale = torch.Tensor(
+            [img.shape[1], img.shape[0], img.shape[1], img.shape[0]])
+        img -= (104, 117, 123)  # same per-channel values as self.mean
+        img = img.transpose(2, 0, 1)
+        img = torch.from_numpy(img).unsqueeze(0)
+        img = img.to(self.device)
+        scale = scale.to(self.device)
+
+        loc, conf, landms = self.net(img)  # forward pass
+        del img
+
+        confidence_threshold = 0.9
+        nms_threshold = 0.4
+        top_k = 5000
+        keep_top_k = 750
+        # decode boxes and landmarks against priors built for this image size
+        priorbox = PriorBox(self.cfg, image_size=(im_height, im_width))
+        priors = priorbox.forward()
+        priors = priors.to(self.device)
+        prior_data = priors.data
+        boxes = decode(loc.data.squeeze(0), prior_data, self.cfg['variance'])
+        boxes = boxes * scale
+        boxes = boxes.cpu().numpy()
+        scores = conf.squeeze(0).data.cpu().numpy()[:, 1]
+        landms = decode_landm(
+            landms.data.squeeze(0), prior_data, self.cfg['variance'])
+        scale1 = torch.Tensor([
+            im_width, im_height, im_width, im_height, im_width, im_height,
+            im_width, im_height, im_width, im_height
+        ])
+        scale1 = scale1.to(self.device)
+        landms = landms * scale1
+        landms = landms.cpu().numpy()
+
+        # ignore low scores
+        inds = np.where(scores > confidence_threshold)[0]
+        boxes = boxes[inds]
+        landms = landms[inds]
+        scores = scores[inds]
+
+        # keep top-K before NMS
+        order = scores.argsort()[::-1][:top_k]
+        boxes = boxes[order]
+        landms = landms[order]
+        scores = scores[order]
+
+        # do NMS
+        dets = np.hstack((boxes, scores[:, np.newaxis])).astype(
+            np.float32, copy=False)
+        keep = py_cpu_nms(dets, nms_threshold)
+        dets = dets[keep, :]
+        landms = landms[keep]
+
+        # keep top-K after NMS
+        dets = dets[:keep_top_k, :]
+        landms = landms[:keep_top_k, :]
+
+        landms = landms.reshape((-1, 5, 2))  # undone by the next reshape
+        landms = landms.reshape(
+            -1,
+            10,
+        )
+        return dets / ss, landms / ss  # divide out the resize factor
diff --git a/modelscope/models/cv/face_detection/retinaface/models/__init__.py b/modelscope/models/cv/face_detection/retinaface/models/__init__.py
new file mode 100755
index 00000000..e69de29b
diff --git a/modelscope/models/cv/face_detection/retinaface/models/net.py b/modelscope/models/cv/face_detection/retinaface/models/net.py
new file mode 100755
index 00000000..3be7c4b9
--- /dev/null
+++ b/modelscope/models/cv/face_detection/retinaface/models/net.py
@@ -0,0 +1,149 @@
+# The implementation is based on RetinaFace, available at https://github.com/biubug6/Pytorch_Retinaface
+import time
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.models as models
+import torchvision.models._utils as _utils
+from torch.autograd import Variable
+
+
+def conv_bn(inp, oup, stride=1, leaky=0):
+    """3x3 conv (pad 1, no bias), then BatchNorm and in-place LeakyReLU."""
+    conv = nn.Conv2d(inp, oup, 3, stride, 1, bias=False)
+    return nn.Sequential(conv, nn.BatchNorm2d(oup), nn.LeakyReLU(leaky, True))
+
+
+def conv_bn_no_relu(inp, oup, stride):
+    """3x3 conv (pad 1, no bias) followed by BatchNorm, no activation."""
+    conv = nn.Conv2d(inp, oup, 3, stride, 1, bias=False)
+    norm = nn.BatchNorm2d(oup)
+    return nn.Sequential(conv, norm)
+
+
+def conv_bn1X1(inp, oup, stride, leaky=0):
+    """1x1 conv (no pad, no bias), then BatchNorm and in-place LeakyReLU."""
+    conv = nn.Conv2d(inp, oup, 1, stride, padding=0, bias=False)
+    return nn.Sequential(conv, nn.BatchNorm2d(oup), nn.LeakyReLU(leaky, True))
+
+
+def conv_dw(inp, oup, stride, leaky=0.1):
+    """Depthwise 3x3 then pointwise 1x1 conv, each with BN + LeakyReLU."""
+    depthwise = nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False)
+    pointwise = nn.Conv2d(inp, oup, 1, 1, 0, bias=False)
+    return nn.Sequential(
+        depthwise, nn.BatchNorm2d(inp),
+        nn.LeakyReLU(negative_slope=leaky, inplace=True),
+        pointwise, nn.BatchNorm2d(oup),
+        nn.LeakyReLU(negative_slope=leaky, inplace=True))
+
+
+class SSH(nn.Module):
+    """Three parallel conv branches whose outputs are concatenated."""
+
+    def __init__(self, in_channel, out_channel):
+        """Branch widths are out_channel // 2, // 4 and // 4."""
+        super().__init__()
+        assert out_channel % 4 == 0
+        leaky = 0.1 if out_channel <= 64 else 0
+        half, quarter = out_channel // 2, out_channel // 4
+
+        self.conv3X3 = conv_bn_no_relu(in_channel, half, stride=1)
+
+        self.conv5X5_1 = conv_bn(in_channel, quarter, stride=1, leaky=leaky)
+        self.conv5X5_2 = conv_bn_no_relu(quarter, quarter, stride=1)
+
+        self.conv7X7_2 = conv_bn(quarter, quarter, stride=1, leaky=leaky)
+        self.conv7x7_3 = conv_bn_no_relu(quarter, quarter, stride=1)
+
+    def forward(self, input):
+        """Return the ReLU of the three branches concatenated on dim 1.
+
+        The 5x5 and 7x7 branches both start from `conv5X5_1`'s output.
+        """
+        out_3x3 = self.conv3X3(input)
+
+        shared = self.conv5X5_1(input)
+        out_5x5 = self.conv5X5_2(shared)
+        out_7x7 = self.conv7x7_3(self.conv7X7_2(shared))
+
+        merged = torch.cat([out_3x3, out_5x5, out_7x7], dim=1)
+        merged = F.relu(merged)
+        return merged
+
+
+class FPN(nn.Module):
+    """Feature pyramid that fuses three feature maps top-down."""
+
+    def __init__(self, in_channels_list, out_channels):
+        super().__init__()
+        leaky = 0.1 if out_channels <= 64 else 0
+        # 1x1 convs: bring each input level to `out_channels` channels
+        self.output1 = conv_bn1X1(
+            in_channels_list[0], out_channels, stride=1, leaky=leaky)
+        self.output2 = conv_bn1X1(
+            in_channels_list[1], out_channels, stride=1, leaky=leaky)
+        self.output3 = conv_bn1X1(
+            in_channels_list[2], out_channels, stride=1, leaky=leaky)
+
+        # 3x3 convs applied after each top-down addition
+        self.merge1 = conv_bn(out_channels, out_channels, leaky=leaky)
+        self.merge2 = conv_bn(out_channels, out_channels, leaky=leaky)
+
+    def forward(self, input):
+        """Return [level1, level2, level3] for a mapping of three maps.
+
+        Only the values of `input` are used, in iteration order.
+        """
+        feats = list(input.values())
+        level1 = self.output1(feats[0])
+        level2 = self.output2(feats[1])
+        level3 = self.output3(feats[2])
+
+        # resize the later level to the earlier level's height/width
+        # (nearest neighbour), add the two, then apply the merge conv
+        up3 = F.interpolate(
+            level3, size=list(level2.shape[2:4]), mode='nearest')
+        level2 = self.merge2(level2 + up3)
+        up2 = F.interpolate(
+            level2, size=list(level1.shape[2:4]), mode='nearest')
+        level1 = self.merge1(level1 + up2)
+        return [level1, level2, level3]
+
+
+class MobileNetV1(nn.Module):
+    """Three conv stages, global average pooling and a 1000-way linear."""
+
+    def __init__(self):
+        super().__init__()
+        # NOTE(review): trailing numbers are unverified; stage3 sums look off
+        self.stage1 = nn.Sequential(
+            conv_bn(3, 8, 2, leaky=0.1),  # 3
+            conv_dw(8, 16, 1),  # 7
+            conv_dw(16, 32, 2),  # 11
+            conv_dw(32, 32, 1),  # 19
+            conv_dw(32, 64, 2),  # 27
+            conv_dw(64, 64, 1),  # 43
+        )
+        self.stage2 = nn.Sequential(
+            conv_dw(64, 128, 2),  # 43 + 16 = 59
+            conv_dw(128, 128, 1),  # 59 + 32 = 91
+            conv_dw(128, 128, 1),  # 91 + 32 = 123
+            conv_dw(128, 128, 1),  # 123 + 32 = 155
+            conv_dw(128, 128, 1),  # 155 + 32 = 187
+            conv_dw(128, 128, 1),  # 187 + 32 = 219
+        )
+        self.stage3 = nn.Sequential(
+            conv_dw(128, 256, 2),  # 219 +3 2 = 241
+            conv_dw(256, 256, 1),  # 241 + 64 = 301
+        )
+        self.avg = nn.AdaptiveAvgPool2d((1, 1))
+        self.fc = nn.Linear(256, 1000)
+
+    def forward(self, x):
+        """Run the stages in order, pool to 256 features, then apply `fc`."""
+        for stage in (self.stage1, self.stage2, self.stage3):
+            x = stage(x)
+        x = self.avg(x).view(-1, 256)
+        return self.fc(x)
diff --git a/modelscope/models/cv/face_detection/retinaface/models/retinaface.py b/modelscope/models/cv/face_detection/retinaface/models/retinaface.py
new file mode 100755
index 00000000..8d2001dd
--- /dev/null
+++ b/modelscope/models/cv/face_detection/retinaface/models/retinaface.py
@@ -0,0 +1,145 @@
+# The implementation is based on RetinaFace, available at https://github.com/biubug6/Pytorch_Retinaface
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.models as models
+import torchvision.models._utils as _utils
+import torchvision.models.detection.backbone_utils as backbone_utils
+
+from .net import FPN, SSH, MobileNetV1
+
+
+class ClassHead(nn.Module):
+    """1x1 conv head emitting two class scores per anchor and location."""
+
+    def __init__(self, inchannels=512, num_anchors=3):
+        super().__init__()
+        self.num_anchors = num_anchors
+        self.conv1x1 = nn.Conv2d(
+            inchannels, num_anchors * 2, kernel_size=(1, 1), stride=1,
+            padding=0)
+
+    def forward(self, x):
+        """Map [batch, c, h, w] features to [batch, h*w*anchors, 2]."""
+        batch = x.shape[0]
+        scores = self.conv1x1(x)
+        # move channels last so the view groups them per location
+        scores = scores.permute(0, 2, 3, 1).contiguous()
+        return scores.view(batch, -1, 2)
+
+
+class BboxHead(nn.Module):
+    """1x1 conv head emitting four box regressions per anchor."""
+
+    def __init__(self, inchannels=512, num_anchors=3):
+        super().__init__()
+        self.conv1x1 = nn.Conv2d(
+            inchannels, num_anchors * 4, kernel_size=(1, 1), stride=1,
+            padding=0)
+
+    def forward(self, x):
+        """Map [batch, c, h, w] features to [batch, h*w*anchors, 4]."""
+        batch = x.shape[0]
+        deltas = self.conv1x1(x)
+        # move channels last so the view groups them per location
+        deltas = deltas.permute(0, 2, 3, 1).contiguous()
+        return deltas.view(batch, -1, 4)
+
+
+class LandmarkHead(nn.Module):
+    """1x1 conv head emitting ten landmark regressions per anchor."""
+
+    def __init__(self, inchannels=512, num_anchors=3):
+        super().__init__()
+        self.conv1x1 = nn.Conv2d(
+            inchannels, num_anchors * 10, kernel_size=(1, 1), stride=1,
+            padding=0)
+
+    def forward(self, x):
+        """Map [batch, c, h, w] features to [batch, h*w*anchors, 10]."""
+        batch = x.shape[0]
+        points = self.conv1x1(x)
+        # move channels last so the view groups them per location
+        points = points.permute(0, 2, 3, 1).contiguous()
+        return points.view(batch, -1, 10)
+
+
+class RetinaFace(nn.Module):
+    """RetinaFace detector network.
+
+    A ResNet-50 body feeds an FPN; each pyramid level goes through an SSH
+    block and then through its own class, box and landmark heads.
+    """
+
+    def __init__(self, cfg=None):
+        """
+        :param cfg:  Network related settings. Keys read here: 'name',
+            'pretrain', 'return_layers', 'in_channel', 'out_channel'.
+        """
+        super().__init__()
+        if cfg['name'] != 'Resnet50':
+            raise Exception('Invalid name')
+        backbone = models.resnet50(pretrained=cfg['pretrain'])
+
+        self.body = _utils.IntermediateLayerGetter(backbone,
+                                                   cfg['return_layers'])
+
+        # FPN input widths are 2x, 4x and 8x cfg['in_channel']
+        base = cfg['in_channel']
+        in_channels_list = [base * 2, base * 4, base * 8]
+        out_channels = cfg['out_channel']
+        self.fpn = FPN(in_channels_list, out_channels)
+        self.ssh1 = SSH(out_channels, out_channels)
+        self.ssh2 = SSH(out_channels, out_channels)
+        self.ssh3 = SSH(out_channels, out_channels)
+
+        self.ClassHead = self._make_class_head(
+            fpn_num=3, inchannels=out_channels)
+        self.BboxHead = self._make_bbox_head(
+            fpn_num=3, inchannels=out_channels)
+        self.LandmarkHead = self._make_landmark_head(
+            fpn_num=3, inchannels=out_channels)
+
+    def _make_class_head(self, fpn_num=3, inchannels=64, anchor_num=2):
+        """Build `fpn_num` ClassHead modules."""
+        return nn.ModuleList(
+            [ClassHead(inchannels, anchor_num) for _ in range(fpn_num)])
+
+    def _make_bbox_head(self, fpn_num=3, inchannels=64, anchor_num=2):
+        """Build `fpn_num` BboxHead modules."""
+        return nn.ModuleList(
+            [BboxHead(inchannels, anchor_num) for _ in range(fpn_num)])
+
+    def _make_landmark_head(self, fpn_num=3, inchannels=64, anchor_num=2):
+        """Build `fpn_num` LandmarkHead modules."""
+        return nn.ModuleList(
+            [LandmarkHead(inchannels, anchor_num) for _ in range(fpn_num)])
+
+    def forward(self, inputs):
+        """Return (box regressions, softmaxed class scores, landmarks).
+
+        Each element concatenates the per-level head outputs along dim 1.
+        """
+        # backbone feature maps, then the feature pyramid
+        pyramid = self.fpn(self.body(inputs))
+
+        # one SSH block per pyramid level
+        ssh_blocks = (self.ssh1, self.ssh2, self.ssh3)
+        features = [ssh(level) for ssh, level in zip(ssh_blocks, pyramid)]
+
+        # the i-th head of each kind is applied to the i-th feature map
+        bbox_regressions = torch.cat(
+            [head(feat) for head, feat in zip(self.BboxHead, features)],
+            dim=1)
+        classifications = torch.cat(
+            [head(feat) for head, feat in zip(self.ClassHead, features)],
+            dim=1)
+        ldm_regressions = torch.cat(
+            [head(feat) for head, feat in zip(self.LandmarkHead, features)],
+            dim=1)
+
+        # class scores are normalized over the last axis
+        scores = F.softmax(classifications, dim=-1)
+        return (bbox_regressions, scores, ldm_regressions)
diff --git a/modelscope/models/cv/face_detection/retinaface/utils.py b/modelscope/models/cv/face_detection/retinaface/utils.py
new file mode 100755
index 00000000..60c9e2dd
--- /dev/null
+++ b/modelscope/models/cv/face_detection/retinaface/utils.py
@@ -0,0 +1,123 @@
+# --------------------------------------------------------
+# Modified from https://github.com/biubug6/Pytorch_Retinaface
+# --------------------------------------------------------
+
+from itertools import product as product
+from math import ceil
+
+import numpy as np
+import torch
+
+
+class PriorBox(object):
+    """Builds (cx, cy, w, h) anchors, each divided by the image size."""
+
+    def __init__(self, cfg, image_size=None, phase='train'):
+        """Read 'min_sizes', 'steps' and 'clip' from `cfg`.
+
+        `image_size` is indexed as (height, width); `phase` is not used.
+        """
+        super().__init__()
+        self.min_sizes = cfg['min_sizes']
+        self.steps = cfg['steps']
+        self.clip = cfg['clip']
+        self.image_size = image_size
+        # grid size [rows, cols] of the feature map for every step
+        self.feature_maps = [[
+            ceil(self.image_size[0] / step),
+            ceil(self.image_size[1] / step)
+        ] for step in self.steps]
+        self.name = 's'
+
+    def forward(self):
+        """Return a float tensor of shape [n_anchors, 4]."""
+        height, width = self.image_size[0], self.image_size[1]
+        anchors = []
+        for k, (rows, cols) in enumerate(self.feature_maps):
+            step, min_sizes = self.steps[k], self.min_sizes[k]
+            for i, j in product(range(rows), range(cols)):
+                # centre of cell (i, j), as a fraction of the image size
+                cx = (j + 0.5) * step / width
+                cy = (i + 0.5) * step / height
+                for min_size in min_sizes:
+                    anchors.extend(
+                        [cx, cy, min_size / width, min_size / height])
+
+        output = torch.Tensor(anchors).view(-1, 4)
+        if self.clip:
+            output.clamp_(max=1, min=0)
+        return output
+
+
+def py_cpu_nms(dets, thresh):
+    """Greedy NMS on rows of (x1, y1, x2, y2, score).
+
+    Boxes are visited by descending score; a box is dropped when its IoU
+    with an already kept box exceeds `thresh`. Returns the kept row indices.
+    """
+    x1, y1, x2, y2, scores = (dets[:, col] for col in range(5))
+
+    # widths and heights are computed with a +1 on each side length
+    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    order = scores.argsort()[::-1]
+
+    keep = []
+    while order.size > 0:
+        top, rest = order[0], order[1:]
+        keep.append(top)
+
+        # overlap of the current best box with every remaining box
+        left = np.maximum(x1[top], x1[rest])
+        upper = np.maximum(y1[top], y1[rest])
+        right = np.minimum(x2[top], x2[rest])
+        lower = np.minimum(y2[top], y2[rest])
+        inter = (np.maximum(0.0, right - left + 1)
+                 * np.maximum(0.0, lower - upper + 1))
+        iou = inter / (areas[top] + areas[rest] - inter)
+
+        # keep only the boxes that do not overlap the winner too much
+        order = rest[iou <= thresh]
+    return keep
+
+
+# Adapted from https://github.com/Hakuyume/chainer-ssd
+def decode(loc, priors, variances):
+    """Turn regression outputs into corner-form boxes using the priors.
+
+    Args:
+        loc (tensor): location predictions for loc layers,
+            Shape: [num_priors,4]
+        priors (tensor): Prior boxes in center-offset form.
+            Shape: [num_priors,4].
+        variances: (list[float]) Variances of priorboxes
+    Return:
+        decoded bounding box predictions as (x1, y1, x2, y2) rows
+    """
+    prior_xy, prior_wh = priors[:, :2], priors[:, 2:]
+    centers = prior_xy + loc[:, :2] * variances[0] * prior_wh
+    sizes = prior_wh * torch.exp(loc[:, 2:] * variances[1])
+
+    # centre/size -> top-left and bottom-right corners
+    top_left = centers - sizes / 2
+    return torch.cat((top_left, sizes + top_left), 1)
+
+
+def decode_landm(pre, priors, variances):
+    """Map landmark regressions back to coordinates using the priors.
+
+    Args:
+        pre (tensor): landm predictions for loc layers,
+            Shape: [num_priors,10]
+        priors (tensor): Prior boxes in center-offset form.
+            Shape: [num_priors,4].
+        variances: (list[float]) Variances of priorboxes
+    Return:
+        decoded landm predictions, five (x, y) pairs per row
+    """
+    prior_xy, prior_wh = priors[:, :2], priors[:, 2:]
+    # every consecutive pair of columns in `pre` is one landmark offset
+    points = [
+        prior_xy + pre[:, col:col + 2] * variances[0] * prior_wh
+        for col in range(0, 10, 2)
+    ]
+    return torch.cat(points, dim=1)
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/__init__.py b/modelscope/models/cv/face_detection/ulfd_slim/__init__.py
new file mode 100644
index 00000000..af1e7b42
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .detection import UlfdFaceDetector
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/detection.py b/modelscope/models/cv/face_detection/ulfd_slim/detection.py
new file mode 100755
index 00000000..c0e2da6e
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/detection.py
@@ -0,0 +1,44 @@
+# The implementation is based on ULFD, available at
+# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB
+import os
+
+import cv2
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+import torch.nn.functional as F
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+from .vision.ssd.fd_config import define_img_size
+from .vision.ssd.mb_tiny_fd import (create_mb_tiny_fd,
+                                    create_mb_tiny_fd_predictor)
+
+define_img_size(640)
+
+
+@MODELS.register_module(Tasks.face_detection, module_name=Models.ulfd)
+class UlfdFaceDetector(TorchModel):
+    """Face detector wrapping the mb_tiny_fd network and its predictor."""
+
+    def __init__(self, model_path, device='cuda'):
+        super().__init__(model_path)
+        # inference only: autograd is switched off from here on
+        torch.set_grad_enabled(False)
+        cudnn.benchmark = True
+        self.model_path, self.device = model_path, device
+        self.net = create_mb_tiny_fd(2, is_test=True, device=device)
+        self.predictor = create_mb_tiny_fd_predictor(
+            self.net, candidate_size=1500, device=device)
+        self.net.load(model_path)
+        self.net = self.net.to(device)
+
+    def forward(self, input):
+        """Run the predictor on `input['img']`; return (boxes, probs)."""
+        # the third axis is reversed before prediction
+        # (presumably RGB <-> BGR -- TODO confirm against the caller)
+        img = np.array(input['img'].cpu().detach())[:, :, ::-1]
+        boxes, _, probs = self.predictor.predict(img, 750, 0.85)
+        return boxes, probs
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/__init__.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/box_utils.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/box_utils.py
new file mode 100644
index 00000000..46d3b890
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/box_utils.py
@@ -0,0 +1,124 @@
+# The implementation is based on ULFD, available at
+# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB
+import math
+
+import torch
+
+
+def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200):
+    """Greedy hard NMS: keep the best-scoring box, drop boxes overlapping it.
+
+    Args:
+        box_scores (N, 5): boxes in corner-form and probabilities.
+        iou_threshold: intersection over union threshold.
+        top_k: keep top_k results. If top_k <= 0, keep all the results.
+        candidate_size: only consider the candidates with the highest scores.
+    Returns:
+        The rows of box_scores that were kept, in descending score order.
+    """
+    scores = box_scores[:, -1]
+    boxes = box_scores[:, :-1]
+    picked = []
+    _, indexes = scores.sort(descending=True)
+    indexes = indexes[:candidate_size]
+    while len(indexes) > 0:
+        current = indexes[0]
+        picked.append(current.item())
+        if 0 < top_k == len(picked) or len(indexes) == 1:
+            break
+        current_box = boxes[current, :]
+        indexes = indexes[1:]
+        rest_boxes = boxes[indexes, :]
+        iou = iou_of(
+            rest_boxes,
+            current_box.unsqueeze(0),
+        )
+        indexes = indexes[iou <= iou_threshold]
+
+    return box_scores[picked, :]
+
+
+def nms(box_scores,
+        nms_method=None,
+        score_threshold=None,
+        iou_threshold=None,
+        sigma=0.5,
+        top_k=-1,
+        candidate_size=200):
+    """Run hard_nms; nms_method, score_threshold and sigma are ignored."""
+    return hard_nms(box_scores, iou_threshold, top_k, candidate_size)
+
+
+def generate_priors(feature_map_list,
+                    shrinkage_list,
+                    image_size,
+                    min_boxes,
+                    clamp=True) -> torch.Tensor:
+    priors = []
+    for index in range(0, len(feature_map_list[0])):
+        scale_w = image_size[0] / shrinkage_list[0][index]
+        scale_h = image_size[1] / shrinkage_list[1][index]
+        for j in range(0, feature_map_list[1][index]):
+            for i in range(0, feature_map_list[0][index]):
+                x_center = (i + 0.5) / scale_w
+                y_center = (j + 0.5) / scale_h
+
+                for min_box in min_boxes[index]:
+                    w = min_box / image_size[0]
+                    h = min_box / image_size[1]
+                    priors.append([x_center, y_center, w, h])
+    priors = torch.tensor(priors)
+    if clamp:
+        torch.clamp(priors, 0.0, 1.0, out=priors)
+    return priors
+
+
+def convert_locations_to_boxes(locations, priors, center_variance,
+                               size_variance):
+    # priors can have one dimension less.
+    if priors.dim() + 1 == locations.dim():
+        priors = priors.unsqueeze(0)
+    a = locations[..., :2] * center_variance * priors[...,
+                                                      2:] + priors[..., :2]
+    b = torch.exp(locations[..., 2:] * size_variance) * priors[..., 2:]
+
+    return torch.cat([a, b], dim=locations.dim() - 1)
+
+
+def center_form_to_corner_form(locations):
+    a = locations[..., :2] - locations[..., 2:] / 2
+    b = locations[..., :2] + locations[..., 2:] / 2
+    return torch.cat([a, b], locations.dim() - 1)
+
+
+def iou_of(boxes0, boxes1, eps=1e-5):
+    """Return intersection-over-union (Jaccard index) of boxes.
+
+    Args:
+        boxes0 (N, 4): ground truth boxes.
+        boxes1 (N or 1, 4): predicted boxes.
+        eps: a small number to avoid 0 as denominator.
+    Returns:
+        iou (N): IoU values.
+    """
+    overlap_left_top = torch.max(boxes0[..., :2], boxes1[..., :2])
+    overlap_right_bottom = torch.min(boxes0[..., 2:], boxes1[..., 2:])
+
+    overlap_area = area_of(overlap_left_top, overlap_right_bottom)
+    area0 = area_of(boxes0[..., :2], boxes0[..., 2:])
+    area1 = area_of(boxes1[..., :2], boxes1[..., 2:])
+    return overlap_area / (area0 + area1 - overlap_area + eps)
+
+
+def area_of(left_top, right_bottom) -> torch.Tensor:
+    """Compute the areas of rectangles given two corners.
+
+    Args:
+        left_top (N, 2): left top corner.
+        right_bottom (N, 2): right bottom corner.
+
+    Returns:
+        area (N): return the area.
+    """
+    hw = torch.clamp(right_bottom - left_top, min=0.0)
+    return hw[..., 0] * hw[..., 1]
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/mb_tiny.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/mb_tiny.py
new file mode 100644
index 00000000..8bbcef41
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/mb_tiny.py
@@ -0,0 +1,49 @@
+# The implementation is based on ULFD, available at
+# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Mb_Tiny(nn.Module):
+
+    def __init__(self, num_classes=2):
+        super(Mb_Tiny, self).__init__()
+        self.base_channel = 8 * 2
+
+        def conv_bn(inp, oup, stride):
+            return nn.Sequential(
+                nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
+                nn.BatchNorm2d(oup), nn.ReLU(inplace=True))
+
+        def conv_dw(inp, oup, stride):
+            return nn.Sequential(
+                nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
+                nn.BatchNorm2d(inp),
+                nn.ReLU(inplace=True),
+                nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(oup),
+                nn.ReLU(inplace=True),
+            )
+
+        self.model = nn.Sequential(
+            conv_bn(3, self.base_channel, 2),  # 160*120
+            conv_dw(self.base_channel, self.base_channel * 2, 1),
+            conv_dw(self.base_channel * 2, self.base_channel * 2, 2),  # 80*60
+            conv_dw(self.base_channel * 2, self.base_channel * 2, 1),
+            conv_dw(self.base_channel * 2, self.base_channel * 4, 2),  # 40*30
+            conv_dw(self.base_channel * 4, self.base_channel * 4, 1),
+            conv_dw(self.base_channel * 4, self.base_channel * 4, 1),
+            conv_dw(self.base_channel * 4, self.base_channel * 4, 1),
+            conv_dw(self.base_channel * 4, self.base_channel * 8, 2),  # 20*15
+            conv_dw(self.base_channel * 8, self.base_channel * 8, 1),
+            conv_dw(self.base_channel * 8, self.base_channel * 8, 1),
+            conv_dw(self.base_channel * 8, self.base_channel * 16, 2),  # 10*8
+            conv_dw(self.base_channel * 16, self.base_channel * 16, 1))
+        self.fc = nn.Linear(1024, num_classes)
+
+    def forward(self, x):
+        x = self.model(x)
+        x = F.avg_pool2d(x, 7)
+        x = x.view(-1, 1024)
+        x = self.fc(x)
+        return x
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/__init__.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/data_preprocessing.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/data_preprocessing.py
new file mode 100644
index 00000000..9251d67f
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/data_preprocessing.py
@@ -0,0 +1,18 @@
+# The implementation is based on ULFD, available at
+# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB
+from ..transforms import Compose, Resize, SubtractMeans, ToTensor
+
+
+class PredictionTransform:
+    """Resize, mean-subtract, divide by std, then convert to a tensor."""
+
+    def __init__(self, size, mean=0.0, std=1.0):
+        def scale(img, boxes=None, labels=None):
+            return img / std, boxes, labels
+
+        self.transform = Compose(
+            [Resize(size), SubtractMeans(mean), scale, ToTensor()])
+
+    def __call__(self, image):
+        image, _, _ = self.transform(image)
+        return image
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/fd_config.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/fd_config.py
new file mode 100644
index 00000000..495a2fcd
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/fd_config.py
@@ -0,0 +1,49 @@
+# The implementation is based on ULFD, available at
+# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB
+import numpy as np
+
+from ..box_utils import generate_priors
+
+image_mean_test = image_mean = np.array([127, 127, 127])
+image_std = 128.0
+iou_threshold = 0.3
+center_variance = 0.1
+size_variance = 0.2
+
+min_boxes = [[10, 16, 24], [32, 48], [64, 96], [128, 192, 256]]
+shrinkage_list = []
+image_size = [320, 240]  # default input size 320*240
+feature_map_w_h_list = [[40, 20, 10, 5], [30, 15, 8,
+                                          4]]  # default feature map size
+priors = []
+
+
+def define_img_size(size):
+    """Set image_size, feature_map_w_h_list, shrinkage_list and priors.
+
+    shrinkage_list is rebuilt in place so repeated calls leave no stale rows.
+    """
+    global image_size, feature_map_w_h_list, priors
+    img_size_dict = {
+        128: [128, 96],
+        160: [160, 120],
+        320: [320, 240],
+        480: [480, 360],
+        640: [640, 480],
+        1280: [1280, 960]
+    }
+    feature_map_w_h_list_dict = {
+        128: [[16, 8, 4, 2], [12, 6, 3, 2]],
+        160: [[20, 10, 5, 3], [15, 8, 4, 2]],
+        320: [[40, 20, 10, 5], [30, 15, 8, 4]],
+        480: [[60, 30, 15, 8], [45, 23, 12, 6]],
+        640: [[80, 40, 20, 10], [60, 30, 15, 8]],
+        1280: [[160, 80, 40, 20], [120, 60, 30, 15]]
+    }
+    image_size = img_size_dict[size]
+    feature_map_w_h_list = feature_map_w_h_list_dict[size]
+    # Slice-assign: keeps the list object other modules may already hold.
+    shrinkage_list[:] = [[dim / fm for fm in fms]
+                         for dim, fms in zip(image_size, feature_map_w_h_list)]
+    priors = generate_priors(feature_map_w_h_list, shrinkage_list, image_size,
+                             min_boxes)
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/mb_tiny_fd.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/mb_tiny_fd.py
new file mode 100644
index 00000000..91ed268d
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/mb_tiny_fd.py
@@ -0,0 +1,124 @@
+# The implementation is based on ULFD, available at
+# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB
+from torch.nn import Conv2d, ModuleList, ReLU, Sequential
+
+from ..mb_tiny import Mb_Tiny
+from . import fd_config as config
+from .predictor import Predictor
+from .ssd import SSD
+
+
+def SeperableConv2d(in_channels,
+                    out_channels,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0):
+    """Replace Conv2d with a depthwise Conv2d and Pointwise Conv2d.
+    """
+    return Sequential(
+        Conv2d(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=kernel_size,
+            groups=in_channels,
+            stride=stride,
+            padding=padding),
+        ReLU(),
+        Conv2d(
+            in_channels=in_channels, out_channels=out_channels, kernel_size=1),
+    )
+
+
+def create_mb_tiny_fd(num_classes, is_test=False, device='cuda'):
+    base_net = Mb_Tiny(2)
+    base_net_model = base_net.model  # disable dropout layer
+
+    source_layer_indexes = [8, 11, 13]
+    extras = ModuleList([
+        Sequential(
+            Conv2d(
+                in_channels=base_net.base_channel * 16,
+                out_channels=base_net.base_channel * 4,
+                kernel_size=1), ReLU(),
+            SeperableConv2d(
+                in_channels=base_net.base_channel * 4,
+                out_channels=base_net.base_channel * 16,
+                kernel_size=3,
+                stride=2,
+                padding=1), ReLU())
+    ])
+
+    regression_headers = ModuleList([
+        SeperableConv2d(
+            in_channels=base_net.base_channel * 4,
+            out_channels=3 * 4,
+            kernel_size=3,
+            padding=1),
+        SeperableConv2d(
+            in_channels=base_net.base_channel * 8,
+            out_channels=2 * 4,
+            kernel_size=3,
+            padding=1),
+        SeperableConv2d(
+            in_channels=base_net.base_channel * 16,
+            out_channels=2 * 4,
+            kernel_size=3,
+            padding=1),
+        Conv2d(
+            in_channels=base_net.base_channel * 16,
+            out_channels=3 * 4,
+            kernel_size=3,
+            padding=1)
+    ])
+
+    classification_headers = ModuleList([
+        SeperableConv2d(
+            in_channels=base_net.base_channel * 4,
+            out_channels=3 * num_classes,
+            kernel_size=3,
+            padding=1),
+        SeperableConv2d(
+            in_channels=base_net.base_channel * 8,
+            out_channels=2 * num_classes,
+            kernel_size=3,
+            padding=1),
+        SeperableConv2d(
+            in_channels=base_net.base_channel * 16,
+            out_channels=2 * num_classes,
+            kernel_size=3,
+            padding=1),
+        Conv2d(
+            in_channels=base_net.base_channel * 16,
+            out_channels=3 * num_classes,
+            kernel_size=3,
+            padding=1)
+    ])
+
+    return SSD(
+        num_classes,
+        base_net_model,
+        source_layer_indexes,
+        extras,
+        classification_headers,
+        regression_headers,
+        is_test=is_test,
+        config=config,
+        device=device)
+
+
+def create_mb_tiny_fd_predictor(net,
+                                candidate_size=200,
+                                nms_method=None,
+                                sigma=0.5,
+                                device=None):
+    predictor = Predictor(
+        net,
+        config.image_size,
+        config.image_mean_test,
+        config.image_std,
+        nms_method=nms_method,
+        iou_threshold=config.iou_threshold,
+        candidate_size=candidate_size,
+        sigma=sigma,
+        device=device)
+    return predictor
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/predictor.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/predictor.py
new file mode 100644
index 00000000..f71820a5
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/predictor.py
@@ -0,0 +1,80 @@
+# The implementation is based on ULFD, available at
+# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB
+import torch
+
+from .. import box_utils
+from .data_preprocessing import PredictionTransform
+
+
+class Predictor:
+    """Runs a detection net on one image and filters its boxes with NMS."""
+
+    def __init__(self,
+                 net,
+                 size,
+                 mean=0.0,
+                 std=1.0,
+                 nms_method=None,
+                 iou_threshold=0.3,
+                 filter_threshold=0.85,
+                 candidate_size=200,
+                 sigma=0.5,
+                 device=None):
+        self.net = net
+        self.transform = PredictionTransform(size, mean, std)
+        self.iou_threshold = iou_threshold
+        self.filter_threshold = filter_threshold
+        self.candidate_size = candidate_size
+        self.nms_method = nms_method
+
+        self.sigma = sigma
+        # Fall back to the first CUDA device, or CPU, when none is given.
+        self.device = device or torch.device(
+            'cuda:0' if torch.cuda.is_available() else 'cpu')
+
+        self.net.to(self.device)
+        self.net.eval()
+
+    def predict(self, image, top_k=-1, prob_threshold=None):
+        """Return (boxes, labels, probs) for one H x W x C image array."""
+        height, width, _ = image.shape
+        image = self.transform(image)
+        images = image.unsqueeze(0)
+        images = images.to(self.device)
+        with torch.no_grad():
+            scores, boxes = self.net.forward(images)
+        boxes = boxes[0]
+        scores = scores[0]
+        # `is None`, not falsiness: an explicit 0 threshold must be honoured.
+        if prob_threshold is None:
+            prob_threshold = self.filter_threshold
+        # NOTE: tensors are not moved to CPU here; NMS runs on the net's output.
+        picked_box_probs = []
+        picked_labels = []
+        for class_index in range(1, scores.size(1)):
+            probs = scores[:, class_index]
+            mask = probs > prob_threshold
+            probs = probs[mask]
+            if probs.size(0) == 0:
+                continue
+            subset_boxes = boxes[mask, :]
+            box_probs = torch.cat([subset_boxes, probs.reshape(-1, 1)], dim=1)
+            box_probs = box_utils.nms(
+                box_probs,
+                self.nms_method,
+                score_threshold=prob_threshold,
+                iou_threshold=self.iou_threshold,
+                sigma=self.sigma,
+                top_k=top_k,
+                candidate_size=self.candidate_size)
+            picked_box_probs.append(box_probs)
+            picked_labels.extend([class_index] * box_probs.size(0))
+        if not picked_box_probs:
+            return torch.tensor([]), torch.tensor([]), torch.tensor([])
+        picked_box_probs = torch.cat(picked_box_probs)
+        picked_box_probs[:, 0] *= width
+        picked_box_probs[:, 1] *= height
+        picked_box_probs[:, 2] *= width
+        picked_box_probs[:, 3] *= height
+        return picked_box_probs[:, :4], torch.tensor(
+            picked_labels), picked_box_probs[:, 4]
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/ssd.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/ssd.py
new file mode 100644
index 00000000..08ff93a4
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/ssd/ssd.py
@@ -0,0 +1,129 @@
+# The implementation is based on ULFD, available at
+# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB
+from collections import namedtuple
+from typing import List, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .. import box_utils
+
+GraphPath = namedtuple('GraphPath', ['s0', 'name', 's1'])
+
+
+class SSD(nn.Module):
+
+    def __init__(self,
+                 num_classes: int,
+                 base_net: nn.ModuleList,
+                 source_layer_indexes: List[int],
+                 extras: nn.ModuleList,
+                 classification_headers: nn.ModuleList,
+                 regression_headers: nn.ModuleList,
+                 is_test=False,
+                 config=None,
+                 device=None):
+        """Compose a SSD model using the given components.
+        """
+        super(SSD, self).__init__()
+
+        self.num_classes = num_classes
+        self.base_net = base_net
+        self.source_layer_indexes = source_layer_indexes
+        self.extras = extras
+        self.classification_headers = classification_headers
+        self.regression_headers = regression_headers
+        self.is_test = is_test
+        self.config = config
+
+        # register layers in source_layer_indexes by adding them to a module list
+        self.source_layer_add_ons = nn.ModuleList([
+            t[1] for t in source_layer_indexes
+            if isinstance(t, tuple) and not isinstance(t, GraphPath)
+        ])
+        if device:
+            self.device = device
+        else:
+            self.device = torch.device(
+                'cuda:0' if torch.cuda.is_available() else 'cpu')
+        if is_test:
+            self.config = config
+            self.priors = config.priors.to(self.device)
+
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        confidences = []
+        locations = []
+        start_layer_index = 0
+        header_index = 0
+        end_layer_index = 0
+        for end_layer_index in self.source_layer_indexes:
+            if isinstance(end_layer_index, GraphPath):
+                path = end_layer_index
+                end_layer_index = end_layer_index.s0
+                added_layer = None
+            elif isinstance(end_layer_index, tuple):
+                added_layer = end_layer_index[1]
+                end_layer_index = end_layer_index[0]
+                path = None
+            else:
+                added_layer = None
+                path = None
+            for layer in self.base_net[start_layer_index:end_layer_index]:
+                x = layer(x)
+            if added_layer:
+                y = added_layer(x)
+            else:
+                y = x
+            if path:
+                sub = getattr(self.base_net[end_layer_index], path.name)
+                for layer in sub[:path.s1]:
+                    x = layer(x)
+                y = x
+                for layer in sub[path.s1:]:
+                    x = layer(x)
+                end_layer_index += 1
+            start_layer_index = end_layer_index
+            confidence, location = self.compute_header(header_index, y)
+            header_index += 1
+            confidences.append(confidence)
+            locations.append(location)
+
+        for layer in self.base_net[end_layer_index:]:
+            x = layer(x)
+
+        for layer in self.extras:
+            x = layer(x)
+            confidence, location = self.compute_header(header_index, x)
+            header_index += 1
+            confidences.append(confidence)
+            locations.append(location)
+
+        confidences = torch.cat(confidences, 1)
+        locations = torch.cat(locations, 1)
+
+        if self.is_test:
+            confidences = F.softmax(confidences, dim=2)
+            boxes = box_utils.convert_locations_to_boxes(
+                locations, self.priors, self.config.center_variance,
+                self.config.size_variance)
+            boxes = box_utils.center_form_to_corner_form(boxes)
+            return confidences, boxes
+        else:
+            return confidences, locations
+
+    def compute_header(self, i, x):
+        confidence = self.classification_headers[i](x)
+        confidence = confidence.permute(0, 2, 3, 1).contiguous()
+        confidence = confidence.view(confidence.size(0), -1, self.num_classes)
+
+        location = self.regression_headers[i](x)
+        location = location.permute(0, 2, 3, 1).contiguous()
+        location = location.view(location.size(0), -1, 4)
+
+        return confidence, location
+
+    def load(self, model):
+        self.load_state_dict(
+            torch.load(model, map_location=lambda storage, loc: storage))
diff --git a/modelscope/models/cv/face_detection/ulfd_slim/vision/transforms.py b/modelscope/models/cv/face_detection/ulfd_slim/vision/transforms.py
new file mode 100644
index 00000000..7c5331f1
--- /dev/null
+++ b/modelscope/models/cv/face_detection/ulfd_slim/vision/transforms.py
@@ -0,0 +1,56 @@
+# The implementation is based on ULFD, available at
+# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB
+import types
+
+import cv2
+import numpy as np
+import torch
+from numpy import random
+
+
+class Compose(object):
+    """Composes several augmentations together.
+    Args:
+        transforms (List[Transform]): list of transforms to compose.
+    Example:
+        >>> augmentations.Compose([
+        >>>     transforms.CenterCrop(10),
+        >>>     transforms.ToTensor(),
+        >>> ])
+    """
+
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, img, boxes=None, labels=None):
+        for t in self.transforms:
+            img, boxes, labels = t(img, boxes, labels)
+        return img, boxes, labels
+
+
+class SubtractMeans(object):
+
+    def __init__(self, mean):
+        self.mean = np.array(mean, dtype=np.float32)
+
+    def __call__(self, image, boxes=None, labels=None):
+        image = image.astype(np.float32)
+        image -= self.mean
+        return image.astype(np.float32), boxes, labels
+
+
+class Resize(object):
+
+    def __init__(self, size=(300, 300)):
+        self.size = size
+
+    def __call__(self, image, boxes=None, labels=None):
+        image = cv2.resize(image, (self.size[0], self.size[1]))
+        return image, boxes, labels
+
+
+class ToTensor(object):
+
+    def __call__(self, cvimage, boxes=None, labels=None):
+        return torch.from_numpy(cvimage.astype(np.float32)).permute(
+            2, 0, 1), boxes, labels
diff --git a/modelscope/models/cv/face_emotion/__init__.py b/modelscope/models/cv/face_emotion/__init__.py
new file mode 100644
index 00000000..2a13ea42
--- /dev/null
+++ b/modelscope/models/cv/face_emotion/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .emotion_model import EfficientNetForFaceEmotion
+
+else:
+    _import_structure = {'emotion_model': ['EfficientNetForFaceEmotion']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/face_emotion/efficient/__init__.py b/modelscope/models/cv/face_emotion/efficient/__init__.py
new file mode 100644
index 00000000..e8fc91a4
--- /dev/null
+++ b/modelscope/models/cv/face_emotion/efficient/__init__.py
@@ -0,0 +1,6 @@
+# The implementation here is modified based on EfficientNet,
+# originally Apache 2.0 License and publicly avaialbe at https://github.com/lukemelas/EfficientNet-PyTorch
+
+from .model import VALID_MODELS, EfficientNet
+from .utils import (BlockArgs, BlockDecoder, GlobalParams, efficientnet,
+                    get_model_params)
diff --git a/modelscope/models/cv/face_emotion/efficient/model.py b/modelscope/models/cv/face_emotion/efficient/model.py
new file mode 100644
index 00000000..db303016
--- /dev/null
+++ b/modelscope/models/cv/face_emotion/efficient/model.py
@@ -0,0 +1,380 @@
+# The implementation here is modified based on EfficientNet,
+# originally Apache 2.0 License and publicly available at https://github.com/lukemelas/EfficientNet-PyTorch
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from .utils import (MemoryEfficientSwish, Swish, calculate_output_image_size,
+                    drop_connect, efficientnet_params, get_model_params,
+                    get_same_padding_conv2d, load_pretrained_weights,
+                    round_filters, round_repeats)
+
+VALID_MODELS = ('efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2',
+                'efficientnet-b3', 'efficientnet-b4', 'efficientnet-b5',
+                'efficientnet-b6', 'efficientnet-b7', 'efficientnet-b8',
+                'efficientnet-l2')  # names accepted by EfficientNet.from_name
+
+
+class MBConvBlock(nn.Module):
+
+    def __init__(self, block_args, global_params, image_size=None):
+        super().__init__()
+        self._block_args = block_args
+        self._bn_mom = 1 - global_params.batch_norm_momentum
+        self._bn_eps = global_params.batch_norm_epsilon
+        self.has_se = (self._block_args.se_ratio
+                       is not None) and (0 < self._block_args.se_ratio <= 1)
+        self.id_skip = block_args.id_skip
+
+        inp = self._block_args.input_filters  # expansion phase (1x1 conv)
+        oup = self._block_args.input_filters * self._block_args.expand_ratio
+        if self._block_args.expand_ratio != 1:
+            Conv2d = get_same_padding_conv2d(image_size=image_size)
+            self._expand_conv = Conv2d(
+                in_channels=inp, out_channels=oup, kernel_size=1, bias=False)
+            self._bn0 = nn.BatchNorm2d(
+                num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
+
+        k = self._block_args.kernel_size  # depthwise convolution phase
+        s = self._block_args.stride
+        Conv2d = get_same_padding_conv2d(image_size=image_size)
+        self._depthwise_conv = Conv2d(
+            in_channels=oup,
+            out_channels=oup,
+            groups=oup,
+            kernel_size=k,
+            stride=s,
+            bias=False)
+        self._bn1 = nn.BatchNorm2d(
+            num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
+        image_size = calculate_output_image_size(image_size, s)  # after stride
+
+        if self.has_se:  # squeeze-and-excitation 1x1 convs
+            Conv2d = get_same_padding_conv2d(image_size=(1, 1))
+            num_squeezed_channels = max(  # from input_filters, not oup
+                1,
+                int(self._block_args.input_filters
+                    * self._block_args.se_ratio))
+            self._se_reduce = Conv2d(
+                in_channels=oup,
+                out_channels=num_squeezed_channels,
+                kernel_size=1)
+            self._se_expand = Conv2d(
+                in_channels=num_squeezed_channels,
+                out_channels=oup,
+                kernel_size=1)
+
+        final_oup = self._block_args.output_filters  # 1x1 projection phase
+        Conv2d = get_same_padding_conv2d(image_size=image_size)
+        self._project_conv = Conv2d(
+            in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False)
+        self._bn2 = nn.BatchNorm2d(
+            num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps)
+        self._swish = MemoryEfficientSwish()
+
+    def forward(self, inputs, drop_connect_rate=None):
+        """MBConvBlock's forward function.
+           Order: expansion -> depthwise -> squeeze-excite -> projection.
+        Args:
+            inputs (tensor): Input tensor.
+            drop_connect_rate (float): Drop connect rate, between 0 and 1;
+                a falsy value (None or 0) disables drop connect.
+        Returns:
+            Output of this block after processing.
+        """
+        args = self._block_args
+
+        out = inputs
+        if args.expand_ratio != 1:
+            out = self._swish(self._bn0(self._expand_conv(inputs)))
+
+        out = self._swish(self._bn1(self._depthwise_conv(out)))
+
+        if self.has_se:
+            gate = F.adaptive_avg_pool2d(out, 1)
+            gate = self._swish(self._se_reduce(gate))
+            gate = self._se_expand(gate)
+            out = torch.sigmoid(gate) * out
+
+        out = self._bn2(self._project_conv(out))
+
+        # The residual path is used only if id_skip is set, stride == 1 and
+        # the block keeps the number of filters.
+        same_filters = args.input_filters == args.output_filters
+        if not (self.id_skip and args.stride == 1 and same_filters):
+            return out
+        # Drop connect acts on the block branch only, before the addition.
+        if drop_connect_rate:
+            out = drop_connect(
+                out, p=drop_connect_rate, training=self.training)
+        return out + inputs
+
+    def set_swish(self, memory_efficient=True):
+        """Pick the swish variant: memory efficient (training) or standard (export).
+        Args:
+            memory_efficient (bool): Use MemoryEfficientSwish if true, else Swish.
+        """
+        self._swish = (MemoryEfficientSwish if memory_efficient else Swish)()
+
+
+class EfficientNet(nn.Module):
+    """EfficientNet model.
+       Most easily loaded with the .from_name or .from_pretrained methods.
+    Args:
+        blocks_args (list[namedtuple]): A list of BlockArgs to construct blocks.
+        global_params (namedtuple): A set of GlobalParams shared between blocks.
+    References:
+        [1] https://arxiv.org/abs/1905.11946 (EfficientNet)
+    Example:
+        >>> import torch
+        >>> from efficientnet.model import EfficientNet
+        >>> inputs = torch.rand(1, 3, 224, 224)
+        >>> model = EfficientNet.from_pretrained('efficientnet-b0')
+        >>> model.eval()
+        >>> outputs = model(inputs)
+    """
+
+    def __init__(self, blocks_args=None, global_params=None):
+        super().__init__()
+        assert isinstance(blocks_args, list), 'blocks_args should be a list'
+        assert len(blocks_args) > 0, 'block args must be greater than 0'
+        self._global_params = global_params
+        self._blocks_args = blocks_args
+
+        bn_mom = 1 - self._global_params.batch_norm_momentum
+        bn_eps = self._global_params.batch_norm_epsilon
+        image_size = global_params.image_size
+        Conv2d = get_same_padding_conv2d(image_size=image_size)
+
+        in_channels = 3  # stem: stride-2 3x3 conv on 3-channel input
+        out_channels = round_filters(32, self._global_params)
+        self._conv_stem = Conv2d(
+            in_channels, out_channels, kernel_size=3, stride=2, bias=False)
+        self._bn0 = nn.BatchNorm2d(
+            num_features=out_channels, momentum=bn_mom, eps=bn_eps)
+        image_size = calculate_output_image_size(image_size, 2)
+
+        self._blocks = nn.ModuleList([])  # MBConv blocks, in network order
+        for block_args in self._blocks_args:
+
+            block_args = block_args._replace(  # apply width/depth scaling
+                input_filters=round_filters(block_args.input_filters,
+                                            self._global_params),
+                output_filters=round_filters(block_args.output_filters,
+                                             self._global_params),
+                num_repeat=round_repeats(block_args.num_repeat,
+                                         self._global_params))
+
+            self._blocks.append(  # first block applies stride/filter change
+                MBConvBlock(
+                    block_args, self._global_params, image_size=image_size))
+            image_size = calculate_output_image_size(image_size,
+                                                     block_args.stride)
+            if block_args.num_repeat > 1:  # repeats: stride 1, in == out
+                block_args = block_args._replace(
+                    input_filters=block_args.output_filters, stride=1)
+            for _ in range(block_args.num_repeat - 1):
+                self._blocks.append(
+                    MBConvBlock(
+                        block_args, self._global_params,
+                        image_size=image_size))
+
+        in_channels = block_args.output_filters  # head: 1x1 conv
+        out_channels = round_filters(1280, self._global_params)
+        Conv2d = get_same_padding_conv2d(image_size=image_size)
+        self._conv_head = Conv2d(
+            in_channels, out_channels, kernel_size=1, bias=False)
+        self._bn1 = nn.BatchNorm2d(
+            num_features=out_channels, momentum=bn_mom, eps=bn_eps)
+
+        self._avg_pooling = nn.AdaptiveAvgPool2d(1)  # global average pool
+        if self._global_params.include_top:  # optional classifier head
+            self._dropout = nn.Dropout(self._global_params.dropout_rate)
+            self._fc = nn.Linear(out_channels, self._global_params.num_classes)
+
+        self._swish = MemoryEfficientSwish()
+
+    def set_swish(self, memory_efficient=True):
+        """Switch the model and every one of its blocks to the chosen swish variant.
+        Args:
+            memory_efficient (bool): Use MemoryEfficientSwish if true, else Swish.
+        """
+        self._swish = (MemoryEfficientSwish if memory_efficient else Swish)()
+        for mbconv in self._blocks:
+            mbconv.set_swish(memory_efficient)
+
+    def extract_endpoints(self, inputs):
+        """Collect the feature maps found at each spatial reduction,
+        followed by the output of the head convolution.
+        Args:
+            inputs (tensor): Input tensor.
+        Returns:
+            Dictionary keyed 'reduction_<i>', in network order: the features
+            just before each downsampling and finally the head features.
+            Example:
+                >>> import torch
+                >>> from efficientnet.model import EfficientNet
+                >>> inputs = torch.rand(1, 3, 224, 224)
+                >>> model = EfficientNet.from_pretrained('efficientnet-b0')
+                >>> endpoints = model.extract_endpoints(inputs)
+                >>> print(endpoints['reduction_1'].shape)  # torch.Size([1, 16, 112, 112])
+                >>> print(endpoints['reduction_2'].shape)  # torch.Size([1, 24, 56, 56])
+                >>> print(endpoints['reduction_3'].shape)  # torch.Size([1, 40, 28, 28])
+                >>> print(endpoints['reduction_4'].shape)  # torch.Size([1, 112, 14, 14])
+                >>> print(endpoints['reduction_5'].shape)  # torch.Size([1, 320, 7, 7])
+                >>> print(endpoints['reduction_6'].shape)  # torch.Size([1, 1280, 7, 7])
+        """
+        endpoints = {}
+        num_blocks = len(self._blocks)
+        base_rate = self._global_params.drop_connect_rate
+
+        feat = self._swish(self._bn0(self._conv_stem(inputs)))
+        for idx, block in enumerate(self._blocks):
+            rate = base_rate
+            if rate:
+                rate *= float(idx) / num_blocks  # scale drop connect_rate
+            prev, feat = feat, block(feat, drop_connect_rate=rate)
+            key = 'reduction_{}'.format(len(endpoints) + 1)
+            if prev.size(2) > feat.size(2):
+                endpoints[key] = prev  # height shrank: keep pre-stride map
+            elif idx == num_blocks - 1:
+                endpoints[key] = feat  # output of the final block
+
+        # The head features are always recorded as the last entry.
+        feat = self._swish(self._bn1(self._conv_head(feat)))
+        endpoints['reduction_{}'.format(len(endpoints) + 1)] = feat
+
+        return endpoints
+
+    def extract_features(self, inputs):
+        """Run the stem, every MBConv block and the head convolution.
+        Args:
+            inputs (tensor): Input tensor.
+        Returns:
+            Output of the final convolution
+            layer in the efficientnet model.
+        """
+        num_blocks = len(self._blocks)
+        feat = self._swish(self._bn0(self._conv_stem(inputs)))
+        for idx, block in enumerate(self._blocks):
+            rate = self._global_params.drop_connect_rate
+            if rate:
+                # later blocks get a proportionally larger drop rate
+                rate *= float(idx) / num_blocks
+            feat = block(feat, drop_connect_rate=rate)
+        # head: conv + batch norm + swish
+        return self._swish(self._bn1(self._conv_head(feat)))
+
+    def forward(self, inputs):
+        """EfficientNet's forward function.
+           Extracts features and pools them; with include_top the result is
+           flattened and passed through dropout and the final linear layer.
+        Args:
+            inputs (tensor): Input tensor.
+        Returns:
+            Output of this model after processing.
+        """
+        pooled = self._avg_pooling(self.extract_features(inputs))
+        if not self._global_params.include_top:
+            return pooled
+        # classifier head
+        flat = pooled.flatten(start_dim=1)
+        return self._fc(self._dropout(flat))
+
+    @classmethod
+    def from_name(cls, model_name, in_channels=3, **override_params):
+        """Validate the name and build the matching efficientnet model.
+        Args:
+            model_name (str): Name for efficientnet.
+            in_channels (int): Input data's channel number.
+            override_params (other key word params):
+                Params to override model's global_params.
+                Optional key:
+                    'width_coefficient', 'depth_coefficient',
+                    'image_size', 'dropout_rate',
+                    'num_classes', 'batch_norm_momentum',
+                    'batch_norm_epsilon', 'drop_connect_rate',
+                    'depth_divisor', 'min_depth'
+        Returns:
+            An efficientnet model.
+        """
+        cls._check_model_name_is_valid(model_name)
+        params = get_model_params(model_name, override_params)
+        blocks_args, global_params = params
+        net = cls(blocks_args, global_params)
+        net._change_in_channels(in_channels)
+        return net
+
+    @classmethod
+    def from_pretrained(cls,
+                        model_name,
+                        weights_path=None,
+                        advprop=False,
+                        in_channels=3,
+                        num_classes=1000,
+                        **override_params):
+        """Create an efficientnet model according to name.
+        NOTE: this implementation does not load any weights. ``weights_path``
+        and ``advprop`` are accepted but never read; the model is simply the
+        one built by ``from_name``.
+        Args:
+            model_name (str): Name for efficientnet.
+            weights_path (None or str):
+                Unused; kept in the signature only.
+            advprop (bool): Unused; kept in the signature only.
+            in_channels (int): Input data's channel number.
+            num_classes (int):
+                Number of categories for classification.
+                It controls the output size for final linear layer.
+            override_params (other key word params):
+                Params to override model's global_params.
+                Optional key:
+                    'width_coefficient', 'depth_coefficient',
+                    'image_size', 'dropout_rate',
+                    'batch_norm_momentum',
+                    'batch_norm_epsilon', 'drop_connect_rate',
+                    'depth_divisor', 'min_depth'
+        Returns:
+            An efficientnet model; loading weights is left to the caller.
+        """
+        model = cls.from_name(
+            model_name, num_classes=num_classes, **override_params)
+        model._change_in_channels(in_channels)
+        return model
+
+    @classmethod
+    def get_image_size(cls, model_name):
+        """Look up the input resolution registered for an efficientnet name.
+        Args:
+            model_name (str): Name for efficientnet.
+        Returns:
+            Input image size (resolution).
+        """
+        cls._check_model_name_is_valid(model_name)
+        _width, _depth, resolution, _dropout = efficientnet_params(model_name)
+        return resolution
+
+    @classmethod
+    def _check_model_name_is_valid(cls, model_name):
+        """Validates model name.
+        Args:
+            model_name (str): Name for efficientnet.
+        Raises:
+            ValueError: If model_name is not one of VALID_MODELS.
+        """
+        if model_name not in VALID_MODELS:
+            valid_names = ', '.join(VALID_MODELS)
+            raise ValueError('model_name should be one of: ' + valid_names)
+
+    def _change_in_channels(self, in_channels):
+        """Replace the stem convolution when in_channels differs from 3.
+        Args:
+            in_channels (int): Input data's channel number.
+        """
+        if in_channels != 3:
+            params = self._global_params
+            conv_cls = get_same_padding_conv2d(image_size=params.image_size)
+            out_filters = round_filters(32, params)
+            self._conv_stem = conv_cls(
+                in_channels, out_filters, kernel_size=3, stride=2, bias=False)
diff --git a/modelscope/models/cv/face_emotion/efficient/utils.py b/modelscope/models/cv/face_emotion/efficient/utils.py
new file mode 100644
index 00000000..6cae70fc
--- /dev/null
+++ b/modelscope/models/cv/face_emotion/efficient/utils.py
@@ -0,0 +1,559 @@
+# The implementation here is modified based on EfficientNet,
+# originally Apache 2.0 License and publicly available at https://github.com/lukemelas/EfficientNet-PyTorch
+
+import collections
+import math
+import re
+from functools import partial
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.utils import model_zoo
+
+GlobalParams = collections.namedtuple('GlobalParams', [
+    'width_coefficient', 'depth_coefficient', 'image_size', 'dropout_rate',
+    'num_classes', 'batch_norm_momentum', 'batch_norm_epsilon',
+    'drop_connect_rate', 'depth_divisor', 'min_depth', 'include_top'
+])
+
+BlockArgs = collections.namedtuple('BlockArgs', [
+    'num_repeat', 'kernel_size', 'stride', 'expand_ratio', 'input_filters',
+    'output_filters', 'se_ratio', 'id_skip'
+])
+# Every field defaults to None, so instances can be built from any subset.
+GlobalParams.__new__.__defaults__ = (None, ) * len(GlobalParams._fields)
+BlockArgs.__new__.__defaults__ = (None, ) * len(BlockArgs._fields)
+
+if hasattr(nn, 'SiLU'):  # use the built-in module when torch provides it
+    Swish = nn.SiLU
+else:  # fallback definition for torch without nn.SiLU
+
+    class Swish(nn.Module):
+
+        def forward(self, x):
+            return x * torch.sigmoid(x)
+
+
+class SwishImplementation(torch.autograd.Function):
+
+    @staticmethod
+    def forward(ctx, i):
+        ctx.save_for_backward(i)  # only the input is kept for backward
+        return i * torch.sigmoid(i)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        (saved, ) = ctx.saved_tensors
+        sig = torch.sigmoid(saved)
+        # d/dx [x * sigmoid(x)] = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
+        return grad_output * (sig * (1 + saved * (1 - sig)))
+
+
+class MemoryEfficientSwish(nn.Module):
+
+    def forward(self, x):
+        return SwishImplementation.apply(x)  # custom autograd swish
+
+
+def round_filters(filters, global_params):
+    """Scale a filter count by the width multiplier and round it.
+       Use width_coefficient, depth_divisor and min_depth of global_params.
+    Args:
+        filters (int): Filters number to be calculated.
+        global_params (namedtuple): Global params of the model.
+    Returns:
+        new_filters: New filters number after calculating.
+    """
+    width = global_params.width_coefficient
+    if not width:
+        return filters
+
+    divisor = global_params.depth_divisor
+    floor_depth = global_params.min_depth or divisor
+    scaled = filters * width
+    # Round to the nearest multiple of divisor, but never below floor_depth.
+    rounded = max(floor_depth, int(scaled + divisor / 2) // divisor * divisor)
+    # Rounding down must not remove more than 10% of the scaled value.
+    if rounded < 0.9 * scaled:
+        rounded += divisor
+    return int(rounded)
+
+
+def round_repeats(repeats, global_params):
+    """Scale a block's repeat count by the depth multiplier, rounding up.
+       Use depth_coefficient of global_params.
+    Args:
+        repeats (int): num_repeat to be calculated.
+        global_params (namedtuple): Global params of the model.
+    Returns:
+        new repeat: New repeat number after calculating.
+    """
+    depth = global_params.depth_coefficient
+    if depth:
+        return int(math.ceil(depth * repeats))
+    return repeats
+
+
+def drop_connect(inputs, p, training):
+    """Drop connect.
+    Args:
+        inputs (tensor: BCWH): Input of this structure.
+        p (float: 0.0~1.0): Probability of drop connection.
+        training (bool): The running mode.
+    Returns:
+        output: Output after drop connection (all zeros when p == 1).
+    """
+    assert 0 <= p <= 1, 'p must be in range of [0,1]'
+
+    if not training:
+        return inputs
+
+    keep_prob = 1 - p
+    if keep_prob == 0:
+        # p == 1 drops every sample; dividing by keep_prob would yield NaN.
+        return torch.zeros_like(inputs)
+
+    # One keep/drop draw per sample, broadcast over the remaining dims.
+    random_tensor = keep_prob + torch.rand([inputs.shape[0], 1, 1, 1],
+                                           dtype=inputs.dtype,
+                                           device=inputs.device)
+    binary_tensor = torch.floor(random_tensor)
+    return inputs / keep_prob * binary_tensor
+
+
+def get_width_and_height_from_size(x):
+    """Normalise a size given as an int or a pair into a pair.
+    Args:
+        x (int, tuple or list): Data size.
+    Returns:
+        size: A tuple or list (H,W).
+    """
+    if isinstance(x, int):
+        return x, x
+    if isinstance(x, (list, tuple)):
+        return x
+    # anything else is rejected
+    raise TypeError()
+
+
+def calculate_output_image_size(input_image_size, stride):
+    """Calculates the output image size when using Conv2dSamePadding with a stride.
+       Necessary for static padding. Thanks to mannatsingh for pointing this out.
+    Args:
+        input_image_size (int, tuple or list): Size of input image.
+        stride (int, tuple or list): Conv2d operation's stride.
+    Returns:
+        output_image_size: A list [H,W].
+    """
+    if input_image_size is None:
+        return None
+
+    height, width = get_width_and_height_from_size(input_image_size)
+    # A sequence stride contributes only its first element.
+    step = stride if isinstance(stride, int) else stride[0]
+    # Each side becomes ceil(side / stride).
+    return [int(math.ceil(side / step)) for side in (height, width)]
+
+
+def get_same_padding_conv2d(image_size=None):
+    """Chooses static padding if you have specified an image size, and dynamic padding otherwise.
+       Static padding is necessary for ONNX exporting of models.
+    Args:
+        image_size (int or tuple): Size of the image.
+    Returns:
+        Conv2dDynamicSamePadding or Conv2dStaticSamePadding.
+    """
+    if image_size is not None:
+        # bind the known size so callers use the usual Conv2d arguments
+        return partial(Conv2dStaticSamePadding, image_size=image_size)
+    return Conv2dDynamicSamePadding
+
+
+class Conv2dDynamicSamePadding(nn.Conv2d):
+    """Conv2d with TensorFlow-style 'SAME' padding for arbitrary input sizes.
+       The padding amount is recomputed from the input in every forward call.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 dilation=1,
+                 groups=1,
+                 bias=True):
+        # padding is fixed to 0 here; forward() pads the input explicitly
+        super().__init__(in_channels, out_channels, kernel_size, stride, 0,
+                         dilation, groups, bias)
+        if len(self.stride) != 2:
+            self.stride = [self.stride[0]] * 2
+
+    def forward(self, x):
+        in_h, in_w = x.size()[-2:]
+        k_h, k_w = self.weight.size()[-2:]
+        s_h, s_w = self.stride
+        d_h, d_w = self.dilation[0], self.dilation[1]
+        out_h, out_w = math.ceil(in_h / s_h), math.ceil(in_w / s_w)
+        # Total padding needed per axis so the output is ceil(input / stride).
+        pad_h = max((out_h - 1) * s_h + (k_h - 1) * d_h + 1 - in_h, 0)
+        pad_w = max((out_w - 1) * s_w + (k_w - 1) * d_w + 1 - in_w, 0)
+        if pad_h > 0 or pad_w > 0:
+            top, left = pad_h // 2, pad_w // 2
+            x = F.pad(x, [left, pad_w - left, top, pad_h - top])
+        return F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
+                        self.dilation, self.groups)
+
+
+class Conv2dStaticSamePadding(nn.Conv2d):
+    """Conv2d with TensorFlow-style 'SAME' padding for a known input image size.
+       The padding module is built once in the constructor and reused in forward.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 image_size=None,
+                 **kwargs):
+        super().__init__(in_channels, out_channels, kernel_size, stride,
+                         **kwargs)
+        if len(self.stride) != 2:
+            self.stride = [self.stride[0]] * 2
+
+        assert image_size is not None
+        if isinstance(image_size, int):
+            image_size = (image_size, image_size)
+        in_h, in_w = image_size
+        k_h, k_w = self.weight.size()[-2:]
+        s_h, s_w = self.stride
+        d_h, d_w = self.dilation[0], self.dilation[1]
+        out_h, out_w = math.ceil(in_h / s_h), math.ceil(in_w / s_w)
+        pad_h = max((out_h - 1) * s_h + (k_h - 1) * d_h + 1 - in_h, 0)
+        pad_w = max((out_w - 1) * s_w + (k_w - 1) * d_w + 1 - in_w, 0)
+        if pad_h > 0 or pad_w > 0:
+            top, left = pad_h // 2, pad_w // 2
+            self.static_padding = nn.ZeroPad2d(
+                (left, pad_w - left, top, pad_h - top))
+        else:
+            self.static_padding = nn.Identity()
+
+    def forward(self, x):
+        # pad with the module chosen at construction time, then convolve
+        padded = self.static_padding(x)
+        return F.conv2d(padded, self.weight, self.bias, self.stride,
+                        self.padding, self.dilation, self.groups)
+
+
+def get_same_padding_maxPool2d(image_size=None):
+    """Chooses static padding if you have specified an image size, and dynamic padding otherwise.
+       Static padding is necessary for ONNX exporting of models.
+    Args:
+        image_size (int or tuple): Size of the image.
+    Returns:
+        MaxPool2dDynamicSamePadding or MaxPool2dStaticSamePadding.
+    """
+    if image_size is not None:
+        # bind the known size so callers use the usual MaxPool2d arguments
+        return partial(MaxPool2dStaticSamePadding, image_size=image_size)
+    return MaxPool2dDynamicSamePadding
+
+
+class MaxPool2dDynamicSamePadding(nn.MaxPool2d):
+    """MaxPool2d with TensorFlow-style 'SAME' padding for arbitrary input sizes.
+       The padding amount is recomputed from the input in every forward call.
+    """
+
+    def __init__(self,
+                 kernel_size,
+                 stride,
+                 padding=0,
+                 dilation=1,
+                 return_indices=False,
+                 ceil_mode=False):
+        super().__init__(kernel_size, stride, padding, dilation,
+                         return_indices, ceil_mode)
+        if isinstance(self.stride, int):
+            self.stride = [self.stride] * 2
+        if isinstance(self.kernel_size, int):
+            self.kernel_size = [self.kernel_size] * 2
+        if isinstance(self.dilation, int):
+            self.dilation = [self.dilation] * 2
+
+    def forward(self, x):
+        in_h, in_w = x.size()[-2:]
+        k_h, k_w = self.kernel_size
+        s_h, s_w = self.stride
+        d_h, d_w = self.dilation[0], self.dilation[1]
+        out_h, out_w = math.ceil(in_h / s_h), math.ceil(in_w / s_w)
+        # Total padding needed per axis so the output is ceil(input / stride).
+        pad_h = max((out_h - 1) * s_h + (k_h - 1) * d_h + 1 - in_h, 0)
+        pad_w = max((out_w - 1) * s_w + (k_w - 1) * d_w + 1 - in_w, 0)
+        if pad_h > 0 or pad_w > 0:
+            # Odd padding puts the extra row/column on the bottom/right.
+            top, left = pad_h // 2, pad_w // 2
+            x = F.pad(x, [left, pad_w - left, top, pad_h - top])
+        return F.max_pool2d(x, self.kernel_size, self.stride, self.padding,
+                            self.dilation, self.ceil_mode, self.return_indices)
+
+
+class MaxPool2dStaticSamePadding(nn.MaxPool2d):
+    """MaxPool2d with TensorFlow-style 'SAME' padding for a known input image size.
+       The padding module is built once in the constructor and reused in forward.
+    """
+
+    def __init__(self, kernel_size, stride, image_size=None, **kwargs):
+        super().__init__(kernel_size, stride, **kwargs)
+        if isinstance(self.stride, int):
+            self.stride = [self.stride] * 2
+        if isinstance(self.kernel_size, int):
+            self.kernel_size = [self.kernel_size] * 2
+        if isinstance(self.dilation, int):
+            self.dilation = [self.dilation] * 2
+
+        assert image_size is not None
+        if isinstance(image_size, int):
+            image_size = (image_size, image_size)
+        in_h, in_w = image_size
+        k_h, k_w = self.kernel_size
+        s_h, s_w = self.stride
+        d_h, d_w = self.dilation[0], self.dilation[1]
+        out_h, out_w = math.ceil(in_h / s_h), math.ceil(in_w / s_w)
+        pad_h = max((out_h - 1) * s_h + (k_h - 1) * d_h + 1 - in_h, 0)
+        pad_w = max((out_w - 1) * s_w + (k_w - 1) * d_w + 1 - in_w, 0)
+        if pad_h > 0 or pad_w > 0:
+            top, left = pad_h // 2, pad_w // 2
+            self.static_padding = nn.ZeroPad2d(
+                (left, pad_w - left, top, pad_h - top))
+        else:
+            self.static_padding = nn.Identity()
+
+    def forward(self, x):
+        # pad with the module chosen at construction time, then pool
+        padded = self.static_padding(x)
+        return F.max_pool2d(padded, self.kernel_size, self.stride, self.padding,
+                            self.dilation, self.ceil_mode, self.return_indices)
+
+
+class BlockDecoder(object):
+    """Block Decoder for readability,
+       straight from the official TensorFlow repository.
+    """
+
+    @staticmethod
+    def _decode_block_string(block_string):
+        """Get a block through a string notation of arguments.
+        Args:
+            block_string (str): A string notation of arguments.
+                                Examples: 'r1_k3_s11_e1_i32_o16_se0.25_noskip'.
+        Returns:
+            BlockArgs: The namedtuple defined at the top of this file.
+        """
+        assert isinstance(block_string, str)
+
+        options = {}
+        for op in block_string.split('_'):
+            splits = re.split(r'(\d.*)', op)
+            if len(splits) >= 2:
+                key, value = splits[:2]
+                options[key] = value
+
+        # Check stride: one digit, or two equal digits; absent fails here too.
+        stride = options.get('s', '')
+        assert (len(stride) == 1
+                or (len(stride) == 2 and stride[0] == stride[1])), (
+                    'bad stride in block string %r' % block_string)
+
+        return BlockArgs(
+            num_repeat=int(options['r']),
+            kernel_size=int(options['k']),
+            stride=[int(stride[0])],
+            expand_ratio=int(options['e']),
+            input_filters=int(options['i']),
+            output_filters=int(options['o']),
+            se_ratio=float(options['se']) if 'se' in options else None,
+            id_skip=('noskip' not in block_string))
+
+    @staticmethod
+    def _encode_block_string(block):
+        """Encode a block to a string.
+        Args:
+            block (namedtuple): A BlockArgs type argument.
+        Returns:
+            block_string: A String form of BlockArgs.
+        """
+        # BlockArgs.stride is an int or a sequence of one or two ints; the
+        # notation always carries two stride digits.
+        stride = block.stride
+        stride = [stride] if isinstance(stride, int) else list(stride)
+        args = [
+            'r%d' % block.num_repeat, 'k%d' % block.kernel_size,
+            's%d%d' % (stride[0], stride[-1]), 'e%s' % block.expand_ratio,
+            'i%d' % block.input_filters, 'o%d' % block.output_filters
+        ]
+        if block.se_ratio is not None and 0 < block.se_ratio <= 1:
+            args.append('se%s' % block.se_ratio)
+        if block.id_skip is False:
+            args.append('noskip')
+        return '_'.join(args)
+
+    @staticmethod
+    def decode(string_list):
+        """Decode a list of string notations to specify blocks inside the network.
+        Args:
+            string_list (list[str]): A list of strings, each string is a notation of block.
+        Returns:
+            blocks_args: A list of BlockArgs namedtuples of block args.
+        """
+        assert isinstance(string_list, list)
+
+        # Each notation is parsed independently; input order is preserved.
+        decoder = BlockDecoder._decode_block_string
+        return [decoder(block_string) for block_string in string_list]
+
+    @staticmethod
+    def encode(blocks_args):
+        """Encode a list of BlockArgs to a list of strings.
+        Args:
+            blocks_args (list[namedtuples]): A list of BlockArgs namedtuples of block args.
+        Returns:
+            block_strings: A list of strings, each string is a notation of block.
+        """
+        # Each BlockArgs is serialised independently; input order is preserved.
+        encoder = BlockDecoder._encode_block_string
+        block_strings = [encoder(block) for block in blocks_args]
+        return block_strings
+
+
+def efficientnet_params(model_name):
+    """Look up the scaling coefficients of an EfficientNet variant.
+
+    Args:
+        model_name (str): Variant name, e.g. ``'efficientnet-b0'``.
+
+    Returns:
+        tuple: ``(width, depth, res, dropout)`` for that variant.
+
+    Raises:
+        KeyError: If ``model_name`` is not one of the listed variants.
+    """
+    # (name, (width, depth, res, dropout)) rows.
+    coefficient_table = (
+        ('efficientnet-b0', (1.0, 1.0, 112, 0.2)),
+        ('efficientnet-b1', (1.0, 1.1, 240, 0.2)),
+        ('efficientnet-b2', (1.1, 1.2, 260, 0.3)),
+        ('efficientnet-b3', (1.2, 1.4, 300, 0.3)),
+        ('efficientnet-b4', (1.4, 1.8, 380, 0.4)),
+        ('efficientnet-b5', (1.6, 2.2, 456, 0.4)),
+        ('efficientnet-b6', (1.8, 2.6, 528, 0.5)),
+        ('efficientnet-b7', (2.0, 3.1, 600, 0.5)),
+        ('efficientnet-b8', (2.2, 3.6, 672, 0.5)),
+        ('efficientnet-l2', (4.3, 5.3, 800, 0.5)),
+    )
+    return dict(coefficient_table)[model_name]
+
+
+def efficientnet(width_coefficient=None,
+                 depth_coefficient=None,
+                 image_size=None,
+                 dropout_rate=0.2,
+                 drop_connect_rate=0.2,
+                 num_classes=1000,
+                 include_top=True):
+    """Build the block arguments and global parameters of an EfficientNet.
+
+    Args:
+        width_coefficient (float)
+        depth_coefficient (float)
+        image_size (int)
+        dropout_rate (float)
+        drop_connect_rate (float)
+        num_classes (int)
+        include_top (bool)
+        Each value is stored unchanged in the field of the same name on the
+        returned GlobalParams.
+
+    Returns:
+        blocks_args, global_params.
+    """
+    # One notation string per stage, decoded by BlockDecoder.
+    stage_notations = [
+        'r1_k3_s11_e1_i32_o16_se0.25',
+        'r2_k3_s22_e6_i16_o24_se0.25',
+        'r2_k5_s22_e6_i24_o40_se0.25',
+        'r3_k3_s22_e6_i40_o80_se0.25',
+        'r3_k5_s11_e6_i80_o112_se0.25',
+        'r4_k5_s22_e6_i112_o192_se0.25',
+        'r1_k3_s11_e6_i192_o320_se0.25',
+    ]
+    decoded_blocks = BlockDecoder.decode(stage_notations)
+
+    # Caller-supplied settings first, then the fixed ones.
+    settings = dict(
+        width_coefficient=width_coefficient,
+        depth_coefficient=depth_coefficient,
+        image_size=image_size,
+        dropout_rate=dropout_rate,
+        num_classes=num_classes,
+        drop_connect_rate=drop_connect_rate,
+        include_top=include_top,
+    )
+    settings.update(
+        batch_norm_momentum=0.99,
+        batch_norm_epsilon=1e-3,
+        depth_divisor=8,
+        min_depth=None,
+    )
+    return decoded_blocks, GlobalParams(**settings)
+
+
+def get_model_params(model_name, override_params):
+    """Resolve block args and global params for a named model.
+
+    Args:
+        model_name (str): Model's name; must start with ``'efficientnet'``.
+        override_params (dict): Fields of global_params to replace. A falsy
+            value leaves the defaults untouched.
+
+    Returns:
+        blocks_args, global_params
+
+    Raises:
+        NotImplementedError: If ``model_name`` is not an efficientnet name.
+    """
+    if not model_name.startswith('efficientnet'):
+        raise NotImplementedError(
+            'model name is not pre-defined: {}'.format(model_name))
+
+    width, depth, resolution, dropout = efficientnet_params(model_name)
+    blocks_args, global_params = efficientnet(
+        width_coefficient=width,
+        depth_coefficient=depth,
+        dropout_rate=dropout,
+        image_size=resolution)
+
+    if override_params:
+        global_params = global_params._replace(**override_params)
+    return blocks_args, global_params
+
+
+def load_pretrained_weights(model,
+                            model_name,
+                            weights_path=None,
+                            load_fc=True,
+                            advprop=False,
+                            verbose=True):
+    """Loads pretrained weights from weights path or download using url.
+
+    Args:
+        model (Module): The whole model of efficientnet.
+        model_name (str): Model name of efficientnet.
+        weights_path (None or str):
+            str: path to pretrained weights file on the local disk.
+            None: use pretrained weights downloaded from the Internet.
+        load_fc (bool): Whether to load pretrained weights for fc layer at the end of the model.
+        advprop (bool): Whether to load pretrained weights
+                        trained with advprop (valid when weights_path is None).
+        verbose (bool): Whether to print a message once the weights are loaded.
+
+    Raises:
+        AssertionError: If keys are missing from, or unexpected in, the
+            loaded state dict.
+    """
+    if isinstance(weights_path, str):
+        state_dict = torch.load(weights_path)
+    else:
+        url_map_ = url_map_advprop if advprop else url_map
+        state_dict = model_zoo.load_url(url_map_[model_name])
+
+    if load_fc:
+        ret = model.load_state_dict(state_dict, strict=False)
+        assert not ret.missing_keys, 'Missing keys when loading pretrained weights: {}'.format(
+            ret.missing_keys)
+    else:
+        # Drop the classifier weights so only the two fc entries may be missing.
+        state_dict.pop('_fc.weight')
+        state_dict.pop('_fc.bias')
+        ret = model.load_state_dict(state_dict, strict=False)
+        assert set(ret.missing_keys) == set([
+            '_fc.weight', '_fc.bias'
+        ]), 'Missing keys when loading pretrained weights: {}'.format(
+            ret.missing_keys)
+    # This check is about keys the model does not know, so say so.
+    assert not ret.unexpected_keys, 'Unexpected keys when loading pretrained weights: {}'.format(
+        ret.unexpected_keys)
+
+    if verbose:
+        print('Loaded pretrained weights for {}'.format(model_name))
diff --git a/modelscope/models/cv/face_emotion/emotion_infer.py b/modelscope/models/cv/face_emotion/emotion_infer.py
new file mode 100644
index 00000000..e3398592
--- /dev/null
+++ b/modelscope/models/cv/face_emotion/emotion_infer.py
@@ -0,0 +1,67 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import torch
+from PIL import Image
+from torch import nn
+from torchvision import transforms
+
+from modelscope.utils.logger import get_logger
+from .face_alignment.face_align import face_detection_PIL_v2
+
+logger = get_logger()
+
+
+def transform_PIL(img_pil):
+    """Convert a PIL image to a tensor and normalise it per channel.
+
+    Args:
+        img_pil: Image accepted by ``transforms.ToTensor``.
+
+    Returns:
+        The normalised tensor.
+    """
+    to_tensor = transforms.ToTensor()
+    normalize = transforms.Normalize(
+        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+    return normalize(to_tensor(img_pil))
+
+
+# AU number associated with each position of the model's AU output
+# (presumably facial action unit ids - confirm against the training labels).
+index2AU = [1, 2, 4, 6, 7, 10, 12, 15, 23, 24, 25, 26]
+# Emotion label for each index of the model's emotion output.
+emotion_list = [
+    'Neutral', 'Anger', 'Disgust', 'Fear', 'Happiness', 'Sadness', 'Surprise'
+]
+
+
+def inference(image_path, model, face_model, score_thre=0.5, GPU=0):
+    """Predict the emotion of the face detected in an image.
+
+    Args:
+        image_path: Image location passed to ``PIL.Image.open``.
+        model: Callable returning ``(logits_AU, logits_emotion)`` for a batch
+            of face crops.
+        face_model: Forwarded unchanged to ``face_detection_PIL_v2``.
+        score_thre (float): Minimum softmax probability required to report a
+            non-neutral emotion.
+        GPU (int): CUDA device index the face tensor is moved to when CUDA is
+            available.
+
+    Returns:
+        ``(emotion, bbox)`` when a face is found. When the detector reports no
+        box, the dict ``{'emotion_result': None, 'box': None}`` is returned.
+    """
+    image = Image.open(image_path).convert('RGB')
+
+    face, bbox = face_detection_PIL_v2(image, face_model)
+    if bbox is None:
+        logger.warning('no face detected!')
+        # NOTE(review): this path returns a dict while the normal path
+        # returns a tuple; the shape is kept for existing callers.
+        result = {'emotion_result': None, 'box': None}
+        return result
+
+    face = transform_PIL(face)
+    face = face.unsqueeze(0)
+    if torch.cuda.is_available():
+        face = face.cuda(GPU)
+    # Pure inference: the forward pass does not need an autograd graph.
+    with torch.no_grad():
+        # The AU logits are not part of the returned result, so they are
+        # not post-processed here.
+        _, logits_emotion = model(face)
+        logits_emotion = nn.functional.softmax(logits_emotion, 1)
+
+    _, index_list = logits_emotion.max(1)
+    emotion_index = index_list[0].item()
+    prob = logits_emotion[0][emotion_index]
+    # Index 3 ('Fear' in emotion_list) is never reported; like any
+    # low-confidence prediction it falls back to 'Neutral'.
+    if prob > score_thre and emotion_index != 3:
+        cur_emotion = emotion_list[emotion_index]
+    else:
+        cur_emotion = 'Neutral'
+
+    result = (cur_emotion, bbox)
+    return result
diff --git a/modelscope/models/cv/face_emotion/emotion_model.py b/modelscope/models/cv/face_emotion/emotion_model.py
new file mode 100644
index 00000000..f8df9c37
--- /dev/null
+++ b/modelscope/models/cv/face_emotion/emotion_model.py
@@ -0,0 +1,96 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import os
+import sys
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.face_emotion.efficient import EfficientNet
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@MODELS.register_module(Tasks.face_emotion, module_name=Models.face_emotion)
+class EfficientNetForFaceEmotion(TorchModel):
+    """Face emotion model wrapper that loads its weights from ``model_dir``."""
+
+    def __init__(self, model_dir, device_id=0, *args, **kwargs):
+        """Build the network and load the checkpoint stored in ``model_dir``.
+
+        Args:
+            model_dir (str): Directory holding ``ModelFile.TORCH_MODEL_BIN_FILE``.
+            device_id (int): Forwarded to the base class.
+        """
+        super().__init__(
+            model_dir=model_dir, device_id=device_id, *args, **kwargs)
+        self.model = FaceEmotionModel(
+            name='efficientnet-b0', num_embed=512, num_au=12, num_emotion=7)
+
+        cuda_ready = torch.cuda.is_available()
+        self.device = 'cuda' if cuda_ready else 'cpu'
+        logger.info('Use GPU' if cuda_ready else 'Use CPU')
+
+        checkpoint_file = '{}/{}'.format(model_dir,
+                                         ModelFile.TORCH_MODEL_BIN_FILE)
+        checkpoint = torch.load(checkpoint_file, map_location=self.device)
+
+        # Strip a leading 'module.' from parameter names (such a prefix is
+        # added e.g. by parallel wrappers) so they match the bare model.
+        prefix = 'module.'
+        weights = {
+            (name[len(prefix):] if name.startswith(prefix) else name): tensor
+            for name, tensor in checkpoint['model'].items()
+        }
+
+        self.model.load_state_dict(weights)
+        self.model.eval()
+        self.model.to(self.device)
+
+    def forward(self, x):
+        """Return ``(logits_au, logits_emotion)`` for the input batch ``x``."""
+        au_logits, emotion_logits = self.model(x)
+        return au_logits, emotion_logits
+
+
+class FaceEmotionModel(nn.Module):
+    """EfficientNet backbone with an AU head and an emotion head.
+
+    Backbone features are pooled, projected to ``num_embed`` dimensions and
+    batch-normalised. ``fc_au`` produces the AU logits from that embedding,
+    and ``fc_emotion`` produces the emotion logits from an AU-weighted sum of
+    it (see ``forward``).
+
+    Args:
+        name (str): Variant name passed to ``EfficientNet.from_pretrained``.
+        num_embed (int): Size of the shared embedding.
+        num_au (int): Number of AU logits.
+        num_emotion (int): Number of emotion logits.
+    """
+
+    def __init__(self,
+                 name='efficientnet-b0',
+                 num_embed=512,
+                 num_au=12,
+                 num_emotion=7):
+        super(FaceEmotionModel, self).__init__()
+        # NOTE(review): weights_path=None presumably makes from_pretrained
+        # fetch the advprop weights from the Internet - confirm.
+        self.backbone = EfficientNet.from_pretrained(
+            name, weights_path=None, advprop=True)
+        self.average_pool = nn.AdaptiveAvgPool2d(1)
+        # Input width of the projection is taken from the backbone's own fc.
+        self.embed = nn.Linear(self.backbone._fc.weight.data.shape[1],
+                               num_embed)
+        self.features = nn.BatchNorm1d(num_embed)
+        # The batch-norm scale is fixed at 1.0 and excluded from training.
+        nn.init.constant_(self.features.weight, 1.0)
+        self.features.weight.requires_grad = False
+        self.fc_au = nn.Sequential(
+            nn.Dropout(0.6),
+            nn.Linear(num_embed, num_au),
+        )
+        self.fc_emotion = nn.Sequential(
+            nn.Dropout(0.6),
+            nn.Linear(num_embed, num_emotion),
+        )
+
+    def feat_single_img(self, x):
+        """Return the normalised ``num_embed``-dimensional embedding of ``x``."""
+        x = self.backbone.extract_features(x)
+        x = self.average_pool(x)
+        x = x.flatten(1)
+        x = self.embed(x)
+        x = self.features(x)
+        return x
+
+    def forward(self, x):
+        """Compute AU and emotion logits for a batch of images.
+
+        Returns:
+            ``(logits_au, logits_emotion)``.
+        """
+        x = self.feat_single_img(x)
+        logits_au = self.fc_au(x)
+        # (N, num_au, 1) attention weights times the (N, 1, num_embed)
+        # embedding gives one weighted copy of the embedding per AU.
+        att_au = torch.sigmoid(logits_au).unsqueeze(-1)
+        x = x.unsqueeze(1)
+        emotion_vec_list = torch.matmul(att_au, x)
+        # Sum over the AU axis back to (N, num_embed).
+        emotion_vec = emotion_vec_list.sum(1)
+        logits_emotion = self.fc_emotion(emotion_vec)
+        return logits_au, logits_emotion
diff --git a/modelscope/models/cv/face_emotion/face_alignment/__init__.py b/modelscope/models/cv/face_emotion/face_alignment/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/face_emotion/face_alignment/face.py b/modelscope/models/cv/face_emotion/face_alignment/face.py
new file mode 100644
index 00000000..a362bddc
--- /dev/null
+++ b/modelscope/models/cv/face_emotion/face_alignment/face.py
@@ -0,0 +1,79 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import os
+
+import cv2
+import numpy as np
+import tensorflow as tf
+
+
+def init(mod):
+    """Load a serialized TensorFlow graph and open a session on it.
+
+    Args:
+        mod: Path of the serialized GraphDef file.
+
+    Returns:
+        ``(sess, net)``: the session and the graph it runs.
+    """
+    graph = tf.Graph()
+    with graph.as_default():
+        graph_def = tf.GraphDef()
+        session_config = tf.ConfigProto()
+        session_config.gpu_options.per_process_gpu_memory_fraction = 0.6
+        with tf.gfile.GFile(mod, 'rb') as model_file:
+            graph_def.ParseFromString(model_file.read())
+        tf.import_graph_def(graph_def, name='')
+        session = tf.Session(graph=graph, config=session_config)
+    return session, graph
+
+
+def filter_bboxes_confs(shape,
+                        imgsBboxes,
+                        imgsConfs,
+                        single=False,
+                        thresh=0.5):
+    """Keep boxes scoring at least ``thresh`` and scale them to pixels.
+
+    A fractional box ``[a, b, c, d]`` is returned as
+    ``[int(h * b), int(w * a), int(h * d), int(w * c)]`` where
+    ``[w, h] = shape``.
+
+    Args:
+        shape: Two-element ``[w, h]`` scale.
+        imgsBboxes: Boxes of one image (``single=True``) or a sequence of
+            per-image box sequences.
+        imgsConfs: Confidences laid out like ``imgsBboxes``.
+        single (bool): Whether the inputs describe a single image.
+        thresh (float): Minimum confidence to keep a box.
+
+    Returns:
+        ``(bboxes, confs)`` for a single image, otherwise a pair of lists
+        holding one such result per image.
+    """
+    [w, h] = shape
+
+    def _filter_one(boxes, scores):
+        # Filter and rescale the detections of one image.
+        kept_boxes, kept_scores = [], []
+        for idx in range(len(boxes)):
+            if scores[idx] >= thresh:
+                [a, b, c, d] = list(boxes[idx])
+                a, b, c, d = int(w * a), int(h * b), int(w * c), int(h * d)
+                kept_boxes.append([b, a, d, c])
+                kept_scores.append(scores[idx])
+        return kept_boxes, kept_scores
+
+    if single:
+        return _filter_one(imgsBboxes, imgsConfs)
+
+    all_boxes, all_scores = [], []
+    for img_idx in range(len(imgsBboxes)):
+        boxes, scores = _filter_one(imgsBboxes[img_idx], imgsConfs[img_idx])
+        all_boxes.append(boxes)
+        all_scores.append(scores)
+    return all_boxes, all_scores
+
+
+def detect(im, sess, net):
+    """Run the detection graph on one image and filter its boxes.
+
+    Args:
+        im: Image array with three dimensions.
+        sess: Session used to run the graph.
+        net: Graph providing the ``image_tensor`` / ``detection_*`` tensors.
+
+    Returns:
+        ``(bboxes, confs)`` as produced by ``filter_bboxes_confs`` in
+        single-image mode with its default threshold.
+    """
+    converted = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
+    batch = np.expand_dims(converted, axis=0)
+    input_tensor = net.get_tensor_by_name('image_tensor:0')
+    output_names = ('detection_boxes:0', 'detection_scores:0',
+                    'detection_classes:0', 'num_detections:0')
+    fetches = [net.get_tensor_by_name(name) for name in output_names]
+    # Classes and detection count are fetched but not used below.
+    boxes, scores, _classes, _count = sess.run(
+        fetches, feed_dict={input_tensor: batch})
+    dim0, dim1, _ = im.shape
+    return filter_bboxes_confs([dim0, dim1], boxes[0], scores[0], True)
+
+
+class FaceDetector:
+    """Holds the session and graph created by ``init`` for a model file."""
+
+    def __init__(self, mod):
+        session, graph = init(mod)
+        self.sess = session
+        self.net = graph
+
+    def do_detect(self, im):
+        """Return ``(bboxes, confs)`` detected in ``im``."""
+        boxes, scores = detect(im, self.sess, self.net)
+        return boxes, scores
diff --git a/modelscope/models/cv/face_emotion/face_alignment/face_align.py b/modelscope/models/cv/face_emotion/face_alignment/face_align.py
new file mode 100644
index 00000000..71282b12
--- /dev/null
+++ b/modelscope/models/cv/face_emotion/face_alignment/face_align.py
@@ -0,0 +1,59 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import os
+import sys
+
+import cv2
+import numpy as np
+from PIL import Image, ImageFile
+
+from .face import FaceDetector
+
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+
+def adjust_bx_v2(box, w, h):
+    """Grow a box towards a square while keeping it inside a w x h image.
+
+    A box wider than tall is extended vertically (upwards first). Otherwise
+    it is extended horizontally, evenly on both sides when there is room.
+
+    Args:
+        box: Sequence ``[x1, y1, x2, y2]``.
+        w: Image width.
+        h: Image height.
+
+    Returns:
+        list[int]: The adjusted ``[x1, y1, x2, y2]``.
+    """
+    x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
+    box_w = x2 - x1
+    box_h = y2 - y1
+    delta = abs(box_w - box_h)
+    if box_w > box_h:
+        if y1 >= delta:
+            y1 = y1 - delta
+        else:
+            # Not enough room above: use what is there, push the rest below.
+            delta_y1 = y1
+            y1 = 0
+            delta_y2 = delta - delta_y1
+            y2 = y2 + delta_y2 if y2 < h - delta_y2 else h - 1
+    else:
+        half = delta / 2
+        room_left = x1 >= half
+        room_right = x2 <= w - half
+        if room_left and room_right:
+            x1 = x1 - half
+            x2 = x2 + half
+        elif room_right:
+            # Too close to the left edge: the remainder goes to the right.
+            delta_x1 = x1
+            x1 = 0
+            delta_x2 = delta - delta_x1
+            x2 = x2 + delta_x2 if x2 < w - delta_x2 else w - 1
+        elif room_left:
+            # Too close to the right edge: the remainder goes to the left.
+            # The remainder is delta minus what the right side absorbed
+            # (previously computed from x1, which could shrink the box).
+            delta_x2 = w - x2
+            x2 = w - 1
+            delta_x1 = delta - delta_x2
+            x1 = x1 - delta_x1 if x1 >= delta_x1 else 0
+        else:
+            # No room on either side: use the full image width.
+            x1 = 0
+            x2 = w - 1
+
+    x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
+    return [x1, y1, x2, y2]
+
+
+def face_detection_PIL_v2(image, face_model):
+    """Detect a face in an image and return its crop resized to 112 x 112.
+
+    Args:
+        image: PIL image (converted with ``np.array``).
+        face_model: Value passed to ``FaceDetector`` to build the detector.
+
+    Returns:
+        ``(img, bx)``: the cropped PIL image and the adjusted box as a tuple
+        ``(x1, y1, x2, y2)``. ``(None, None)`` when no face is detected.
+    """
+    crop_size = 112
+    # NOTE(review): a new FaceDetector is built on every call; consider
+    # reusing one per model if this is called repeatedly.
+    face_detector = FaceDetector(face_model)
+    img = np.array(image)
+    h, w = img.shape[0:2]
+    bxs, conf = face_detector.do_detect(img)
+    if len(bxs) == 0:
+        # Nothing passed the detector threshold; callers check for None.
+        return None, None
+    # Only the first returned box is used.
+    bx = bxs[0]
+    bx = adjust_bx_v2(bx, w, h)
+    x1, y1, x2, y2 = bx
+    image = img[y1:y2, x1:x2, :]
+    img = Image.fromarray(image)
+    img = img.resize((crop_size, crop_size))
+    bx = tuple(bx)
+    return img, bx
diff --git a/modelscope/models/cv/face_human_hand_detection/__init__.py b/modelscope/models/cv/face_human_hand_detection/__init__.py
new file mode 100644
index 00000000..33a5fd2f
--- /dev/null
+++ b/modelscope/models/cv/face_human_hand_detection/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    # Seen by static type checkers only; TYPE_CHECKING is False at runtime.
+    from .det_infer import NanoDetForFaceHumanHandDetection
+
+else:
+    # Submodule name -> public names it provides.
+    _import_structure = {'det_infer': ['NanoDetForFaceHumanHandDetection']}
+
+    import sys
+
+    # Replace this package's entry in sys.modules with a LazyImportModule
+    # built from the mapping above (presumably so that det_infer is only
+    # imported on first attribute access - confirm in import_utils).
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/face_human_hand_detection/det_infer.py b/modelscope/models/cv/face_human_hand_detection/det_infer.py
new file mode 100644
index 00000000..7a7225ee
--- /dev/null
+++ b/modelscope/models/cv/face_human_hand_detection/det_infer.py
@@ -0,0 +1,133 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+import cv2
+import numpy as np
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .one_stage_detector import OneStageDetector
+
+logger = get_logger()
+
+
+def load_model_weight(model_dir, device):
+    """Load the checkpoint's state dict, renaming ``avg_model.*`` entries.
+
+    Every key starting with ``'avg_model.'`` is re-inserted with its first
+    four characters removed (``'avg_model.x'`` becomes ``'model.x'``).
+
+    Args:
+        model_dir (str): Directory holding ``ModelFile.TORCH_MODEL_BIN_FILE``.
+        device: ``map_location`` used by ``torch.load``.
+
+    Returns:
+        The adjusted copy of ``checkpoint['state_dict']``.
+    """
+    checkpoint_file = '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
+    checkpoint = torch.load(checkpoint_file, map_location=device)
+    weights = checkpoint['state_dict'].copy()
+    # Iterate over a snapshot of the original keys while editing the copy.
+    for key in list(checkpoint['state_dict']):
+        if not key.startswith('avg_model.'):
+            continue
+        tensor = weights.pop(key)
+        weights[key[4:]] = tensor
+
+    return weights
+
+
+@MODELS.register_module(
+    Tasks.face_human_hand_detection,
+    module_name=Models.face_human_hand_detection)
+class NanoDetForFaceHumanHandDetection(TorchModel):
+    """Detector wrapper that loads its weights from ``model_dir``."""
+
+    def __init__(self, model_dir, device_id=0, *args, **kwargs):
+        """Build the detector and load the checkpoint stored in ``model_dir``.
+
+        Args:
+            model_dir (str): Directory holding the checkpoint file.
+            device_id (int): Forwarded to the base class.
+        """
+
+        super().__init__(
+            model_dir=model_dir, device_id=device_id, *args, **kwargs)
+
+        self.model = OneStageDetector()
+        if torch.cuda.is_available():
+            self.device = 'cuda'
+            logger.info('Use GPU ')
+        else:
+            self.device = 'cpu'
+            logger.info('Use CPU')
+
+        # Keep the loaded weights in a local: storing them as
+        # ``self.state_dict`` would shadow the ``state_dict()`` method that
+        # torch modules expose (assuming TorchModel is a torch.nn.Module).
+        weights = load_model_weight(model_dir, self.device)
+        # strict=False: keys that do not match the detector are ignored.
+        self.model.load_state_dict(weights, strict=False)
+        self.model.eval()
+        self.model.to(self.device)
+
+    def forward(self, x):
+        """Return the result of the wrapped detector's ``inference(x)``."""
+        pred_result = self.model.inference(x)
+        return pred_result
+
+
+def naive_collate(batch):
+    """Collate a batch of (possibly nested) dicts key by key.
+
+    If the first element is a dict, the result is a dict with the same keys
+    whose values are the recursively collated per-sample values. Any other
+    batch is returned unchanged.
+    """
+    first = batch[0]
+    if not isinstance(first, dict):
+        return batch
+    collated = {}
+    for key in first:
+        collated[key] = naive_collate([sample[key] for sample in batch])
+    return collated
+
+
+def get_resize_matrix(raw_shape, dst_shape):
+    """Return the 3x3 scaling matrix mapping ``raw_shape`` to ``dst_shape``.
+
+    Args:
+        raw_shape: ``(width, height)`` of the source.
+        dst_shape: ``(width, height)`` of the destination.
+
+    Returns:
+        A 3x3 array with ``d_w / r_w`` and ``d_h / r_h`` on the diagonal.
+    """
+    src_w, src_h = raw_shape
+    dst_w, dst_h = dst_shape
+    scale_x = dst_w / src_w
+    scale_y = dst_h / src_h
+    return np.diag([scale_x, scale_y, 1.0])
+
+
+def color_aug_and_norm(meta, mean, std):
+    """Normalise ``meta['img']`` in place with per-channel mean and std.
+
+    The image, ``mean`` and ``std`` are all divided by 255 before the image
+    is standardised.
+
+    Args:
+        meta (dict): Must contain an ``'img'`` array with 3 channels last.
+        mean: Three per-channel means on the 0-255 scale.
+        std: Three per-channel standard deviations on the 0-255 scale.
+
+    Returns:
+        The same ``meta`` dict, with ``'img'`` replaced by the float32 result.
+    """
+    scale = 255
+    pixels = meta['img'].astype(np.float32) / scale
+    channel_mean = np.array(mean, dtype=np.float32).reshape(1, 1, 3) / scale
+    channel_std = np.array(std, dtype=np.float32).reshape(1, 1, 3) / scale
+    meta['img'] = (pixels - channel_mean) / channel_std
+    return meta
+
+
+def img_process(meta, mean, std):
+    """Resize ``meta['img']`` to 320 x 320 and normalise it.
+
+    Args:
+        meta (dict): Must contain the image under ``'img'``.
+        mean: Per-channel means forwarded to ``color_aug_and_norm``.
+        std: Per-channel stds forwarded to ``color_aug_and_norm``.
+
+    Returns:
+        ``meta`` with ``'img'`` replaced by the processed image and
+        ``'warp_matrix'`` set to the 3x3 matrix used for the resize.
+    """
+    source = meta['img']
+    src_h, src_w = source.shape[0], source.shape[1]
+    target = [320, 320]
+    identity = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])
+    warp = get_resize_matrix((src_w, src_h), target) @ identity
+    meta['img'] = cv2.warpPerspective(source, warp, dsize=tuple(target))
+    meta['warp_matrix'] = warp
+    return color_aug_and_norm(meta, mean, std)
+
+
+def overlay_bbox_cv(dets, class_names, score_thresh):
+    """Flatten per-label detections into a score-sorted list.
+
+    Args:
+        dets: Mapping from label to boxes; each box holds four coordinates
+            followed by a score as its last element.
+        class_names: Unused.
+        score_thresh: Boxes whose score is not above this value are dropped.
+
+    Returns:
+        list: ``[label, x0, y0, x1, y1, score]`` rows in ascending score order.
+    """
+
+    def _rows():
+        for label in dets:
+            for det in dets[label]:
+                if det[-1] > score_thresh:
+                    left, top, right, bottom = [int(c) for c in det[:4]]
+                    yield [label, left, top, right, bottom, det[-1]]
+
+    return sorted(_rows(), key=lambda row: row[5])
+
+
+# Per-channel normalisation constants on the 0-255 scale; color_aug_and_norm
+# divides them by 255 (presumably BGR order, matching cv2.imread - confirm).
+mean = [103.53, 116.28, 123.675]
+std = [57.375, 57.12, 58.395]
+# Class names passed to overlay_bbox_cv by inference.
+class_names = ['person', 'face', 'hand']
+
+
+def inference(model, device, img_path):
+    """Run the detector on an image file and return the kept boxes.
+
+    Args:
+        model: Callable taking the ``meta`` dict and returning a sequence
+            whose first item maps labels to scored boxes.
+        device: Device the image tensor is moved to.
+        img_path (str): Path of the image read with ``cv2.imread``.
+
+    Returns:
+        list: ``[label, x0, y0, x1, y1, score]`` rows with score above 0.35,
+        in ascending score order.
+
+    Raises:
+        ValueError: If the image cannot be read.
+    """
+    img_info = {'id': 0}
+    img = cv2.imread(img_path)
+    if img is None:
+        # cv2.imread signals a missing or unreadable file by returning None.
+        raise ValueError('Failed to read image: {}'.format(img_path))
+    height, width = img.shape[:2]
+    img_info['height'] = height
+    img_info['width'] = width
+    meta = dict(img_info=img_info, raw_img=img, img=img)
+
+    meta = img_process(meta, mean, std)
+    # HWC -> CHW before moving to the target device.
+    meta['img'] = torch.from_numpy(meta['img'].transpose(2, 0, 1)).to(device)
+    meta = naive_collate([meta])
+    # naive_collate wrapped every value in a list; unwrap the image and add
+    # the batch dimension.
+    meta['img'] = (meta['img'][0]).reshape(1, 3, 320, 320)
+    with torch.no_grad():
+        res = model(meta)
+    result = overlay_bbox_cv(res[0], class_names, score_thresh=0.35)
+    return result
diff --git a/modelscope/models/cv/face_human_hand_detection/ghost_pan.py b/modelscope/models/cv/face_human_hand_detection/ghost_pan.py
new file mode 100644
index 00000000..e00de407
--- /dev/null
+++ b/modelscope/models/cv/face_human_hand_detection/ghost_pan.py
@@ -0,0 +1,395 @@
+# The implementation here is modified based on nanodet,
+# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet
+
+import math
+
+import torch
+import torch.nn as nn
+
+from .utils import ConvModule, DepthwiseConvModule, act_layers
+
+
+def _make_divisible(v, divisor, min_value=None):
+    """Round ``v`` to a multiple of ``divisor``, never below ``min_value``.
+
+    ``min_value`` defaults to ``divisor``. If rounding would lose more than
+    10% of ``v``, one extra ``divisor`` is added.
+    """
+    lower_bound = divisor if min_value is None else min_value
+    rounded = int(v + divisor / 2) // divisor * divisor
+    result = max(lower_bound, rounded)
+    # Make sure that round down does not go down by more than 10%.
+    if result < 0.9 * v:
+        result += divisor
+    return result
+
+
+def hard_sigmoid(x, inplace: bool = False):
+    """Piecewise-linear sigmoid: ``relu6(x + 3) / 6``.
+
+    Args:
+        x (Tensor): Input tensor.
+        inplace (bool): If True, ``x`` itself is modified and returned.
+
+    Returns:
+        Tensor: Values in ``[0, 1]``.
+    """
+    if inplace:
+        return x.add_(3.0).clamp_(0.0, 6.0).div_(6.0)
+    else:
+        # This module imports ``torch.nn as nn`` but not the functional
+        # alias ``F``, so go through ``nn.functional``.
+        return nn.functional.relu6(x + 3.0) / 6.0
+
+
+class SqueezeExcite(nn.Module):
+    """Channel gating: pool, reduce, activate, expand, then scale the input.
+
+    Args:
+        in_chs (int): Number of input (and output) channels.
+        se_ratio (float): Ratio used to size the reduced channel count.
+        reduced_base_chs (int, optional): Base channel count for the
+            reduction; ``in_chs`` is used when it is falsy.
+        activation (str): Activation name passed to ``act_layers``.
+        gate_fn (callable): Maps the expanded tensor to the gate values.
+        divisor (int): The reduced channel count is made divisible by this.
+        **_: Extra keyword arguments are accepted and ignored.
+    """
+
+    def __init__(self,
+                 in_chs,
+                 se_ratio=0.25,
+                 reduced_base_chs=None,
+                 activation='ReLU',
+                 gate_fn=hard_sigmoid,
+                 divisor=4,
+                 **_):
+        super(SqueezeExcite, self).__init__()
+        self.gate_fn = gate_fn
+        reduced_chs = _make_divisible((reduced_base_chs or in_chs) * se_ratio,
+                                      divisor)
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True)
+        self.act1 = act_layers(activation)
+        self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True)
+
+    def forward(self, x):
+        """Return ``x`` scaled per channel by the computed gate."""
+        x_se = self.avg_pool(x)
+        x_se = self.conv_reduce(x_se)
+        x_se = self.act1(x_se)
+        x_se = self.conv_expand(x_se)
+        x = x * self.gate_fn(x_se)
+        return x
+
+
+class GhostModule(nn.Module):
+    """Primary convolution plus a cheap depthwise branch, concatenated.
+
+    Args:
+        inp (int): Number of input channels.
+        oup (int): Requested number of output channels.
+        kernel_size (int): Kernel size of the primary convolution.
+        ratio (int): The primary convolution produces ``ceil(oup / ratio)``
+            channels; the cheap branch produces ``ratio - 1`` times as many.
+        dw_size (int): Kernel size of the cheap (grouped) convolution.
+        stride (int): Stride of the primary convolution.
+        activation (str or None): Activation name for ``act_layers``; a
+            falsy value disables the activation.
+    """
+
+    def __init__(self,
+                 inp,
+                 oup,
+                 kernel_size=1,
+                 ratio=2,
+                 dw_size=3,
+                 stride=1,
+                 activation='ReLU'):
+        super(GhostModule, self).__init__()
+        self.oup = oup
+        init_channels = math.ceil(oup / ratio)
+        new_channels = init_channels * (ratio - 1)
+
+        self.primary_conv = nn.Sequential(
+            nn.Conv2d(
+                inp,
+                init_channels,
+                kernel_size,
+                stride,
+                kernel_size // 2,
+                bias=False),
+            nn.BatchNorm2d(init_channels),
+            act_layers(activation) if activation else nn.Sequential(),
+        )
+
+        # One group per primary channel, i.e. a depthwise-style convolution.
+        self.cheap_operation = nn.Sequential(
+            nn.Conv2d(
+                init_channels,
+                new_channels,
+                dw_size,
+                1,
+                dw_size // 2,
+                groups=init_channels,
+                bias=False,
+            ),
+            nn.BatchNorm2d(new_channels),
+            act_layers(activation) if activation else nn.Sequential(),
+        )
+
+    def forward(self, x):
+        """Return primary and cheap features concatenated on the channel axis.
+
+        The result has ``init_channels + new_channels`` channels; it is not
+        sliced to ``self.oup``.
+        """
+        x1 = self.primary_conv(x)
+        x2 = self.cheap_operation(x1)
+        out = torch.cat([x1, x2], dim=1)
+        return out
+
+
+class GhostBottleneck(nn.Module):
+    """Ghost bottleneck w/ optional SE
+
+    Args:
+        in_chs (int): Number of input channels.
+        mid_chs (int): Number of channels after the first GhostModule.
+        out_chs (int): Number of output channels.
+        dw_kernel_size (int): Kernel size of the depthwise convolutions.
+        stride (int): Stride; values above 1 add a strided depthwise stage.
+        activation (str): Activation name used by the first GhostModule.
+        se_ratio (float or None): A positive value enables SqueezeExcite.
+    """
+
+    def __init__(
+        self,
+        in_chs,
+        mid_chs,
+        out_chs,
+        dw_kernel_size=3,
+        stride=1,
+        activation='ReLU',
+        se_ratio=0.0,
+    ):
+        super(GhostBottleneck, self).__init__()
+        has_se = se_ratio is not None and se_ratio > 0.0
+        self.stride = stride
+
+        # Point-wise expansion
+        self.ghost1 = GhostModule(in_chs, mid_chs, activation=activation)
+
+        # Depth-wise convolution
+        if self.stride > 1:
+            self.conv_dw = nn.Conv2d(
+                mid_chs,
+                mid_chs,
+                dw_kernel_size,
+                stride=stride,
+                padding=(dw_kernel_size - 1) // 2,
+                groups=mid_chs,
+                bias=False,
+            )
+            self.bn_dw = nn.BatchNorm2d(mid_chs)
+
+        if has_se:
+            self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio)
+        else:
+            self.se = None
+
+        # Projection back to out_chs, without activation.
+        self.ghost2 = GhostModule(mid_chs, out_chs, activation=None)
+
+        # Identity shortcut when shapes match; otherwise a depthwise conv
+        # followed by a 1x1 conv brings the input to the output shape.
+        if in_chs == out_chs and self.stride == 1:
+            self.shortcut = nn.Sequential()
+        else:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(
+                    in_chs,
+                    in_chs,
+                    dw_kernel_size,
+                    stride=stride,
+                    padding=(dw_kernel_size - 1) // 2,
+                    groups=in_chs,
+                    bias=False,
+                ),
+                nn.BatchNorm2d(in_chs),
+                nn.Conv2d(in_chs, out_chs, 1, stride=1, padding=0, bias=False),
+                nn.BatchNorm2d(out_chs),
+            )
+
+    def forward(self, x):
+        """Apply the bottleneck and add the shortcut branch."""
+        residual = x
+
+        x = self.ghost1(x)
+
+        if self.stride > 1:
+            x = self.conv_dw(x)
+            x = self.bn_dw(x)
+
+        if self.se is not None:
+            x = self.se(x)
+
+        x = self.ghost2(x)
+
+        x += self.shortcut(residual)
+        return x
+
+
+class GhostBlocks(nn.Module):
+    """Stack of GhostBottleneck used in GhostPAN.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        expand (int): Expand ratio of GhostBottleneck. Default: 1.
+        kernel_size (int): Kernel size of depthwise convolution. Default: 5.
+        num_blocks (int): Number of GhostBottleneck blocks. Default: 1.
+        use_res (bool): Whether to use residual connection. Default: False.
+        activation (str): Name of activation function. Default: LeakyReLU.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        expand=1,
+        kernel_size=5,
+        num_blocks=1,
+        use_res=False,
+        activation='LeakyReLU',
+    ):
+        super(GhostBlocks, self).__init__()
+        self.use_res = use_res
+        if use_res:
+            # 1x1 conv that brings the input to out_channels for the residual.
+            self.reduce_conv = ConvModule(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                activation=activation,
+            )
+        blocks = []
+        # NOTE(review): every block is built with in_channels as its input,
+        # so num_blocks > 1 looks like it needs in_channels == out_channels.
+        for _ in range(num_blocks):
+            blocks.append(
+                GhostBottleneck(
+                    in_channels,
+                    int(out_channels * expand),
+                    out_channels,
+                    dw_kernel_size=kernel_size,
+                    activation=activation,
+                ))
+        self.blocks = nn.Sequential(*blocks)
+
+    def forward(self, x):
+        """Run the stacked blocks, adding the reduced input if ``use_res``."""
+        out = self.blocks(x)
+        if self.use_res:
+            out = out + self.reduce_conv(x)
+        return out
+
+
+class GhostPAN(nn.Module):
+    """Path Aggregation Network with Ghost block.
+
+    Args:
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale)
+        use_depthwise (bool): Whether to use DepthwiseConvModule instead of
+            ConvModule for the downsample and extra-level convolutions.
+            Default: False
+        kernel_size (int): Kernel size of depthwise convolution. Default: 5.
+        expand (int): Expand ratio of GhostBottleneck. Default: 1.
+        num_blocks (int): Number of GhostBottleneck blocks. Default: 1.
+        use_res (bool): Whether to use residual connection. Default: False.
+        num_extra_level (int): Number of extra conv layers for more feature levels.
+            Default: 0.
+        upsample_cfg (dict): Config dict for interpolate layer.
+            Default: `dict(scale_factor=2, mode='bilinear')`
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN')
+        activation (str): Activation layer name.
+            Default: LeakyReLU.
+    """
+
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            use_depthwise=False,
+            kernel_size=5,
+            expand=1,
+            num_blocks=1,
+            use_res=False,
+            num_extra_level=0,
+            upsample_cfg=dict(scale_factor=2, mode='bilinear'),
+            norm_cfg=dict(type='BN'),
+            activation='LeakyReLU',
+    ):
+        super(GhostPAN, self).__init__()
+        assert num_extra_level >= 0
+        assert num_blocks >= 1
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        conv = DepthwiseConvModule if use_depthwise else ConvModule
+
+        # build top-down blocks
+        self.upsample = nn.Upsample(**upsample_cfg)
+        # One 1x1 conv per input scale, mapping it to out_channels.
+        self.reduce_layers = nn.ModuleList()
+        for idx in range(len(in_channels)):
+            self.reduce_layers.append(
+                ConvModule(
+                    in_channels[idx],
+                    out_channels,
+                    1,
+                    norm_cfg=norm_cfg,
+                    activation=activation,
+                ))
+        self.top_down_blocks = nn.ModuleList()
+        for idx in range(len(in_channels) - 1, 0, -1):
+            # Input is the concatenation of two out_channels-wide features.
+            self.top_down_blocks.append(
+                GhostBlocks(
+                    out_channels * 2,
+                    out_channels,
+                    expand,
+                    kernel_size=kernel_size,
+                    num_blocks=num_blocks,
+                    use_res=use_res,
+                    activation=activation,
+                ))
+
+        # build bottom-up blocks
+        self.downsamples = nn.ModuleList()
+        self.bottom_up_blocks = nn.ModuleList()
+        for idx in range(len(in_channels) - 1):
+            self.downsamples.append(
+                conv(
+                    out_channels,
+                    out_channels,
+                    kernel_size,
+                    stride=2,
+                    padding=kernel_size // 2,
+                    norm_cfg=norm_cfg,
+                    activation=activation,
+                ))
+            self.bottom_up_blocks.append(
+                GhostBlocks(
+                    out_channels * 2,
+                    out_channels,
+                    expand,
+                    kernel_size=kernel_size,
+                    num_blocks=num_blocks,
+                    use_res=use_res,
+                    activation=activation,
+                ))
+
+        # extra layers
+        self.extra_lvl_in_conv = nn.ModuleList()
+        self.extra_lvl_out_conv = nn.ModuleList()
+        for i in range(num_extra_level):
+            self.extra_lvl_in_conv.append(
+                conv(
+                    out_channels,
+                    out_channels,
+                    kernel_size,
+                    stride=2,
+                    padding=kernel_size // 2,
+                    norm_cfg=norm_cfg,
+                    activation=activation,
+                ))
+            self.extra_lvl_out_conv.append(
+                conv(
+                    out_channels,
+                    out_channels,
+                    kernel_size,
+                    stride=2,
+                    padding=kernel_size // 2,
+                    norm_cfg=norm_cfg,
+                    activation=activation,
+                ))
+
+    def forward(self, inputs):
+        """
+        Args:
+            inputs (tuple[Tensor]): input features.
+        Returns:
+            tuple[Tensor]: multi level features.
+        """
+        assert len(inputs) == len(self.in_channels)
+        # Bring every input scale to out_channels.
+        inputs = [
+            reduce(input_x)
+            for input_x, reduce in zip(inputs, self.reduce_layers)
+        ]
+        # top-down path
+        inner_outs = [inputs[-1]]
+        for idx in range(len(self.in_channels) - 1, 0, -1):
+            feat_heigh = inner_outs[0]
+            feat_low = inputs[idx - 1]
+
+            # Writes back the value just read, so the list is unchanged.
+            inner_outs[0] = feat_heigh
+
+            upsample_feat = self.upsample(feat_heigh)
+
+            inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx](
+                torch.cat([upsample_feat, feat_low], 1))
+            inner_outs.insert(0, inner_out)
+
+        # bottom-up path
+        outs = [inner_outs[0]]
+        for idx in range(len(self.in_channels) - 1):
+            feat_low = outs[-1]
+            feat_height = inner_outs[idx + 1]
+            downsample_feat = self.downsamples[idx](feat_low)
+            out = self.bottom_up_blocks[idx](
+                torch.cat([downsample_feat, feat_height], 1))
+            outs.append(out)
+
+        # extra layers
+        # Each extra level sums a strided conv of the last reduced input and
+        # a strided conv of the last output so far.
+        for extra_in_layer, extra_out_layer in zip(self.extra_lvl_in_conv,
+                                                   self.extra_lvl_out_conv):
+            outs.append(extra_in_layer(inputs[-1]) + extra_out_layer(outs[-1]))
+
+        return tuple(outs)
diff --git a/modelscope/models/cv/face_human_hand_detection/nanodet_plus_head.py b/modelscope/models/cv/face_human_hand_detection/nanodet_plus_head.py
new file mode 100644
index 00000000..7f5b50ec
--- /dev/null
+++ b/modelscope/models/cv/face_human_hand_detection/nanodet_plus_head.py
@@ -0,0 +1,427 @@
+# The implementation here is modified based on nanodet,
+# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet
+
+import math
+
+import cv2
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision.ops import nms
+
+from .utils import ConvModule, DepthwiseConvModule
+
+
+class Integral(nn.Module):
+    """Fixed (non-learned) layer turning a discrete distribution into a value.
+
+    The result is the expectation ``sum(P(y_i) * y_i)``, where ``P(y_i)`` is
+    the softmax over each group of ``reg_max + 1`` logits and ``y_i`` runs
+    over the discrete set ``{0, 1, ..., reg_max}``.
+
+    Args:
+        reg_max (int): The maximal value of the discrete set. Default: 16.
+    """
+
+    def __init__(self, reg_max=16):
+        super().__init__()
+        self.reg_max = reg_max
+        bins = torch.linspace(0, self.reg_max, self.reg_max + 1)
+        self.register_buffer('project', bins)
+
+    def forward(self, x):
+        """Convert regression logits into expected distances.
+
+        Args:
+            x (Tensor): Regression features whose last dimension has size
+                ``4 * (reg_max + 1)``.
+        Returns:
+            Tensor: Same leading dimensions as ``x``, last dimension 4.
+        """
+        lead_dims = x.size()[:-1]
+        logits = x.reshape(*lead_dims, 4, self.reg_max + 1)
+        probs = F.softmax(logits, dim=-1)
+        expected = F.linear(probs, self.project.type_as(probs))
+        return expected.reshape(*lead_dims, 4)
+
+
+def batched_nms(boxes, scores, idxs, nms_cfg, class_agnostic=False):
+    """Run non-maximum suppression independently for each group of boxes.
+
+    Modified from https://github.com/pytorch/vision/blob
+    /505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39.
+    To keep groups from suppressing each other, every box is shifted by an
+    offset that depends only on its group index and is larger than any
+    coordinate, so boxes of different groups never overlap.
+
+    Arguments:
+        boxes (torch.Tensor): boxes in shape (N, 4).
+        scores (torch.Tensor): scores in shape (N, ).
+        idxs (torch.Tensor): group (e.g. class) index of each box, shape
+            (N, ). NMS is not applied between boxes of different groups.
+        nms_cfg (dict): NMS options; it is copied, never modified. Keys:
+            - type: accepted for config compatibility and discarded.
+            - class_agnostic (bool): overrides the argument of that name.
+            - split_thr (int): once the number of boxes reaches this
+                threshold, NMS runs group by group, sequentially.
+                Defaults to 10000.
+            - any other key (e.g. iou_threshold) is forwarded to the
+                imported ``nms`` as a keyword argument.
+        class_agnostic (bool): if true, IoU thresholding happens over all
+            boxes, regardless of group.
+
+    Returns:
+        tuple: kept dets of shape (k, 5) (boxes with the score appended)
+            and the indices of the kept boxes.
+    """
+    cfg = nms_cfg.copy()
+    class_agnostic = cfg.pop('class_agnostic', class_agnostic)
+    cfg.pop('type', 'nms')
+    split_thr = cfg.pop('split_thr', 10000)
+
+    if class_agnostic:
+        shifted = boxes
+    else:
+        # move each group into its own, non-overlapping coordinate range
+        group_offsets = idxs.to(boxes) * (boxes.max() + 1)
+        shifted = boxes + group_offsets[:, None]
+
+    if len(shifted) >= split_thr:
+        # many boxes: suppress one group at a time
+        kept_mask = scores.new_zeros(scores.size(), dtype=torch.bool)
+        for group_id in torch.unique(idxs):
+            members = (idxs == group_id).nonzero(as_tuple=False).view(-1)
+            group_keep = nms(shifted[members], scores[members], **cfg)
+            kept_mask[members[group_keep]] = True
+        keep = kept_mask.nonzero(as_tuple=False).view(-1)
+        keep = keep[scores[keep].argsort(descending=True)]
+    else:
+        keep = nms(shifted, scores, **cfg)
+
+    dets = torch.cat([boxes[keep], scores[keep][:, None]], -1)
+    return dets, keep
+
+
+def multiclass_nms(multi_bboxes,
+                   multi_scores,
+                   score_thr,
+                   nms_cfg,
+                   max_num=-1,
+                   score_factors=None):
+    """NMS for multi-class bboxes.
+
+    Args:
+        multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
+        multi_scores (Tensor): shape (n, #class + 1), where the last column
+            contains scores of the background class, but this will be ignored.
+        score_thr (float): bbox threshold, applied to the raw scores (before
+            score_factors); bboxes scoring not above it are dropped.
+        nms_cfg (dict): NMS config forwarded to ``batched_nms``.
+        max_num (int): if there are more than max_num bboxes after NMS,
+            only top max_num will be kept.
+        score_factors (Tensor): The factors multiplied to scores before
+            applying NMS
+
+    Returns:
+        tuple: (bboxes, labels), tensors of shape (k, 5) and (k, ). Labels
+            are 0-based.
+    """
+    num_classes = multi_scores.size(1) - 1
+    if multi_bboxes.shape[1] > 4:
+        bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
+    else:
+        bboxes = multi_bboxes[:, None].expand(
+            multi_scores.size(0), num_classes, 4)
+    scores = multi_scores[:, :-1]
+
+    valid_mask = scores > score_thr
+
+    bboxes = torch.masked_select(
+        bboxes,
+        torch.stack((valid_mask, valid_mask, valid_mask, valid_mask),
+                    -1)).view(-1, 4)
+    if score_factors is not None:
+        scores = scores * score_factors[:, None]
+    scores = torch.masked_select(scores, valid_mask)
+    labels = valid_mask.nonzero(as_tuple=False)[:, 1]
+
+    if bboxes.numel() == 0:
+        bboxes = multi_bboxes.new_zeros((0, 5))
+        labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
+
+        if torch.onnx.is_in_onnx_export():
+            raise RuntimeError('[ONNX Error] Can not record NMS '
+                               'as it has not been executed this time')
+        return bboxes, labels
+
+    dets, keep = batched_nms(bboxes, scores, labels, nms_cfg)
+
+    if max_num > 0:
+        dets = dets[:max_num]
+        keep = keep[:max_num]
+
+    return dets, labels[keep]
+
+
+def distance2bbox(points, distance, max_shape=None):
+    """Turn per-point side distances into corner-format boxes.
+
+    Args:
+        points (Tensor): Point coordinates; the last dim starts with [x, y].
+        distance (Tensor): Distances from each point to the four box
+            sides; the last dim is (left, top, right, bottom).
+        max_shape (tuple): Optional (height, width); when given, every
+            coordinate is clamped into the image.
+
+    Returns:
+        Tensor: Boxes as (x1, y1, x2, y2) along the last dim.
+    """
+    px, py = points[..., 0], points[..., 1]
+    corners = [px - distance[..., 0], py - distance[..., 1],
+               px + distance[..., 2], py + distance[..., 3]]
+
+    if max_shape is not None:
+        # max_shape is (height, width): x is bounded by [1], y by [0]
+        limits = (max_shape[1], max_shape[0], max_shape[1], max_shape[0])
+        corners = [c.clamp(min=0, max=m) for c, m in zip(corners, limits)]
+    return torch.stack(corners, -1)
+
+
+def warp_boxes(boxes, M, width, height):
+    """Map xyxy boxes through transform ``M`` and clip to width/height."""
+    count = len(boxes)
+    if not count:
+        return boxes
+    corners = np.ones((count * 4, 3))
+    corners[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(count * 4, 2)
+    corners = corners @ M.T
+    corners = (corners[:, :2] / corners[:, 2:3]).reshape(count, 8)
+    xs = corners[:, [0, 2, 4, 6]]
+    ys = corners[:, [1, 3, 5, 7]]
+    warped = np.concatenate(
+        (xs.min(1), ys.min(1), xs.max(1), ys.max(1))).reshape(4, count).T
+    warped[:, [0, 2]] = warped[:, [0, 2]].clip(0, width)
+    warped[:, [1, 3]] = warped[:, [1, 3]].clip(0, height)
+    return warped.astype(np.float32)
+
+
+class NanoDetPlusHead(nn.Module):
+    """Detection head used in NanoDet-Plus.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        input_channel (int): Number of channels of the input feature.
+        feat_channels (int): Number of channels of the feature.
+            Default: 96.
+        stacked_convs (int): Number of conv layers in the stacked convs.
+            Default: 2.
+        kernel_size (int): Size of the convolving kernel. Default: 5.
+        strides (list[int]): Strides of input multi-level feature maps.
+            Default: [8, 16, 32].
+        conv_type (str): "Conv" selects ConvModule, any other value
+            DepthwiseConvModule. Default: "DWConv".
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: dict(type='BN').
+        reg_max (int): The maximal value of the discrete set. Default: 7.
+        activation (str): Type of activation function. Default: "LeakyReLU".
+        assigner_cfg (dict): Accepted but not read by this head.
+        **kwargs: Accepted and ignored.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 input_channel,
+                 feat_channels=96,
+                 stacked_convs=2,
+                 kernel_size=5,
+                 strides=[8, 16, 32],
+                 conv_type='DWConv',
+                 norm_cfg=dict(type='BN'),
+                 reg_max=7,
+                 activation='LeakyReLU',
+                 assigner_cfg=dict(topk=13),
+                 **kwargs):
+        super(NanoDetPlusHead, self).__init__()
+        self.num_classes = num_classes
+        self.in_channels = input_channel
+        self.feat_channels = feat_channels
+        self.stacked_convs = stacked_convs
+        self.kernel_size = kernel_size
+        self.strides = strides
+        self.reg_max = reg_max
+        self.activation = activation
+        self.ConvModule = ConvModule if conv_type == 'Conv' else DepthwiseConvModule
+
+        self.norm_cfg = norm_cfg
+        self.distribution_project = Integral(self.reg_max)
+
+        self._init_layers()
+
+    def _init_layers(self):
+        self.cls_convs = nn.ModuleList()
+        for _ in self.strides:
+            cls_convs = self._buid_not_shared_head()
+            self.cls_convs.append(cls_convs)
+
+        self.gfl_cls = nn.ModuleList([
+            nn.Conv2d(
+                self.feat_channels,
+                self.num_classes + 4 * (self.reg_max + 1),
+                1,
+                padding=0,
+            ) for _ in self.strides
+        ])
+
+    def _buid_not_shared_head(self):
+        cls_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            cls_convs.append(
+                self.ConvModule(
+                    chn,
+                    self.feat_channels,
+                    self.kernel_size,
+                    stride=1,
+                    padding=self.kernel_size // 2,
+                    norm_cfg=self.norm_cfg,
+                    bias=self.norm_cfg is None,
+                    activation=self.activation,
+                ))
+        return cls_convs
+
+    def forward(self, feats):
+        if torch.onnx.is_in_onnx_export():
+            return self._forward_onnx(feats)  # NOTE: not defined in this class
+        outputs = []
+        for feat, cls_convs, gfl_cls in zip(
+                feats,
+                self.cls_convs,
+                self.gfl_cls,
+        ):
+            for conv in cls_convs:
+                feat = conv(feat)
+            output = gfl_cls(feat)
+            outputs.append(output.flatten(start_dim=2))
+        outputs = torch.cat(outputs, dim=2).permute(0, 2, 1)
+        return outputs
+
+    def post_process(self, preds, meta):
+        """Prediction results post processing. Decode bboxes and rescale
+        to original image size.
+        Args:
+            preds (Tensor): Prediction output.
+            meta (dict): Meta info.
+        """
+        cls_scores, bbox_preds = preds.split(
+            [self.num_classes, 4 * (self.reg_max + 1)], dim=-1)
+        result_list = self.get_bboxes(cls_scores, bbox_preds, meta)
+        det_results = {}
+        # One warp matrix per image; each is inverted below before being
+        # applied to that image's boxes.
+        warp_matrixes = meta['warp_matrix']
+        img_heights = (
+            meta['img_info']['height'].cpu().numpy() if isinstance(
+                meta['img_info']['height'], torch.Tensor) else
+            meta['img_info']['height'])
+        img_widths = (
+            meta['img_info']['width'].cpu().numpy() if isinstance(
+                meta['img_info']['width'], torch.Tensor) else
+            meta['img_info']['width'])
+        img_ids = (
+            meta['img_info']['id'].cpu().numpy() if isinstance(
+                meta['img_info']['id'], torch.Tensor) else
+            meta['img_info']['id'])
+
+        for result, img_width, img_height, img_id, warp_matrix in zip(
+                result_list, img_widths, img_heights, img_ids, warp_matrixes):
+            det_result = {}
+            det_bboxes, det_labels = result
+            det_bboxes = det_bboxes.detach().cpu().numpy()
+            det_bboxes[:, :4] = warp_boxes(det_bboxes[:, :4],
+                                           np.linalg.inv(warp_matrix),
+                                           img_width, img_height)
+            classes = det_labels.detach().cpu().numpy()
+            for i in range(self.num_classes):
+                inds = classes == i
+                det_result[i] = np.concatenate(
+                    [
+                        det_bboxes[inds, :4].astype(np.float32),
+                        det_bboxes[inds, 4:5].astype(np.float32),
+                    ],
+                    axis=1,
+                ).tolist()
+            det_results[img_id] = det_result
+        return det_results
+
+    def get_bboxes(self, cls_preds, reg_preds, img_metas):
+        """Decode the outputs to bboxes.
+        Args:
+            cls_preds (Tensor): Shape (num_imgs, num_points, num_classes).
+            reg_preds (Tensor): Shape (num_imgs, num_points, 4 * (regmax + 1)).
+            img_metas (dict): Dict of image info.
+
+        Returns:
+            results_list (list[tuple]): List of detection bboxes and labels.
+        """
+        device = cls_preds.device
+        b = cls_preds.shape[0]
+        input_height, input_width = img_metas['img'].shape[2:]
+        input_shape = (input_height, input_width)
+
+        featmap_sizes = [(math.ceil(input_height / stride),
+                          math.ceil(input_width / stride))
+                         for stride in self.strides]
+        mlvl_center_priors = [
+            self.get_single_level_center_priors(
+                b,
+                featmap_sizes[i],
+                stride,
+                dtype=torch.float32,
+                device=device,
+            ) for i, stride in enumerate(self.strides)
+        ]
+        center_priors = torch.cat(mlvl_center_priors, dim=1)
+        dis_preds = self.distribution_project(reg_preds) * center_priors[...,
+                                                                         2,
+                                                                         None]
+        bboxes = distance2bbox(
+            center_priors[..., :2], dis_preds, max_shape=input_shape)
+        scores = cls_preds.sigmoid()
+        result_list = []
+        for i in range(b):
+            score, bbox = scores[i], bboxes[i]
+            padding = score.new_zeros(score.shape[0], 1)
+            score = torch.cat([score, padding], dim=1)
+            results = multiclass_nms(
+                bbox,
+                score,
+                score_thr=0.05,
+                nms_cfg=dict(type='nms', iou_threshold=0.6),
+                max_num=100,
+            )
+            result_list.append(results)
+        return result_list
+
+    def get_single_level_center_priors(self, batch_size, featmap_size, stride,
+                                       dtype, device):
+        """Generate centers of a single stage feature map.
+        Args:
+            batch_size (int): Number of images in one batch.
+            featmap_size (tuple[int]): height and width of the feature map
+            stride (int): down sample stride of the feature map
+            dtype (obj:`torch.dtype`): data type of the tensors
+            device (obj:`torch.device`): device of the tensors
+        Return:
+            priors (Tensor): (B, h * w, 4) rows of [x, y, stride, stride].
+        """
+        h, w = featmap_size
+        x_range = (torch.arange(w, dtype=dtype, device=device)) * stride
+        y_range = (torch.arange(h, dtype=dtype, device=device)) * stride
+        y, x = torch.meshgrid(y_range, x_range)
+        y = y.flatten()
+        x = x.flatten()
+        strides = x.new_full((x.shape[0], ), stride)
+        priors = torch.stack([x, y, strides, strides], dim=-1)
+        return priors.unsqueeze(0).repeat(batch_size, 1, 1)
diff --git a/modelscope/models/cv/face_human_hand_detection/one_stage_detector.py b/modelscope/models/cv/face_human_hand_detection/one_stage_detector.py
new file mode 100644
index 00000000..c1d0a52f
--- /dev/null
+++ b/modelscope/models/cv/face_human_hand_detection/one_stage_detector.py
@@ -0,0 +1,64 @@
+# The implementation here is modified based on nanodet,
+# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet
+
+import torch
+import torch.nn as nn
+
+from .ghost_pan import GhostPAN
+from .nanodet_plus_head import NanoDetPlusHead
+from .shufflenetv2 import ShuffleNetV2
+
+
+class OneStageDetector(nn.Module):
+    """ShuffleNetV2 backbone + GhostPAN neck + NanoDetPlusHead detector."""
+
+    def __init__(self):
+        super(OneStageDetector, self).__init__()
+        self.backbone = ShuffleNetV2(
+            model_size='1.0x', out_stages=(2, 3, 4), with_last_conv=False,
+            kernal_size=3, activation='LeakyReLU', pretrain=False)
+        # in_channels are the stage 2-4 widths of the '1.0x' backbone.
+        self.fpn = GhostPAN(
+            in_channels=[116, 232, 464],
+            out_channels=96,
+            use_depthwise=True,
+            kernel_size=5,
+            expand=1,
+            num_blocks=1,
+            use_res=False,
+            num_extra_level=1,
+            upsample_cfg=dict(scale_factor=2, mode='bilinear'),
+            norm_cfg=dict(type='BN'),
+            activation='LeakyReLU')
+        # Four strides: presumably three fpn levels plus the one extra level.
+        self.head = NanoDetPlusHead(
+            num_classes=3, input_channel=96, feat_channels=96,
+            stacked_convs=2, kernel_size=5, strides=[8, 16, 32, 64],
+            conv_type='DWConv', norm_cfg=dict(type='BN'), reg_max=7,
+            activation='LeakyReLU', assigner_cfg=dict(topk=13))
+        self.epoch = 0
+
+    def forward(self, x):
+        """Run the backbone, then the neck and head when they are present."""
+        x = self.backbone(x)
+        if hasattr(self, 'fpn'):
+            x = self.fpn(x)
+        if hasattr(self, 'head'):
+            x = self.head(x)
+        return x
+
+    @staticmethod
+    def _sync():
+        """Wait for pending CUDA work; does nothing on hosts without CUDA."""
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+
+    def inference(self, meta):
+        """Run the model on ``meta['img']`` and post-process the output."""
+        with torch.no_grad():
+            self._sync()
+            preds = self(meta['img'])
+            self._sync()
+            results = self.head.post_process(preds, meta)
+            self._sync()
+        return results
diff --git a/modelscope/models/cv/face_human_hand_detection/shufflenetv2.py b/modelscope/models/cv/face_human_hand_detection/shufflenetv2.py
new file mode 100644
index 00000000..7f4dfc2a
--- /dev/null
+++ b/modelscope/models/cv/face_human_hand_detection/shufflenetv2.py
@@ -0,0 +1,182 @@
+# The implementation here is modified based on nanodet,
+# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet
+
+import torch
+import torch.nn as nn
+
+from .utils import act_layers
+
+
+def channel_shuffle(x, groups):
+    """Interleave the channels of ``groups`` equal channel groups.
+
+    Args:
+        x (Tensor): 4-D input of shape (N, C, H, W).
+        groups (int): Number of groups the C channels are split into.
+    """
+    n, c, h, w = x.data.size()
+    grouped = x.view(n, groups, c // groups, h, w)
+    swapped = grouped.transpose(1, 2).contiguous()
+    return swapped.view(n, -1, h, w)
+
+
+class ShuffleV2Block(nn.Module):
+
+    def __init__(self, inp, oup, stride, activation='ReLU'):
+        super(ShuffleV2Block, self).__init__()
+
+        if not (1 <= stride <= 3):
+            raise ValueError('illegal stride value')
+        self.stride = stride
+        # each branch outputs oup // 2 channels; forward() concatenates both
+        branch_features = oup // 2
+        assert (self.stride != 1) or (inp == branch_features << 1)
+        # branch1 only has layers when stride > 1; with stride 1 it is empty
+        if self.stride > 1:
+            self.branch1 = nn.Sequential(
+                self.depthwise_conv(
+                    inp, inp, kernel_size=3, stride=self.stride, padding=1),
+                nn.BatchNorm2d(inp),
+                nn.Conv2d(
+                    inp,
+                    branch_features,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                    bias=False),
+                nn.BatchNorm2d(branch_features),
+                act_layers(activation),
+            )
+        else:
+            self.branch1 = nn.Sequential()
+        # branch2: 1x1 conv -> 3x3 depthwise conv (strided) -> 1x1 conv
+        self.branch2 = nn.Sequential(
+            nn.Conv2d(
+                inp if (self.stride > 1) else branch_features,
+                branch_features,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                bias=False,
+            ),
+            nn.BatchNorm2d(branch_features),
+            act_layers(activation),
+            self.depthwise_conv(
+                branch_features,
+                branch_features,
+                kernel_size=3,
+                stride=self.stride,
+                padding=1,
+            ),
+            nn.BatchNorm2d(branch_features),
+            nn.Conv2d(
+                branch_features,
+                branch_features,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                bias=False,
+            ),
+            nn.BatchNorm2d(branch_features),
+            act_layers(activation),
+        )
+
+    @staticmethod
+    def depthwise_conv(i, o, kernel_size, stride=1, padding=0, bias=False):
+        return nn.Conv2d(
+            i, o, kernel_size, stride, padding, bias=bias, groups=i)
+
+    def forward(self, x):
+        if self.stride == 1:
+            x1, x2 = x.chunk(2, dim=1)
+            out = torch.cat((x1, self.branch2(x2)), dim=1)
+        else:
+            out = torch.cat((self.branch1(x), self.branch2(x)), dim=1)
+        # shuffle so channels from the two concatenated halves interleave
+        out = channel_shuffle(out, 2)
+
+        return out
+
+
+class ShuffleNetV2(nn.Module):
+    """ShuffleNetV2 backbone returning the feature maps of selected stages.
+
+    ``model_size`` ('0.5x', '1.0x', '1.5x' or '2.0x') picks the channel
+    widths, ``out_stages`` the subset of stages (2, 3, 4) that forward()
+    returns, and ``with_last_conv`` appends a 1x1 'conv5' block to stage4.
+    """
+
+    def __init__(self, model_size='1.5x', out_stages=(2, 3, 4),
+                 with_last_conv=False, kernal_size=3, activation='ReLU',
+                 pretrain=True):
+        super(ShuffleNetV2, self).__init__()
+        assert set(out_stages).issubset((2, 3, 4))
+
+        # NOTE: pretrain is never read; no weights are loaded by this class.
+
+        self.stage_repeats = [4, 8, 4]
+        self.model_size = model_size
+        self.out_stages = out_stages
+        self.with_last_conv = with_last_conv
+        self.kernal_size = kernal_size
+        self.activation = activation
+        if model_size == '0.5x':
+            self._stage_out_channels = [24, 48, 96, 192, 1024]
+        elif model_size == '1.0x':
+            self._stage_out_channels = [24, 116, 232, 464, 1024]
+        elif model_size == '1.5x':
+            self._stage_out_channels = [24, 176, 352, 704, 1024]
+        elif model_size == '2.0x':
+            self._stage_out_channels = [24, 244, 488, 976, 2048]
+        else:
+            raise NotImplementedError('unknown model_size %r' % model_size)
+
+        # building first layer
+        input_channels = 3
+        output_channels = self._stage_out_channels[0]
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(input_channels, output_channels, 3, 2, 1, bias=False),
+            nn.BatchNorm2d(output_channels),
+            act_layers(activation),
+        )
+        input_channels = output_channels
+
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+        stage_names = ['stage{}'.format(i) for i in [2, 3, 4]]
+        for name, repeats, output_channels in zip(
+                stage_names, self.stage_repeats, self._stage_out_channels[1:]):
+            seq = [
+                ShuffleV2Block(
+                    input_channels, output_channels, 2, activation=activation)
+            ]
+            for i in range(repeats - 1):
+                seq.append(
+                    ShuffleV2Block(
+                        output_channels,
+                        output_channels,
+                        1,
+                        activation=activation))
+            setattr(self, name, nn.Sequential(*seq))
+            input_channels = output_channels
+        output_channels = self._stage_out_channels[-1]
+        if self.with_last_conv:
+            conv5 = nn.Sequential(
+                nn.Conv2d(
+                    input_channels, output_channels, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(output_channels),
+                act_layers(activation),
+            )
+            self.stage4.add_module('conv5', conv5)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.maxpool(x)
+        output = []
+        # every stage runs; only those listed in out_stages are returned
+        for i in range(2, 5):
+            stage = getattr(self, 'stage{}'.format(i))
+            x = stage(x)
+            if i in self.out_stages:
+                output.append(x)
+        return tuple(output)
diff --git a/modelscope/models/cv/face_human_hand_detection/utils.py b/modelscope/models/cv/face_human_hand_detection/utils.py
new file mode 100644
index 00000000..f989c164
--- /dev/null
+++ b/modelscope/models/cv/face_human_hand_detection/utils.py
@@ -0,0 +1,277 @@
+# The implementation here is modified based on nanodet,
+# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet
+
+import torch
+import torch.nn as nn
+
+activations = {  # activation name -> torch.nn layer class
+    'ReLU': nn.ReLU,
+    'LeakyReLU': nn.LeakyReLU,
+    'ReLU6': nn.ReLU6,
+    'SELU': nn.SELU,
+    'ELU': nn.ELU,
+    'GELU': nn.GELU,
+    'PReLU': nn.PReLU,
+    'SiLU': nn.SiLU,
+    'HardSwish': nn.Hardswish,  # alternate spelling of the entry below
+    'Hardswish': nn.Hardswish,
+    None: nn.Identity,  # no activation
+}
+
+
+def act_layers(name):
+    """Instantiate the activation layer registered under ``name``."""
+    assert name in activations
+    if name == 'LeakyReLU':
+        return nn.LeakyReLU(negative_slope=0.1, inplace=True)
+    if name == 'GELU':
+        return nn.GELU()
+    if name == 'PReLU':
+        return nn.PReLU()
+    return activations[name](inplace=True)
+
+
+norm_cfg = {  # norm type -> (name abbreviation, layer class)
+    'BN': ('bn', nn.BatchNorm2d),
+    'SyncBN': ('bn', nn.SyncBatchNorm),
+    'GN': ('gn', nn.GroupNorm),
+}
+
+
+def build_norm_layer(cfg, num_features, postfix=''):
+    """Create a normalization layer from a config dict.
+
+    Args:
+        cfg (dict): must contain ``type`` (a key of ``norm_cfg``). An
+            optional ``requires_grad`` (default True) is applied to every
+            parameter of the layer; ``eps`` defaults to 1e-5; remaining
+            keys go to the layer constructor. 'GN' requires ``num_groups``.
+        num_features (int): number of channels from input.
+        postfix (int, str): appended into norm abbreviation to
+            create named layer.
+
+    Returns:
+        name (str): abbreviation + postfix
+        layer (nn.Module): created norm layer
+    """
+    assert isinstance(cfg, dict) and 'type' in cfg
+    options = cfg.copy()  # leave the caller's dict untouched
+    layer_type = options.pop('type')
+    if layer_type not in norm_cfg:
+        raise KeyError('Unrecognized norm type {}'.format(layer_type))
+    abbr, norm_layer = norm_cfg[layer_type]
+    if norm_layer is None:
+        raise NotImplementedError
+
+    assert isinstance(postfix, (int, str))
+    name = abbr + str(postfix)
+
+    trainable = options.pop('requires_grad', True)
+    options.setdefault('eps', 1e-5)
+    if layer_type == 'GN':
+        # GroupNorm takes the channel count by keyword and needs num_groups
+        assert 'num_groups' in options
+        layer = norm_layer(num_channels=num_features, **options)
+    else:
+        layer = norm_layer(num_features, **options)
+        if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
+            layer._specify_ddp_gpu_num(1)
+
+    # switch gradient tracking for every parameter of the new layer
+    for weight in layer.parameters():
+        weight.requires_grad = trainable
+
+    return name, layer
+
+
+class ConvModule(nn.Module):
+    """A conv block that contains conv/norm/activation layers.
+
+    Args:
+        in_channels (int): Same as nn.Conv2d.
+        out_channels (int): Same as nn.Conv2d.
+        kernel_size (int or tuple[int]): Same as nn.Conv2d.
+        stride (int or tuple[int]): Same as nn.Conv2d.
+        padding (int or tuple[int]): Same as nn.Conv2d.
+        dilation (int or tuple[int]): Same as nn.Conv2d.
+        groups (int): Same as nn.Conv2d.
+        bias (bool or str): If specified as `auto`, it will be decided by the
+            norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
+            False.
+        conv_cfg (dict): Stored only; an nn.Conv2d is always built.
+        norm_cfg (dict): Config dict for normalization layer.
+        activation (str): activation layer, "ReLU" by default.
+        inplace (bool): Stored only; act_layers decides the inplace mode.
+        order (tuple[str]): The order of conv/norm/activation layers. It is a
+            sequence of "conv", "norm" and "act". Examples are
+            ("conv", "norm", "act") and ("act", "conv", "norm").
+    """
+
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=1,
+            padding=0,
+            dilation=1,
+            groups=1,
+            bias='auto',
+            conv_cfg=None,
+            norm_cfg=None,
+            activation='ReLU',
+            inplace=True,
+            order=('conv', 'norm', 'act'),
+    ):
+        super(ConvModule, self).__init__()
+        assert conv_cfg is None or isinstance(conv_cfg, dict)
+        assert norm_cfg is None or isinstance(norm_cfg, dict)
+        assert activation is None or isinstance(activation, str)
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.activation = activation
+        self.inplace = inplace
+        self.order = order
+        assert isinstance(self.order, tuple) and len(self.order) == 3
+        assert set(order) == {'conv', 'norm', 'act'}
+
+        self.with_norm = norm_cfg is not None
+        if bias == 'auto':
+            bias = False if self.with_norm else True
+        self.with_bias = bias
+        if self.with_norm and self.with_bias:
+            import warnings  # local import: the module never imports it
+            warnings.warn('ConvModule has norm and bias at the same time')
+
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+        self.in_channels = self.conv.in_channels
+        self.out_channels = self.conv.out_channels
+        self.kernel_size = self.conv.kernel_size
+        self.stride = self.conv.stride
+        self.padding = self.conv.padding
+        self.dilation = self.conv.dilation
+        self.transposed = self.conv.transposed
+        self.output_padding = self.conv.output_padding
+        self.groups = self.conv.groups
+
+        if self.with_norm:
+            if order.index('norm') > order.index('conv'):
+                norm_channels = out_channels
+            else:
+                norm_channels = in_channels
+            self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels)
+            self.add_module(self.norm_name, norm)
+        else:
+            self.norm_name = None
+
+        if self.activation:
+            self.act = act_layers(self.activation)
+
+    @property
+    def norm(self):
+        if self.norm_name:
+            return getattr(self, self.norm_name)
+        else:
+            return None
+
+    def forward(self, x, norm=True):
+        for layer in self.order:
+            if layer == 'conv':
+                x = self.conv(x)
+            elif layer == 'norm' and norm and self.with_norm:
+                x = self.norm(x)
+            elif layer == 'act' and self.activation:
+                x = self.act(x)
+        return x
+
+
+class DepthwiseConvModule(nn.Module):
+    """Depthwise conv followed by a 1x1 pointwise conv, each with optional
+    norm, sharing one activation module.
+    """
+
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=1,
+            padding=0,
+            dilation=1,
+            bias='auto',
+            norm_cfg=dict(type='BN'),
+            activation='ReLU',
+            inplace=True,
+            order=('depthwise', 'dwnorm', 'act', 'pointwise', 'pwnorm', 'act'),
+    ):
+        super(DepthwiseConvModule, self).__init__()
+        assert activation is None or isinstance(activation, str)
+        self.activation = activation
+        self.inplace = inplace
+        self.order = order
+        assert isinstance(self.order, tuple) and len(self.order) == 6
+        assert set(order) == {
+            'depthwise', 'dwnorm', 'act', 'pointwise', 'pwnorm'}
+
+        self.with_norm = norm_cfg is not None
+        if bias == 'auto':
+            bias = False if self.with_norm else True
+        self.with_bias = bias
+
+        if self.with_norm and self.with_bias:
+            import warnings  # local import: the module never imports it
+            warnings.warn('ConvModule has norm and bias at the same time')
+
+        self.depthwise = nn.Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=in_channels,
+            bias=bias,
+        )
+        self.pointwise = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=bias)
+
+        self.in_channels = self.depthwise.in_channels
+        self.out_channels = self.pointwise.out_channels
+        self.kernel_size = self.depthwise.kernel_size
+        self.stride = self.depthwise.stride
+        self.padding = self.depthwise.padding
+        self.dilation = self.depthwise.dilation
+        self.transposed = self.depthwise.transposed
+        self.output_padding = self.depthwise.output_padding
+
+        if self.with_norm:
+            _, self.dwnorm = build_norm_layer(norm_cfg, in_channels)
+            _, self.pwnorm = build_norm_layer(norm_cfg, out_channels)
+
+        if self.activation:
+            self.act = act_layers(self.activation)
+
+    def forward(self, x, norm=True):
+        """Run the layers in ``self.order``; norm layers are skipped when
+        ``norm`` is False or the module was built without norm_cfg."""
+        for layer_name in self.order:
+            is_norm = layer_name in ('dwnorm', 'pwnorm')
+            if layer_name == 'act':
+                x = self.act(x) if self.activation else x
+            elif not is_norm or (norm and self.with_norm):
+                x = getattr(self, layer_name)(x)
+        return x
diff --git a/modelscope/models/cv/face_recognition/align_face.py b/modelscope/models/cv/face_recognition/align_face.py
index a6469a10..0477375a 100644
--- a/modelscope/models/cv/face_recognition/align_face.py
+++ b/modelscope/models/cv/face_recognition/align_face.py
@@ -1,3 +1,7 @@
+"""
+The implementation here is modified based on insightface, originally MIT license and publicly available at
+https://github.com/deepinsight/insightface/blob/master/python-package/insightface/utils/face_align.py
+"""
 import cv2
 import numpy as np
 from skimage import transform as trans
diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py b/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py
index a58d8e17..afe89963 100755
--- a/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py
+++ b/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from TFace, made publicly available under the Apache-2.0 license at
+# https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone
 from .model_irse import (IR_18, IR_34, IR_50, IR_101, IR_152, IR_200, IR_SE_50,
                          IR_SE_101, IR_SE_152, IR_SE_200)
 from .model_resnet import ResNet_50, ResNet_101, ResNet_152
diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/common.py b/modelscope/models/cv/face_recognition/torchkit/backbone/common.py
index 426d2591..a1683225 100755
--- a/modelscope/models/cv/face_recognition/torchkit/backbone/common.py
+++ b/modelscope/models/cv/face_recognition/torchkit/backbone/common.py
@@ -1,3 +1,5 @@
+# The implementation is adopted from TFace, made publicly available under the Apache-2.0 license at
+# https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone/common.py
 import torch
 import torch.nn as nn
 from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Linear, Module, ReLU,
diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py b/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py
index 4fb7ee9c..1982ca05 100755
--- a/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py
+++ b/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py
@@ -1,5 +1,5 @@
-# based on:
-# https://github.com/ZhaoJ9014/face.evoLVe.PyTorch/blob/master/backbone/model_irse.py
+# The implementation is adopted from TFace, made publicly available under the Apache-2.0 license at
+# https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone/model_irse.py
 from collections import namedtuple
 
 from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear,
diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py b/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py
index 7072f384..568e24ff 100755
--- a/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py
+++ b/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py
@@ -1,5 +1,5 @@
-# based on:
-# https://github.com/ZhaoJ9014/face.evoLVe.PyTorch/blob/master/backbone/model_resnet.py
+# The implementation is adopted from TFace, made publicly available under the Apache-2.0 license at
+# https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone/model_resnet.py
 import torch.nn as nn
 from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear,
                       MaxPool2d, Module, ReLU, Sequential)
diff --git a/modelscope/models/cv/facial_expression_recognition/__init__.py b/modelscope/models/cv/facial_expression_recognition/__init__.py
new file mode 100644
index 00000000..35a15d18
--- /dev/null
+++ b/modelscope/models/cv/facial_expression_recognition/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:  # only static type checkers take this branch
+    from .fer import FacialExpressionRecognition
+
+else:
+    _import_structure = {'fer': ['FacialExpressionRecognition']}  # presumably submodule -> exported names
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(  # replaces this module's entry in sys.modules
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/facial_expression_recognition/fer/__init__.py b/modelscope/models/cv/facial_expression_recognition/fer/__init__.py
new file mode 100644
index 00000000..2546035b
--- /dev/null
+++ b/modelscope/models/cv/facial_expression_recognition/fer/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .facial_expression_recognition import FacialExpressionRecognition
diff --git a/modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py b/modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py
new file mode 100644
index 00000000..c5eb71a1
--- /dev/null
+++ b/modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py
@@ -0,0 +1,72 @@
+# The implementation is based on Facial-Expression-Recognition, available at
+# https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch
+import os
+
+import cv2
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+import torch.nn.functional as F
+from PIL import Image
+from torch.autograd import Variable
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+from . import transforms
+from .vgg import VGG
+
+
+@MODELS.register_module(
+    Tasks.facial_expression_recognition, module_name=Models.fer)
+class FacialExpressionRecognition(TorchModel):
+    """VGG19 expression classifier scored on ten 44x44 crops of a 48x48 face."""
+
+    def __init__(self, model_path, device='cuda'):
+        super().__init__(model_path)
+        # NOTE(review): this switches autograd off for the whole process.
+        torch.set_grad_enabled(False)
+        cudnn.benchmark = True
+        self.model_path = model_path
+        self.device = device
+        self.cfg_path = model_path.replace(ModelFile.TORCH_MODEL_FILE,
+                                           ModelFile.CONFIGURATION)
+        self.net = VGG('VGG19', cfg_path=self.cfg_path)
+        self.load_model()
+        self.net = self.net.to(device)
+        self.transform_test = transforms.Compose([
+            transforms.TenCrop(44),
+            transforms.Lambda(lambda crops: torch.stack(
+                [transforms.ToTensor()(crop) for crop in crops])),
+        ])
+        self.mean = np.array([[104, 117, 123]])
+
+    def load_model(self, load_to_cpu=False):
+        """Load the ``'net'`` weights on CPU, set eval mode (arg is unused)."""
+        pretrained_dict = torch.load(
+            self.model_path, map_location=torch.device('cpu'))
+        self.net.load_state_dict(pretrained_dict['net'], strict=True)
+        self.net.eval()
+
+    def forward(self, input):
+        """Return ``(score, predicted)`` for the BGR image ``input['img']``."""
+        img = input['img']
+        img = cv2.cvtColor(img.cpu().numpy(), cv2.COLOR_BGR2GRAY)
+        img = cv2.resize(img, (48, 48))
+        img = img[:, :, np.newaxis]
+        img = np.concatenate((img, img, img), axis=2)
+        img = Image.fromarray(np.uint8(img))
+
+        inputs = self.transform_test(img)
+        ncrops, c, h, w = inputs.shape
+        # Variable(volatile=True) is a deprecated no-op; autograd is off already.
+        inputs = inputs.view(-1, c, h, w).to(self.device)
+        outputs = self.net(inputs)
+
+        outputs_avg = outputs.view(ncrops, -1).mean(0)  # avg over crops
+        # softmax without ``dim`` is deprecated; outputs_avg is 1-D, so dim=0
+        score = F.softmax(outputs_avg, dim=0)
+        _, predicted = torch.max(outputs_avg.data, 0)
+
+        return score, predicted
diff --git a/modelscope/models/cv/facial_expression_recognition/fer/transforms.py b/modelscope/models/cv/facial_expression_recognition/fer/transforms.py
new file mode 100644
index 00000000..a1448c49
--- /dev/null
+++ b/modelscope/models/cv/facial_expression_recognition/fer/transforms.py
@@ -0,0 +1,118 @@
+# The implementation is based on Facial-Expression-Recognition, available at
+# https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch
+import numbers
+import types
+
+import numpy as np
+import torch
+from PIL import Image
+
+
+def to_tensor(pic):
+    """Convert a PIL image to a CHW tensor; byte data is scaled by 1/255."""
+    # np.array(..., copy=False) raises on NumPy >= 2 when a copy cannot be
+    # avoided, so let NumPy copy (torch also gets a writable array).
+    if pic.mode == 'I':
+        img = torch.from_numpy(np.array(pic, np.int32))
+    elif pic.mode == 'I;16':
+        img = torch.from_numpy(np.array(pic, np.int16))
+    else:
+        img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
+    # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
+    if pic.mode == 'YCbCr':
+        nchannel = 3
+    elif pic.mode == 'I;16':
+        nchannel = 1
+    else:
+        nchannel = len(pic.mode)
+    img = img.view(pic.size[1], pic.size[0], nchannel)
+    # put it from HWC to CHW format
+    # yikes, this transpose takes 80% of the loading time/CPU
+    img = img.transpose(0, 1).transpose(0, 2).contiguous()
+    if isinstance(img, torch.ByteTensor):
+        return img.float().div(255)
+    return img
+
+
+def center_crop(img, output_size):
+    """Centre-crop ``img`` to ``output_size`` (h, w); a number means square."""
+    if isinstance(output_size, numbers.Number):
+        output_size = (int(output_size), ) * 2
+    (width, height), (crop_h, crop_w) = img.size, output_size
+    top = int(round((height - crop_h) / 2.))
+    left = int(round((width - crop_w) / 2.))
+    return img.crop((left, top, left + crop_w, top + crop_h))
+
+
+def five_crop(img, size):
+    """Return the four corner crops and the centre crop, each ``size`` (h, w)."""
+    if isinstance(size, numbers.Number):
+        size = (int(size), int(size))
+    else:
+        assert len(
+            size) == 2, 'Please provide only two dimensions (h, w) for size.'
+    img_w, img_h = img.size
+    crop_h, crop_w = size
+    if crop_w > img_w or crop_h > img_h:
+        raise ValueError(
+            'Requested crop size {} is bigger than input size {}'.format(
+                size, (img_h, img_w)))
+    right, bottom = img_w - crop_w, img_h - crop_h
+    corner_boxes = ((0, 0, crop_w, crop_h), (right, 0, img_w, crop_h),
+                    (0, bottom, crop_w, img_h), (right, bottom, img_w, img_h))
+    corners = tuple(img.crop(box) for box in corner_boxes)
+    # order: top-left, top-right, bottom-left, bottom-right, centre
+    return corners + (center_crop(img, (crop_h, crop_w)), )
+
+
+class TenCrop(object):
+    """Five crops of the image followed by five crops of its flipped copy."""
+
+    def __init__(self, size, vertical_flip=False):
+        """``size`` is a number (square crop) or an (h, w) pair."""
+        if isinstance(size, numbers.Number):
+            size = (int(size), int(size))
+        else:
+            assert len(
+                size
+            ) == 2, 'Please provide only two dimensions (h, w) for size.'
+        self.size = size
+        self.vertical_flip = vertical_flip
+
+    def __call__(self, img):
+        """Return a 10-tuple: crops of ``img``, then crops of its flip."""
+        # flip top/bottom when vertical_flip is set, left/right otherwise
+        flip = (Image.FLIP_TOP_BOTTOM
+                if self.vertical_flip else Image.FLIP_LEFT_RIGHT)
+        originals = five_crop(img, self.size)
+
+        mirrored = five_crop(img.transpose(flip), self.size)
+
+        return originals + mirrored
+
+
+class Compose(object):
+    """Apply callables in order, feeding each one the previous result."""
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, img):
+        for step in self.transforms:
+            img = step(img)
+        return img
+
+
+class ToTensor(object):
+    """Callable wrapper that forwards to ``to_tensor``."""
+    def __call__(self, pic):
+        return to_tensor(pic)
+
+
+class Lambda(object):
+    """Wrap a plain function (``types.LambdaType``) as a transform."""
+    def __init__(self, lambd):
+        assert isinstance(lambd, types.LambdaType)
+        self.lambd = lambd
+
+    def __call__(self, img):
+        return self.lambd(img)
diff --git a/modelscope/models/cv/facial_expression_recognition/fer/vgg.py b/modelscope/models/cv/facial_expression_recognition/fer/vgg.py
new file mode 100644
index 00000000..8120b6cc
--- /dev/null
+++ b/modelscope/models/cv/facial_expression_recognition/fer/vgg.py
@@ -0,0 +1,40 @@
+# The implementation is based on Facial-Expression-Recognition, available at
+# https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+from modelscope.utils.config import Config
+
+
+class VGG(nn.Module):
+    """Conv stack built from a config list, plus a 512 -> 7 linear head."""
+
+    def __init__(self, vgg_name, cfg_path):
+        super().__init__()
+        layer_spec = Config.from_file(cfg_path)['models'][vgg_name]
+        self.features = self._make_layers(layer_spec)
+        self.classifier = nn.Linear(512, 7)
+
+    def forward(self, x):
+        """Return the classifier output for the batch ``x``."""
+        flat = self.features(x)
+        flat = flat.view(flat.size(0), -1)
+        # dropout is active only while the module is in training mode
+        flat = F.dropout(flat, p=0.5, training=self.training)
+        return self.classifier(flat)
+
+    def _make_layers(self, cfg):
+        """'M' adds a 2x2 max-pool; any other entry adds conv3x3 + BN + ReLU."""
+        stack, channels = [], 3
+        for entry in cfg:
+            if entry == 'M':
+                stack.append(nn.MaxPool2d(kernel_size=2, stride=2))
+                continue
+            stack.append(nn.Conv2d(channels, entry, kernel_size=3, padding=1))
+            stack.append(nn.BatchNorm2d(entry))
+            stack.append(nn.ReLU(inplace=True))
+            channels = entry
+        stack.append(nn.AvgPool2d(kernel_size=1, stride=1))
+        return nn.Sequential(*stack)
diff --git a/modelscope/models/cv/hand_static/__init__.py b/modelscope/models/cv/hand_static/__init__.py
new file mode 100644
index 00000000..654d2acb
--- /dev/null
+++ b/modelscope/models/cv/hand_static/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:  # only static type checkers take this branch
+    from .hand_model import HandStatic
+
+else:
+    _import_structure = {'hand_model': ['HandStatic']}  # presumably submodule -> exported names
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(  # replaces this module's entry in sys.modules
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/hand_static/hand_model.py b/modelscope/models/cv/hand_static/hand_model.py
new file mode 100644
index 00000000..38517307
--- /dev/null
+++ b/modelscope/models/cv/hand_static/hand_model.py
@@ -0,0 +1,93 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import os
+import sys
+
+import cv2
+import numpy as np
+import torch
+import torch.nn.functional as F
+from PIL import Image
+from torch import nn
+from torchvision.transforms import transforms
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .networks import StaticGestureNet
+
+logger = get_logger()
+
+map_idx = {  # predicted class index -> label; NOTE(review): 16 labels, confirm against the network's output width
+    0: 'unrecog',
+    1: 'one',
+    2: 'two',
+    3: 'bixin',
+    4: 'yaogun',
+    5: 'zan',
+    6: 'fist',
+    7: 'ok',
+    8: 'tuoju',
+    9: 'd_bixin',
+    10: 'd_fist_left',
+    11: 'd_fist_right',
+    12: 'd_hand',
+    13: 'fashe',
+    14: 'five',
+    15: 'nohand'
+}
+
+img_size = [112, 112]  # target size handed to transforms.Resize below
+
+spatial_transform = transforms.Compose([  # resize -> tensor -> per-channel (x - 0.5) / 0.5
+    transforms.Resize(img_size),
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+])
+
+
+@MODELS.register_module(Tasks.hand_static, module_name=Models.hand_static)
+class HandStatic(TorchModel):
+    """Static hand-gesture classifier wrapping ``StaticGestureNet``."""
+
+    def __init__(self, model_dir, device_id=0, *args, **kwargs):
+        """Load weights from ``model_dir``; a negative ``device_id`` forces CPU."""
+        super().__init__(
+            model_dir=model_dir, device_id=device_id, *args, **kwargs)
+
+        self.model = StaticGestureNet()
+        # Honour a CPU request (device_id < 0) even when CUDA is present;
+        # previously the model still went to the GPU while logging "CPU".
+        use_gpu = torch.cuda.is_available() and device_id >= 0
+        self.device = 'cuda' if use_gpu else 'cpu'
+        self.params = torch.load(
+            '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
+            map_location=self.device)
+
+        self.model.load_state_dict(self.params)
+        self.model.to(self.device)
+        self.model.eval()
+        self.device_id = device_id if use_gpu else -1
+        if use_gpu:
+            self.model.to('cuda:{}'.format(self.device_id))
+            logger.info('Use GPU: {}'.format(self.device_id))
+        else:
+            logger.info('Use CPU for inference')
+
+    def forward(self, x):
+        """Return the raw network output for the batch ``x``."""
+        return self.model(x)
+
+
+def infer(img_path, model, device):
+    """Classify the image at ``img_path``; return its ``map_idx`` label."""
+    # Normalize uses 3-channel statistics, so grey/RGBA/palette files must
+    # be converted to RGB first or the transform raises.
+    img = Image.open(img_path).convert('RGB')
+    clip = spatial_transform(img).unsqueeze(0).to(device).float()
+    outputs = model(clip)
+    predicted = int(outputs.max(1)[1])
+    pred_result = map_idx.get(predicted)
+    logger.info('pred result: {}'.format(pred_result))
+    return pred_result
diff --git a/modelscope/models/cv/hand_static/networks.py b/modelscope/models/cv/hand_static/networks.py
new file mode 100644
index 00000000..6cf46f5d
--- /dev/null
+++ b/modelscope/models/cv/hand_static/networks.py
@@ -0,0 +1,358 @@
+""" HandStatic
+The implementation here is modified based on MobileFaceNet,
+originally Apache 2.0 License and publicly available at https://github.com/xuexingyu24/MobileFaceNet_Tutorial_Pytorch
+"""
+
+import os
+
+import torch
+import torch.nn as nn
+import torchvision
+import torchvision.models as models
+from torch.nn import (AdaptiveAvgPool2d, BatchNorm1d, BatchNorm2d, Conv2d,
+                      Dropout, Linear, MaxPool2d, Module, PReLU, ReLU,
+                      Sequential, Sigmoid)
+
+
+class StaticGestureNet(torch.nn.Module):
+    """MobileFaceNet features -> two-layer head -> 15 sigmoid scores."""
+
+    def __init__(self, train=True):
+        # ``train`` is accepted but never read.
+        super().__init__()
+        self.feature_extractor = MobileFaceNet(512)
+        hidden, classes = 128, 15
+        self.fc_layer = torch.nn.Sequential(
+            nn.Linear(512, hidden), nn.Softplus(), nn.Linear(hidden, classes))
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, inputs):
+        embedding = self.feature_extractor(inputs)
+        logits = self.fc_layer(embedding)
+        return self.sigmoid(logits)
+
+
+class Flatten(Module):
+    """Collapse every dimension after the first into one."""
+    def forward(self, input):
+        return input.view(input.size(0), -1)
+
+
+def l2_norm(input, axis=1):
+    """Divide ``input`` by its L2 norm along ``axis`` (no zero-norm guard)."""
+    length = torch.norm(input, p=2, dim=axis, keepdim=True)
+    return torch.div(input, length)
+
+
+class SEModule(Module):
+    """Rescale channels of the input by a learned, sigmoid-bounded gate."""
+
+    def __init__(self, channels, reduction):
+        super().__init__()
+        squeezed = channels // reduction
+        self.avg_pool = AdaptiveAvgPool2d(1)
+        self.fc1 = Conv2d(
+            channels,
+            squeezed,
+            kernel_size=1,
+            padding=0,
+            bias=False)
+        self.relu = ReLU(inplace=True)
+        self.fc2 = Conv2d(
+            squeezed,
+            channels,
+            kernel_size=1,
+            padding=0,
+            bias=False)
+        self.sigmoid = Sigmoid()
+
+    def forward(self, x):
+        """Return ``x`` multiplied by the gate computed from its pooled form."""
+        gate = self.avg_pool(x)
+        for stage in (self.fc1, self.relu, self.fc2, self.sigmoid):
+            gate = stage(gate)
+        return x * gate
+
+
+class BottleneckIR(Module):
+    """Residual unit: BN-conv3x3-PReLU-conv3x3(stride)-BN plus a shortcut."""
+
+    def __init__(self, in_channel, depth, stride):
+        super().__init__()
+        if in_channel != depth:
+            self.shortcut_layer = Sequential(
+                Conv2d(in_channel, depth, (1, 1), stride, bias=False),
+                BatchNorm2d(depth))
+        else:
+            self.shortcut_layer = MaxPool2d(1, stride)
+        self.res_layer = Sequential(
+            BatchNorm2d(in_channel),
+            Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False),
+            PReLU(depth), Conv2d(depth, depth, (3, 3), stride, 1, bias=False),
+            BatchNorm2d(depth))
+
+    def forward(self, x):
+        identity = self.shortcut_layer(x)
+        return self.res_layer(x) + identity
+
+
+class BottleneckIRSE(Module):
+    """Residual unit whose residual branch ends in ``SEModule(depth, 16)``."""
+
+    def __init__(self, in_channel, depth, stride):
+        super().__init__()
+        if in_channel != depth:
+            self.shortcut_layer = Sequential(
+                Conv2d(in_channel, depth, (1, 1), stride, bias=False),
+                BatchNorm2d(depth))
+        else:
+            self.shortcut_layer = MaxPool2d(1, stride)
+        self.res_layer = Sequential(
+            BatchNorm2d(in_channel),
+            Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False),
+            PReLU(depth), Conv2d(depth, depth, (3, 3), stride, 1, bias=False),
+            BatchNorm2d(depth), SEModule(depth, 16))
+
+    def forward(self, x):
+        identity = self.shortcut_layer(x)
+        return self.res_layer(x) + identity
+
+
+def get_block(in_channel, depth, num_units, stride=2):
+    """Describe one stage: a strided first unit, then stride-1 units."""
+    # ``Bottleneck`` was never defined in this module, so every call raised
+    # NameError; Backbone only reads .in_channel/.depth/.stride from it.
+    from collections import namedtuple
+    Bottleneck = namedtuple('Bottleneck', ['in_channel', 'depth', 'stride'])
+    rest = [Bottleneck(depth, depth, 1) for _ in range(num_units - 1)]
+    return [Bottleneck(in_channel, depth, stride)] + rest
+
+
+def get_blocks(num_layers):
+    """Return the four stage descriptions for a 50/100/152-layer backbone.
+
+    Raises ValueError for any other ``num_layers``."""
+    units_per_stage = {
+        50: (3, 4, 14, 3),
+        100: (3, 13, 30, 3),
+        152: (3, 8, 36, 3),
+    }
+    if num_layers not in units_per_stage:
+        # previously fell through to an UnboundLocalError on ``blocks``
+        raise ValueError('num_layers should be 50,100, or 152')
+    stage_channels = ((64, 64), (64, 128), (128, 256), (256, 512))
+    return [
+        get_block(in_channel=cin, depth=cout, num_units=units)
+        for (cin, cout), units in zip(stage_channels,
+                                      units_per_stage[num_layers])
+    ]
+
+
+class Backbone(Module):
+    """IR / IR-SE residual backbone returning L2-normalised 512-d features."""
+
+    def __init__(self, num_layers, drop_ratio, mode='ir'):
+        super().__init__()
+        assert num_layers in [50, 100,
+                              152], 'num_layers should be 50,100, or 152'
+        assert mode in ['ir', 'ir_se'], 'mode should be ir or ir_se'
+        stages = get_blocks(num_layers)
+        if mode == 'ir':
+            unit_module = BottleneckIR
+        elif mode == 'ir_se':
+            unit_module = BottleneckIRSE
+        # submodules are registered as: input_layer, output_layer, body
+        self.input_layer = Sequential(
+            Conv2d(3, 64, (3, 3), 1, 1, bias=False), BatchNorm2d(64),
+            PReLU(64))
+        self.output_layer = Sequential(
+            BatchNorm2d(512), Dropout(drop_ratio), Flatten(),
+            Linear(512 * 7 * 7, 512), BatchNorm1d(512))
+        units = [
+            unit_module(spec.in_channel, spec.depth, spec.stride)
+            for stage in stages for spec in stage
+        ]
+        self.body = Sequential(*units)
+
+    def forward(self, x):
+        """Run input layer, body and output layer, then L2-normalise."""
+        for part in (self.input_layer, self.body, self.output_layer):
+            x = part(x)
+        return l2_norm(x)
+
+
+class ConvBlock(Module):
+    """Bias-free Conv2d followed by BatchNorm2d and a per-channel PReLU."""
+
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 kernel=(1, 1),
+                 stride=(1, 1),
+                 padding=(0, 0),
+                 groups=1):
+        super().__init__()
+        conv_kwargs = dict(
+            kernel_size=kernel,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            bias=False)
+        self.conv = Conv2d(in_c, out_c, **conv_kwargs)
+        self.bn = BatchNorm2d(out_c)
+        self.prelu = PReLU(out_c)
+
+    def forward(self, x):
+        """Return ``prelu(bn(conv(x)))``."""
+        y = self.conv(x)
+        y = self.bn(y)
+        return self.prelu(y)
+
+
+class LinearBlock(Module):
+    """Bias-free Conv2d followed by BatchNorm2d, with no activation."""
+
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 kernel=(1, 1),
+                 stride=(1, 1),
+                 padding=(0, 0),
+                 groups=1):
+        super().__init__()
+        conv_kwargs = dict(
+            kernel_size=kernel,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            bias=False)
+        self.conv = Conv2d(in_c, out_c, **conv_kwargs)
+        self.bn = BatchNorm2d(out_c)
+
+    def forward(self, x):
+        """Return ``bn(conv(x))``."""
+        y = self.conv(x)
+        return self.bn(y)
+
+
+class DepthWise(Module):
+    """1x1 ConvBlock -> depthwise ConvBlock -> 1x1 LinearBlock, optional skip."""
+
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 residual=False,
+                 kernel=(3, 3),
+                 stride=(2, 2),
+                 padding=(1, 1),
+                 groups=1):
+        super().__init__()
+        # ``groups`` is both the channel width of the middle stage and its
+        # group count, which makes ``conv_dw`` a depthwise convolution.
+        pointwise = dict(kernel=(1, 1), padding=(0, 0), stride=(1, 1))
+        self.conv = ConvBlock(in_c, out_c=groups, **pointwise)
+        self.conv_dw = ConvBlock(
+            groups,
+            groups,
+            groups=groups,
+            kernel=kernel,
+            padding=padding,
+            stride=stride)
+        self.project = LinearBlock(groups, out_c, **pointwise)
+        self.residual = residual
+
+    def forward(self, x):
+        """Return the projected features, plus ``x`` when ``residual`` is set."""
+        branch = self.conv(x)
+        branch = self.conv_dw(branch)
+        branch = self.project(branch)
+        # the skip path adds the untouched input to the projected branch
+        if self.residual:
+            return x + branch
+        return branch
+
+
+class Residual(Module):
+    """A chain of ``num_block`` residual ``DepthWise`` units of width ``c``."""
+
+    def __init__(self,
+                 c,
+                 num_block,
+                 groups,
+                 kernel=(3, 3),
+                 stride=(1, 1),
+                 padding=(1, 1)):
+        super().__init__()
+        # every unit maps c channels to c channels and is built with
+        # residual=True
+        unit_kwargs = dict(
+            residual=True,
+            kernel=kernel,
+            padding=padding,
+            stride=stride,
+            groups=groups)
+        self.model = Sequential(
+            *[DepthWise(c, c, **unit_kwargs) for _ in range(num_block)])
+
+    def forward(self, x):
+        """Pass ``x`` through all units in order."""
+        return self.model(x)
+
+
+class MobileFaceNet(Module):
+    """MobileFaceNet trunk mapping an image to an L2-normalised embedding."""
+
+    def __init__(self, embedding_size):
+        super().__init__()
+        self.conv1 = ConvBlock(
+            3, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1))
+        self.conv2_dw = ConvBlock(
+            64, 64, kernel=(3, 3), stride=(1, 1), padding=(1, 1), groups=64)
+        self.conv_23 = DepthWise(
+            64, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=128)
+        self.conv_3 = Residual(
+            64,
+            num_block=4,
+            groups=128,
+            kernel=(3, 3),
+            stride=(1, 1),
+            padding=(1, 1))
+        self.conv_34 = DepthWise(
+            64, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=256)
+        self.conv_4 = Residual(
+            128,
+            num_block=6,
+            groups=256,
+            kernel=(3, 3),
+            stride=(1, 1),
+            padding=(1, 1))
+        self.conv_45 = DepthWise(
+            128, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=512)
+        self.conv_5 = Residual(
+            128,
+            num_block=2,
+            groups=256,
+            kernel=(3, 3),
+            stride=(1, 1),
+            padding=(1, 1))
+        self.conv_6_sep = ConvBlock(
+            128, 512, kernel=(1, 1), stride=(1, 1), padding=(0, 0))
+        self.conv_6_dw = LinearBlock(
+            512, 512, groups=512, kernel=(7, 7), stride=(1, 1), padding=(0, 0))
+        self.conv_6_flatten = Flatten()
+        self.linear = Linear(512, embedding_size, bias=False)
+        self.bn = BatchNorm1d(embedding_size)
+
+    def forward(self, x):
+        """Run all stages in definition order and L2-normalise the result.
+
+        ``self.bn`` is constructed above but is not applied here.
+        """
+        stages = (self.conv1, self.conv2_dw, self.conv_23, self.conv_3,
+                  self.conv_34, self.conv_4, self.conv_45, self.conv_5,
+                  self.conv_6_sep, self.conv_6_dw, self.conv_6_flatten,
+                  self.linear)
+        out = x
+        for stage in stages:
+            out = stage(out)
+        return l2_norm(out)
diff --git a/modelscope/models/cv/image_instance_segmentation/backbones/swin_transformer.py b/modelscope/models/cv/image_instance_segmentation/backbones/swin_transformer.py
index 3e7609e1..2007688d 100644
--- a/modelscope/models/cv/image_instance_segmentation/backbones/swin_transformer.py
+++ b/modelscope/models/cv/image_instance_segmentation/backbones/swin_transformer.py
@@ -1,5 +1,5 @@
-# Modified from: https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
-
+# The implementation is adopted from Swin Transformer, made publicly available under the MIT License at
+# https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
 import numpy as np
 import torch
 import torch.nn as nn
diff --git a/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py b/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py
index 30e70f82..ff83271e 100644
--- a/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py
+++ b/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py
@@ -1,3 +1,5 @@
+# Part of the implementation is borrowed and modified from MMDetection, publicly available at
+# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py
 import os
 from collections import OrderedDict
 
diff --git a/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py b/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py
index cca1432f..1b096fb3 100644
--- a/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py
+++ b/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py
@@ -1 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from .transforms import build_preprocess_transform
diff --git a/modelscope/models/cv/image_instance_segmentation/datasets/transforms.py b/modelscope/models/cv/image_instance_segmentation/datasets/transforms.py
index c2c11286..f0dde759 100644
--- a/modelscope/models/cv/image_instance_segmentation/datasets/transforms.py
+++ b/modelscope/models/cv/image_instance_segmentation/datasets/transforms.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 
 import numpy as np
@@ -51,9 +52,9 @@ class LoadImageFromFile:
     """Load an image from file.
 
     Required keys are "img_prefix" and "img_info" (a dict that must contain the
-    key "filename"). Added or updated keys are "filename", "img", "img_shape",
-    "ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`),
-    "scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1).
+    key "filename", "ann_file", and "classes"). Added or updated keys are
+    "filename", "ori_filename", "img", "img_shape", "ori_shape" (same as `img_shape`),
+    "img_fields", "ann_file" (path to annotation file) and "classes".
 
     Args:
         to_float32 (bool): Whether to convert the loaded image to a float32
@@ -73,7 +74,7 @@ class LoadImageFromFile:
         """Call functions to load image and get image meta information.
 
         Args:
-            results (dict): Result dict from :obj:`ImageInstanceSegmentationDataset`.
+            results (dict): Result dict from :obj:`ImageInstanceSegmentationCocoDataset`.
 
         Returns:
             dict: The dict contains loaded image and meta information.
diff --git a/modelscope/models/cv/image_instance_segmentation/model.py b/modelscope/models/cv/image_instance_segmentation/model.py
index 2be59623..a56a1608 100644
--- a/modelscope/models/cv/image_instance_segmentation/model.py
+++ b/modelscope/models/cv/image_instance_segmentation/model.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os
 from typing import Any, Dict
 
diff --git a/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py b/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py
index 43e52292..6058cd73 100644
--- a/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py
+++ b/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py
@@ -1,3 +1,5 @@
+# Part of the implementation is borrowed and modified from MMDetection, publicly available at
+# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/visualization/image.py
 import itertools
 
 import cv2
@@ -105,12 +107,12 @@ def get_img_ins_seg_result(img_seg_result=None,
     }
     for seg_result in img_seg_result:
 
-        box = {
-            'x': np.int(seg_result[0]),
-            'y': np.int(seg_result[1]),
-            'w': np.int(seg_result[2] - seg_result[0]),
-            'h': np.int(seg_result[3] - seg_result[1])
-        }
+        box = [
+            np.int(seg_result[0]),
+            np.int(seg_result[1]),
+            np.int(seg_result[2]),
+            np.int(seg_result[3])
+        ]
         score = np.float(seg_result[4])
         category = seg_result[5]
 
@@ -161,12 +163,10 @@ def show_result(
             np.random.random() * 255.0
         ])
 
-        x1 = int(box['x'])
-        y1 = int(box['y'])
-        w = int(box['w'])
-        h = int(box['h'])
-        x2 = x1 + w
-        y2 = y1 + h
+        x1 = int(box[0])
+        y1 = int(box[1])
+        x2 = int(box[2])
+        y2 = int(box[3])
 
         if show_box:
             cv2.rectangle(
diff --git a/modelscope/models/cv/image_panoptic_segmentation/__init__.py b/modelscope/models/cv/image_panoptic_segmentation/__init__.py
new file mode 100644
index 00000000..2b2be4b7
--- /dev/null
+++ b/modelscope/models/cv/image_panoptic_segmentation/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:  # real import, only seen by static type checkers
+    from .panseg_model import SwinLPanopticSegmentation
+
+else:
+    _import_structure = {  # submodule name -> names it provides
+        'panseg_model': ['SwinLPanopticSegmentation'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(  # replace this module entry
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py b/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py
new file mode 100644
index 00000000..f44c01e8
--- /dev/null
+++ b/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py
@@ -0,0 +1,53 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+
+
+@MODELS.register_module(
+    Tasks.image_segmentation, module_name=Models.panoptic_segmentation)
+class SwinLPanopticSegmentation(TorchModel):
+
+    def __init__(self, model_dir: str, **kwargs):
+        """Build the detector from model_dir/config.py and load its weights."""
+        super().__init__(model_dir, **kwargs)
+
+        from mmcv.runner import load_checkpoint
+        import mmcv
+        from mmdet.models import build_detector
+
+        config = osp.join(model_dir, 'config.py')
+
+        cfg = mmcv.Config.fromfile(config)
+        if 'pretrained' in cfg.model:
+            cfg.model.pretrained = None
+        elif 'init_cfg' in cfg.model.backbone:
+            cfg.model.backbone.init_cfg = None
+
+        # build the detector with the training config cleared
+        cfg.model.train_cfg = None
+        self.model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+
+        # load the checkpoint weights onto CPU
+        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+        checkpoint = load_checkpoint(
+            self.model, model_path, map_location='cpu')
+
+        self.CLASSES = checkpoint['meta']['CLASSES']  # from checkpoint meta
+        self.num_classes = len(self.CLASSES)
+        self.cfg = cfg
+
+    def inference(self, data):
+        """No-grad inference; data is a dict (img, img_metas) as in mmdet."""
+
+        with torch.no_grad():
+            results = self.model(return_loss=False, rescale=True, **data)
+        return results
+
+    def forward(self, Inputs):  # Inputs is unpacked as keyword arguments
+        return self.model(**Inputs)
diff --git a/modelscope/models/cv/image_reid_person/pass_model.py b/modelscope/models/cv/image_reid_person/pass_model.py
index 2222fedb..3b032949 100644
--- a/modelscope/models/cv/image_reid_person/pass_model.py
+++ b/modelscope/models/cv/image_reid_person/pass_model.py
@@ -1,4 +1,4 @@
-# The implementation is also open-sourced by the authors as PASS-reID, and is available publicly on
+# The implementation is adopted from PASS-reID, made publicly available under the Apache-2.0 License at
 # https://github.com/CASIA-IVA-Lab/PASS-reID
 
 import os
diff --git a/modelscope/models/cv/image_reid_person/transreid_model.py b/modelscope/models/cv/image_reid_person/transreid_model.py
index 275c4e22..5bceb468 100644
--- a/modelscope/models/cv/image_reid_person/transreid_model.py
+++ b/modelscope/models/cv/image_reid_person/transreid_model.py
@@ -1,4 +1,4 @@
-# The implementation is also open-sourced by the authors as PASS-reID, and is available publicly on
+# The implementation is adopted from PASS-reID, made publicly available under the Apache-2.0 License at
 # https://github.com/CASIA-IVA-Lab/PASS-reID
 
 import collections.abc as container_abcs
diff --git a/modelscope/models/cv/image_semantic_segmentation/__init__.py b/modelscope/models/cv/image_semantic_segmentation/__init__.py
new file mode 100644
index 00000000..df56c5b8
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/__init__.py
@@ -0,0 +1,24 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:  # real imports, only seen by static type checkers
+    from .semantic_seg_model import SemanticSegmentation
+    from .segformer import Segformer
+
+else:
+    _import_structure = {  # submodule name -> names it provides
+        'semantic_seg_model': ['SemanticSegmentation'],
+        'segformer': ['Segformer']
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(  # replace this module entry
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py b/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py
new file mode 100644
index 00000000..6a31a308
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .maskformer_semantic_head import MaskFormerSemanticHead
diff --git a/modelscope/models/cv/image_semantic_segmentation/pan_merge/base_panoptic_fusion_head.py b/modelscope/models/cv/image_semantic_segmentation/pan_merge/base_panoptic_fusion_head.py
new file mode 100644
index 00000000..05e68d89
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/pan_merge/base_panoptic_fusion_head.py
@@ -0,0 +1,47 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+from mmcv.runner import BaseModule
+from mmdet.models.builder import build_loss
+
+
+class BasePanopticFusionHead(BaseModule, metaclass=ABCMeta):
+    """Abstract base class for panoptic heads with an optional loss."""
+
+    def __init__(self,
+                 num_things_classes=80,
+                 num_stuff_classes=53,
+                 test_cfg=None,
+                 loss_panoptic=None,
+                 init_cfg=None,
+                 **kwargs):  # extra kwargs are accepted but not used here
+        super(BasePanopticFusionHead, self).__init__(init_cfg)
+        self.num_things_classes = num_things_classes
+        self.num_stuff_classes = num_stuff_classes
+        self.num_classes = num_things_classes + num_stuff_classes
+        self.test_cfg = test_cfg
+
+        if loss_panoptic:  # build the loss only when a config is given
+            self.loss_panoptic = build_loss(loss_panoptic)
+        else:
+            self.loss_panoptic = None
+
+    @property
+    def with_loss(self):
+        """bool: whether the panoptic head contains a loss function."""
+        return self.loss_panoptic is not None
+
+    @abstractmethod
+    def forward_train(self, gt_masks=None, gt_semantic_seg=None, **kwargs):
+        """Forward function during training; subclasses must implement it."""
+
+    @abstractmethod
+    def simple_test(self,
+                    img_metas,
+                    det_labels,
+                    mask_preds,
+                    seg_preds,
+                    det_bboxes,
+                    cfg=None,
+                    **kwargs):
+        """Test without augmentation; subclasses must implement it."""
diff --git a/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py b/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py
new file mode 100644
index 00000000..2f3364d0
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py
@@ -0,0 +1,58 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import torch
+import torch.nn.functional as F
+from mmdet.models.builder import HEADS
+
+from .base_panoptic_fusion_head import BasePanopticFusionHead
+
+
+@HEADS.register_module()
+class MaskFormerSemanticHead(BasePanopticFusionHead):
+
+    def __init__(self,
+                 num_things_classes=80,
+                 num_stuff_classes=53,
+                 test_cfg=None,
+                 loss_panoptic=None,
+                 init_cfg=None,
+                 **kwargs):
+        super().__init__(num_things_classes, num_stuff_classes, test_cfg,
+                         loss_panoptic, init_cfg, **kwargs)
+
+    def forward_train(self, **kwargs):
+        """No training loss for this head; always returns an empty dict."""
+        return dict()
+
+    def simple_test(self,
+                    mask_cls_results,
+                    mask_pred_results,
+                    img_metas,
+                    rescale=False,
+                    **kwargs):
+        results = []  # one numpy label map per image
+        for mask_cls_result, mask_pred_result, meta in zip(
+                mask_cls_results, mask_pred_results, img_metas):
+            # crop the mask predictions to img_shape to remove padding
+            img_height, img_width = meta['img_shape'][:2]
+            mask_pred_result = mask_pred_result[:, :img_height, :img_width]
+
+            if rescale:
+                # bilinearly resize the masks to the original resolution
+                ori_height, ori_width = meta['ori_shape'][:2]
+                mask_pred_result = F.interpolate(
+                    mask_pred_result[:, None],
+                    size=(ori_height, ori_width),
+                    mode='bilinear',
+                    align_corners=False)[:, 0]
+
+            # semantic inference: drop the last class score, weight the masks
+            cls_score = F.softmax(mask_cls_result, dim=-1)[..., :-1]
+            mask_pred = mask_pred_result.sigmoid()
+            seg_mask = torch.einsum('qc,qhw->chw', cls_score, mask_pred)
+            # softmax over classes, then per-pixel argmax gives the class id
+            seg_logit = F.softmax(seg_mask, dim=0)
+            seg_pred = seg_logit.argmax(dim=0)
+            seg_pred = seg_pred.cpu().numpy()
+            results.append(seg_pred)
+
+        return results
diff --git a/modelscope/models/cv/image_semantic_segmentation/segformer.py b/modelscope/models/cv/image_semantic_segmentation/segformer.py
new file mode 100644
index 00000000..46303526
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/segformer.py
@@ -0,0 +1,16 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.models.segmentation import EncoderDecoder
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.easycv_base import EasyCVBaseModel
+from modelscope.utils.constant import Tasks
+
+
+@MODELS.register_module(
+    group_key=Tasks.image_segmentation, module_name=Models.segformer)
+class Segformer(EasyCVBaseModel, EncoderDecoder):
+
+    def __init__(self, model_dir=None, *args, **kwargs):  # init both bases
+        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)  # not unpacked
+        EncoderDecoder.__init__(self, *args, **kwargs)
diff --git a/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py b/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py
new file mode 100644
index 00000000..2b38ebad
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py
@@ -0,0 +1,77 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+
+import numpy as np
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.image_semantic_segmentation import (pan_merge,
+                                                              vit_adapter)
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import ModelFile, Tasks
+
+
+@MODELS.register_module(
+    Tasks.image_segmentation, module_name=Models.swinL_semantic_segmentation)
+@MODELS.register_module(
+    Tasks.image_segmentation,
+    module_name=Models.vitadapter_semantic_segmentation)
+class SemanticSegmentation(TorchModel):
+
+    def __init__(self, model_dir: str, **kwargs):
+        """Build the detector from mmcv_config.py and load its checkpoint."""
+        super().__init__(model_dir, **kwargs)
+
+        from mmcv.runner import load_checkpoint
+        import mmcv
+        from mmdet.models import build_detector
+
+        config = osp.join(model_dir, 'mmcv_config.py')
+        cfg = mmcv.Config.fromfile(config)
+        if 'pretrained' in cfg.model:
+            cfg.model.pretrained = None
+        elif 'init_cfg' in cfg.model.backbone:
+            cfg.model.backbone.init_cfg = None
+
+        # build the detector with the training config cleared
+        cfg.model.train_cfg = None
+        self.model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+
+        # load the checkpoint weights onto CPU
+        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+        _ = load_checkpoint(self.model, model_path, map_location='cpu')
+
+        self.CLASSES = cfg['CLASSES']  # list
+        self.PALETTE = cfg['PALETTE']  # list
+
+        self.num_classes = len(self.CLASSES)
+        self.cfg = cfg
+
+    def forward(self, Inputs):  # Inputs is unpacked as keyword arguments
+        return self.model(**Inputs)
+
+    def postprocess(self, Inputs):  # decodes Inputs[0] into masks/labels
+        semantic_result = Inputs[0]
+
+        ids = np.unique(semantic_result)[::-1]  # distinct ids, descending
+        legal_indices = ids != self.model.num_classes  # for VOID label
+        ids = ids[legal_indices]
+
+        segms = (semantic_result[None] == ids[:, None, None])
+        masks = [it.astype(int) for it in segms]  # np.int no longer exists
+        labels_txt = np.array(self.CLASSES)[ids].tolist()
+
+        results = {
+            OutputKeys.MASKS: masks,
+            OutputKeys.LABELS: labels_txt,
+            OutputKeys.SCORES: [0.999 for _ in range(len(labels_txt))]
+        }
+        return results
+
+    def inference(self, data):  # no-grad forward with rescale=True
+        with torch.no_grad():
+            results = self.model(return_loss=False, rescale=True, **data)
+
+        return results
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py
new file mode 100644
index 00000000..3b9a301c
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py
@@ -0,0 +1,5 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
+from .models import backbone, decode_heads, segmentors
+from .utils import (ResizeToMultiple, add_prefix, build_pixel_sampler,
+                    seg_resize)
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py
new file mode 100644
index 00000000..791dd26f
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py
@@ -0,0 +1,5 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
+from .backbone import BASEBEiT, BEiTAdapter
+from .decode_heads import Mask2FormerHeadFromMMSeg
+from .segmentors import EncoderDecoderMask2Former
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py
new file mode 100644
index 00000000..7abd0ef1
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py
@@ -0,0 +1,6 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
+from .base import BASEBEiT
+from .beit_adapter import BEiTAdapter
+
+__all__ = ['BEiTAdapter', 'BASEBEiT']
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py
new file mode 100644
index 00000000..cf30cca0
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py
@@ -0,0 +1,522 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
+
+import logging
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmdet.models.utils.transformer import MultiScaleDeformableAttention
+from timm.models.layers import DropPath
+
+_logger = logging.getLogger(__name__)
+
+
+def get_reference_points(spatial_shapes, device):  # normalized cell centers
+    reference_points_list = []
+    for lvl, (H_, W_) in enumerate(spatial_shapes):
+        ref_y, ref_x = torch.meshgrid(
+            torch.linspace(
+                0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
+            torch.linspace(
+                0.5, W_ - 0.5, W_, dtype=torch.float32, device=device))
+        ref_y = ref_y.reshape(-1)[None] / H_  # flatten, scale by height
+        ref_x = ref_x.reshape(-1)[None] / W_  # flatten, scale by width
+        ref = torch.stack((ref_x, ref_y), -1)  # last dim is (x, y)
+        reference_points_list.append(ref)
+    reference_points = torch.cat(reference_points_list, 1)  # join levels
+    reference_points = reference_points[:, :, None]  # add a singleton axis
+    return reference_points
+
+
+def deform_inputs(x):
+    bs, c, h, w = x.shape  # only h and w are used below
+    spatial_shapes = torch.as_tensor([(h // 8, w // 8), (h // 16, w // 16),
+                                      (h // 32, w // 32)],
+                                     dtype=torch.long,
+                                     device=x.device)  # 3 feature levels
+    level_start_index = torch.cat((spatial_shapes.new_zeros(
+        (1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))  # per-level offsets
+    reference_points = get_reference_points([(h // 16, w // 16)], x.device)
+    deform_inputs1 = [reference_points, spatial_shapes, level_start_index]
+
+    spatial_shapes = torch.as_tensor([(h // 16, w // 16)],
+                                     dtype=torch.long,
+                                     device=x.device)  # 1 feature level
+    level_start_index = torch.cat((spatial_shapes.new_zeros(
+        (1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))  # single level -> [0]
+    reference_points = get_reference_points([(h // 8, w // 8),
+                                             (h // 16, w // 16),
+                                             (h // 32, w // 32)], x.device)
+    deform_inputs2 = [reference_points, spatial_shapes, level_start_index]
+
+    return deform_inputs1, deform_inputs2  # each: [refs, shapes, starts]
+
+
+class ConvFFN(nn.Module):  # fc1 -> dwconv -> act -> drop -> fc2 -> drop
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features  # defaults to in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.dwconv = DWConv(hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)  # same module applied twice in forward
+
+    def forward(self, x, H, W):  # H and W are only forwarded to dwconv
+        x = self.fc1(x)
+        x = self.dwconv(x, H, W)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+class DWConv(nn.Module):  # one depthwise 3x3 conv applied to 3 token groups
+
+    def __init__(self, dim=768):
+        super().__init__()
+        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
+
+    def forward(self, x, H, W):
+        B, N, C = x.shape
+        n = N // 21  # tokens are split 16:4:1 across the three groups
+        x1 = x[:, 0:16 * n, :].transpose(1, 2).view(B, C, H * 2,
+                                                    W * 2).contiguous()
+        x2 = x[:, 16 * n:20 * n, :].transpose(1, 2).view(B, C, H,
+                                                         W).contiguous()
+        x3 = x[:, 20 * n:, :].transpose(1, 2).view(B, C, H // 2,
+                                                   W // 2).contiguous()
+        x1 = self.dwconv(x1).flatten(2).transpose(1, 2)  # back to tokens
+        x2 = self.dwconv(x2).flatten(2).transpose(1, 2)
+        x3 = self.dwconv(x3).flatten(2).transpose(1, 2)
+        x = torch.cat([x1, x2, x3], dim=1)  # rejoin in the original order
+        return x
+
+
+class Extractor(nn.Module):  # query attends to feat, then optional ConvFFN
+
+    def __init__(self,
+                 dim,
+                 num_heads=6,
+                 n_points=4,
+                 n_levels=1,
+                 deform_ratio=1.0,
+                 with_cffn=True,
+                 cffn_ratio=0.25,
+                 drop=0.,
+                 drop_path=0.,
+                 norm_layer=partial(nn.LayerNorm, eps=1e-6),
+                 with_cp=False):
+        super().__init__()
+        self.query_norm = norm_layer(dim)
+        self.feat_norm = norm_layer(dim)
+        self.attn = MultiScaleDeformableAttention(
+            embed_dims=dim,
+            num_heads=num_heads,
+            num_levels=n_levels,
+            num_points=n_points,
+            batch_first=True)
+
+        # rebuild value/output projections sized by deform_ratio
+        value_proj_in_features = self.attn.value_proj.weight.shape[0]
+        value_proj_out_features = int(value_proj_in_features * deform_ratio)
+        self.attn.value_proj = nn.Linear(value_proj_in_features,
+                                         value_proj_out_features)
+        self.attn.output_proj = nn.Linear(value_proj_out_features,
+                                          value_proj_in_features)
+
+        self.with_cffn = with_cffn
+        self.with_cp = with_cp
+        if with_cffn:  # ffn, ffn_norm and drop_path exist only in this case
+            self.ffn = ConvFFN(
+                in_features=dim,
+                hidden_features=int(dim * cffn_ratio),
+                drop=drop)
+            self.ffn_norm = norm_layer(dim)
+            self.drop_path = DropPath(
+                drop_path) if drop_path > 0. else nn.Identity()
+
+    def forward(self, query, reference_points, feat, spatial_shapes,
+                level_start_index, H, W):
+
+        def _inner_forward(query, feat):
+            attn = self.attn(
+                query=self.query_norm(query),
+                key=None,
+                value=self.feat_norm(feat),
+                identity=None,
+                query_pos=None,
+                key_padding_mask=None,
+                reference_points=reference_points,
+                spatial_shapes=spatial_shapes,
+                level_start_index=level_start_index)
+
+            query = query + attn  # add attention output back onto query
+
+            if self.with_cffn:
+                query = query + self.drop_path(
+                    self.ffn(self.ffn_norm(query), H, W))
+            return query
+
+        if self.with_cp and query.requires_grad:  # gradient checkpointing
+            query = cp.checkpoint(_inner_forward, query, feat)
+        else:
+            query = _inner_forward(query, feat)
+
+        return query
+
+
+class Injector(nn.Module):  # query attends to feat; output scaled by gamma
+
+    def __init__(self,
+                 dim,
+                 num_heads=6,
+                 n_points=4,
+                 n_levels=1,
+                 deform_ratio=1.0,
+                 norm_layer=partial(nn.LayerNorm, eps=1e-6),
+                 init_values=0.,
+                 with_cp=False):
+        super().__init__()
+        self.with_cp = with_cp
+        self.query_norm = norm_layer(dim)
+        self.feat_norm = norm_layer(dim)
+        self.attn = MultiScaleDeformableAttention(
+            embed_dims=dim,
+            num_heads=num_heads,
+            num_levels=n_levels,
+            num_points=n_points,
+            batch_first=True)
+
+        # rebuild value/output projections sized by deform_ratio
+        value_proj_in_features = self.attn.value_proj.weight.shape[0]
+        value_proj_out_features = int(value_proj_in_features * deform_ratio)
+        self.attn.value_proj = nn.Linear(value_proj_in_features,
+                                         value_proj_out_features)
+        self.attn.output_proj = nn.Linear(value_proj_out_features,
+                                          value_proj_in_features)
+
+        self.gamma = nn.Parameter(  # learnable (dim,) scale on attn
+            init_values * torch.ones((dim)), requires_grad=True)
+
+    def forward(self, query, reference_points, feat, spatial_shapes,
+                level_start_index):
+
+        def _inner_forward(query, feat):
+            input_query = self.query_norm(query)
+            input_value = self.feat_norm(feat)
+            attn = self.attn(
+                query=input_query,
+                key=None,
+                value=input_value,
+                identity=None,
+                query_pos=None,
+                key_padding_mask=None,
+                reference_points=reference_points,
+                spatial_shapes=spatial_shapes,
+                level_start_index=level_start_index)
+            return query + self.gamma * attn  # gamma-scaled add onto query
+
+        if self.with_cp and query.requires_grad:  # gradient checkpointing
+            query = cp.checkpoint(_inner_forward, query, feat)
+        else:
+            query = _inner_forward(query, feat)
+
+        return query
+
+
+class InteractionBlock(nn.Module):  # inject -> blocks -> extract
+
+    def __init__(self,
+                 dim,
+                 num_heads=6,
+                 n_points=4,
+                 norm_layer=partial(nn.LayerNorm, eps=1e-6),
+                 drop=0.,
+                 drop_path=0.,
+                 with_cffn=True,
+                 cffn_ratio=0.25,
+                 init_values=0.,
+                 deform_ratio=1.0,
+                 extra_extractor=False,
+                 with_cp=False):
+        super().__init__()
+
+        self.injector = Injector(  # x queries c (3 levels)
+            dim=dim,
+            n_levels=3,
+            num_heads=num_heads,
+            init_values=init_values,
+            n_points=n_points,
+            norm_layer=norm_layer,
+            deform_ratio=deform_ratio,
+            with_cp=with_cp)
+        self.extractor = Extractor(  # c queries x (1 level)
+            dim=dim,
+            n_levels=1,
+            num_heads=num_heads,
+            n_points=n_points,
+            norm_layer=norm_layer,
+            deform_ratio=deform_ratio,
+            with_cffn=with_cffn,
+            cffn_ratio=cffn_ratio,
+            drop=drop,
+            drop_path=drop_path,
+            with_cp=with_cp)
+        if extra_extractor:  # two more Extractors run after the first
+            self.extra_extractors = nn.Sequential(*[
+                Extractor(
+                    dim=dim,
+                    num_heads=num_heads,
+                    n_points=n_points,
+                    norm_layer=norm_layer,
+                    with_cffn=with_cffn,
+                    cffn_ratio=cffn_ratio,
+                    deform_ratio=deform_ratio,
+                    drop=drop,
+                    drop_path=drop_path,
+                    with_cp=with_cp) for _ in range(2)
+            ])
+        else:
+            self.extra_extractors = None
+
+    def forward(self, x, c, blocks, deform_inputs1, deform_inputs2, H, W):
+        x = self.injector(
+            query=x,
+            reference_points=deform_inputs1[0],
+            feat=c,
+            spatial_shapes=deform_inputs1[1],
+            level_start_index=deform_inputs1[2])
+        for idx, blk in enumerate(blocks):  # blocks come from the caller
+            x = blk(x, H, W)
+        c = self.extractor(
+            query=c,
+            reference_points=deform_inputs2[0],
+            feat=x,
+            spatial_shapes=deform_inputs2[1],
+            level_start_index=deform_inputs2[2],
+            H=H,
+            W=W)
+        if self.extra_extractors is not None:
+            for extractor in self.extra_extractors:
+                c = extractor(
+                    query=c,
+                    reference_points=deform_inputs2[0],
+                    feat=x,
+                    spatial_shapes=deform_inputs2[1],
+                    level_start_index=deform_inputs2[2],
+                    H=H,
+                    W=W)
+        return x, c
+
+
+class InteractionBlockWithCls(nn.Module):  # inject/blocks/extract + cls
+
+    def __init__(self,
+                 dim,
+                 num_heads=6,
+                 n_points=4,
+                 norm_layer=partial(nn.LayerNorm, eps=1e-6),
+                 drop=0.,
+                 drop_path=0.,
+                 with_cffn=True,
+                 cffn_ratio=0.25,
+                 init_values=0.,
+                 deform_ratio=1.0,
+                 extra_extractor=False,
+                 with_cp=False):
+        super().__init__()
+
+        self.injector = Injector(  # x queries c (3 levels)
+            dim=dim,
+            n_levels=3,
+            num_heads=num_heads,
+            init_values=init_values,
+            n_points=n_points,
+            norm_layer=norm_layer,
+            deform_ratio=deform_ratio,
+            with_cp=with_cp)
+        self.extractor = Extractor(  # c queries x (1 level)
+            dim=dim,
+            n_levels=1,
+            num_heads=num_heads,
+            n_points=n_points,
+            norm_layer=norm_layer,
+            deform_ratio=deform_ratio,
+            with_cffn=with_cffn,
+            cffn_ratio=cffn_ratio,
+            drop=drop,
+            drop_path=drop_path,
+            with_cp=with_cp)
+        if extra_extractor:  # two more Extractors run after the first
+            self.extra_extractors = nn.Sequential(*[
+                Extractor(
+                    dim=dim,
+                    num_heads=num_heads,
+                    n_points=n_points,
+                    norm_layer=norm_layer,
+                    with_cffn=with_cffn,
+                    cffn_ratio=cffn_ratio,
+                    deform_ratio=deform_ratio,
+                    drop=drop,
+                    drop_path=drop_path,
+                    with_cp=with_cp) for _ in range(2)
+            ])
+        else:
+            self.extra_extractors = None
+
+    def forward(self, x, c, cls, blocks, deform_inputs1, deform_inputs2, H, W):
+        x = self.injector(
+            query=x,
+            reference_points=deform_inputs1[0],
+            feat=c,
+            spatial_shapes=deform_inputs1[1],
+            level_start_index=deform_inputs1[2])
+        x = torch.cat((cls, x), dim=1)  # prepend cls along dim 1
+        for idx, blk in enumerate(blocks):  # blocks come from the caller
+            x = blk(x, H, W)
+        cls, x = x[:, :1, ], x[:, 1:, ]  # split first token back off
+        c = self.extractor(
+            query=c,
+            reference_points=deform_inputs2[0],
+            feat=x,
+            spatial_shapes=deform_inputs2[1],
+            level_start_index=deform_inputs2[2],
+            H=H,
+            W=W)
+        if self.extra_extractors is not None:
+            for extractor in self.extra_extractors:
+                c = extractor(
+                    query=c,
+                    reference_points=deform_inputs2[0],
+                    feat=x,
+                    spatial_shapes=deform_inputs2[1],
+                    level_start_index=deform_inputs2[2],
+                    H=H,
+                    W=W)
+        return x, c, cls
+
+
+class SpatialPriorModule(nn.Module):  # conv stem + 3 stride-2 stages
+
+    def __init__(self, inplanes=64, embed_dim=384, with_cp=False):
+        super().__init__()
+        self.with_cp = with_cp
+
+        self.stem = nn.Sequential(*[  # 3 conv/BN/ReLU groups + max-pool
+            nn.Conv2d(
+                3, inplanes, kernel_size=3, stride=2, padding=1, bias=False),
+            nn.BatchNorm2d(inplanes),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(
+                inplanes,
+                inplanes,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=False),
+            nn.BatchNorm2d(inplanes),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(
+                inplanes,
+                inplanes,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=False),
+            nn.BatchNorm2d(inplanes),
+            nn.ReLU(inplace=True),
+            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        ])
+        self.conv2 = nn.Sequential(*[  # stride 2, channels x2
+            nn.Conv2d(
+                inplanes,
+                2 * inplanes,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias=False),
+            nn.BatchNorm2d(2 * inplanes),
+            nn.ReLU(inplace=True)
+        ])
+        self.conv3 = nn.Sequential(*[  # stride 2, channels x2 again
+            nn.Conv2d(
+                2 * inplanes,
+                4 * inplanes,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias=False),
+            nn.BatchNorm2d(4 * inplanes),
+            nn.ReLU(inplace=True)
+        ])
+        self.conv4 = nn.Sequential(*[  # stride 2, channels unchanged
+            nn.Conv2d(
+                4 * inplanes,
+                4 * inplanes,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias=False),
+            nn.BatchNorm2d(4 * inplanes),
+            nn.ReLU(inplace=True)
+        ])
+        self.fc1 = nn.Conv2d(  # fc1..fc4: 1x1 projections to embed_dim
+            inplanes, embed_dim, kernel_size=1, stride=1, padding=0, bias=True)
+        self.fc2 = nn.Conv2d(
+            2 * inplanes,
+            embed_dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True)
+        self.fc3 = nn.Conv2d(
+            4 * inplanes,
+            embed_dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True)
+        self.fc4 = nn.Conv2d(
+            4 * inplanes,
+            embed_dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True)
+
+    def forward(self, x):
+
+        def _inner_forward(x):
+            c1 = self.stem(x)
+            c2 = self.conv2(c1)
+            c3 = self.conv3(c2)
+            c4 = self.conv4(c3)
+            c1 = self.fc1(c1)
+            c2 = self.fc2(c2)
+            c3 = self.fc3(c3)
+            c4 = self.fc4(c4)
+
+            bs, dim, _, _ = c1.shape
+
+            c2 = c2.view(bs, dim, -1).transpose(1, 2)  # 1/8 res, (bs, N, dim)
+            c3 = c3.view(bs, dim, -1).transpose(1, 2)  # 1/16 res, (bs, N, dim)
+            c4 = c4.view(bs, dim, -1).transpose(1, 2)  # 1/32 res, (bs, N, dim)
+
+            return c1, c2, c3, c4  # c1 is returned as a 4-D map
+
+        if self.with_cp and x.requires_grad:  # gradient checkpointing
+            outs = cp.checkpoint(_inner_forward, x)
+        else:
+            outs = _inner_forward(x)
+        return outs
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py
new file mode 100644
index 00000000..5b33031f
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py
@@ -0,0 +1,5 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
+from .beit import BASEBEiT
+
+__all__ = ['BASEBEiT']
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py
new file mode 100644
index 00000000..62f873ec
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py
@@ -0,0 +1,474 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
+import math
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from mmcv.runner import _load_checkpoint
+from mmdet.models.builder import BACKBONES
+from mmdet.utils import get_root_logger
+from timm.models.layers import drop_path, to_2tuple, trunc_normal_
+
+
+class DropPath(nn.Module):
+    """Stochastic depth (per-sample path dropping) for the main path of
+    residual blocks; thin nn.Module wrapper over timm's `drop_path`."""
+
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob  # stored as given, passed to drop_path
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)  # timm helper
+
+    def extra_repr(self) -> str:
+        return 'p={}'.format(self.drop_prob)  # extra text for module repr
+
+
+class Mlp(nn.Module):
+    """Feed-forward block: fc1 -> activation -> fc2 -> dropout."""
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features  # falsy -> in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()  # act_layer is called with no arguments
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        # dropout is skipped here (author's note: as in original BERT)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+class Attention(nn.Module):
+    """Multi-head self-attention with optional relative position bias."""
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 window_size=None,
+                 attn_head_dim=None):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        if attn_head_dim is not None:
+            head_dim = attn_head_dim  # overrides dim // num_heads
+        all_head_dim = head_dim * self.num_heads
+        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
+        self.scale = qk_scale or head_dim**-0.5
+        # bias-free projection; q/v biases are separate parameters below
+        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
+        if qkv_bias:
+            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
+            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
+        else:
+            self.q_bias = None
+            self.v_bias = None
+        # window_size is indexed as (Wh, Ww); falsy disables the local table
+        if window_size:
+            self.window_size = window_size
+            self.num_relative_distance = (2 * window_size[0]
+                                          - 1) * (2 * window_size[1] - 1) + 3
+            self.relative_position_bias_table = nn.Parameter(
+                torch.zeros(self.num_relative_distance,
+                            num_heads))  # 2*Wh-1 * 2*Ww-1, nH
+            # the "+ 3" above: cls to token & token 2 cls & cls to cls
+
+            # get pair-wise relative position index for each token inside the window
+            coords_h = torch.arange(window_size[0])
+            coords_w = torch.arange(window_size[1])
+            coords = torch.stack(torch.meshgrid([coords_h,
+                                                 coords_w]))  # 2, Wh, Ww
+            coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+            relative_coords = coords_flatten[:, :,
+                                             None] - coords_flatten[:,
+                                                                    None, :]  # 2, Wh*Ww, Wh*Ww
+            relative_coords = relative_coords.permute(
+                1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+            relative_coords[:, :,
+                            0] += window_size[0] - 1  # shift to start from 0
+            relative_coords[:, :, 1] += window_size[1] - 1
+            relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+            relative_position_index = \
+                torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
+            relative_position_index[1:, 1:] = relative_coords.sum(
+                -1)  # Wh*Ww, Wh*Ww
+            relative_position_index[0, 0:] = self.num_relative_distance - 3
+            relative_position_index[0:, 0] = self.num_relative_distance - 2
+            relative_position_index[0, 0] = self.num_relative_distance - 1
+            self.register_buffer('relative_position_index',
+                                 relative_position_index)
+
+        else:
+            self.window_size = None
+            self.relative_position_bias_table = None
+            self.relative_position_index = None
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(all_head_dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x, rel_pos_bias=None):
+        B, N, C = x.shape  # x must be rank-3
+        qkv_bias = None  # stays None when built with qkv_bias=False
+        if self.q_bias is not None:
+            qkv_bias = torch.cat(
+                (self.q_bias,
+                 torch.zeros_like(self.v_bias,
+                                  requires_grad=False), self.v_bias))
+        # k has no learned bias: the concat above is (q_bias, zeros, v_bias)
+        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
+        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[
+            2]  # make torchscript happy (cannot use tensor as tuple)
+
+        q = q * self.scale  # scale queries before the dot product
+        attn = (q @ k.transpose(-2, -1))
+
+        if self.relative_position_bias_table is not None:
+            relative_position_bias = \
+                self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+                    self.window_size[0] * self.window_size[1] + 1,
+                    self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
+            relative_position_bias = relative_position_bias.permute(
+                2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+
+            attn = attn + relative_position_bias.unsqueeze(0)  # add batch dim
+        # optional bias supplied by the caller, added on top of the above
+        if rel_pos_bias is not None:
+            attn = attn + rel_pos_bias
+
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)  # merge heads
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class Block(nn.Module):
+    """Pre-norm transformer block: residual attention then residual MLP."""
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 init_values=None,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm,
+                 window_size=None,
+                 attn_head_dim=None,
+                 with_cp=False):
+        super().__init__()
+        self.with_cp = with_cp
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            window_size=window_size,
+            attn_head_dim=attn_head_dim)
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path = DropPath(
+            drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop)
+        # optional learnable per-channel scales on both residual branches
+        if init_values is not None:
+            self.gamma_1 = nn.Parameter(
+                init_values * torch.ones((dim)), requires_grad=True)
+            self.gamma_2 = nn.Parameter(
+                init_values * torch.ones((dim)), requires_grad=True)
+        else:
+            self.gamma_1, self.gamma_2 = None, None
+
+    def forward(self, x, H, W, rel_pos_bias=None):
+        """Apply the block to x; H and W are accepted but unused here."""
+        def _inner_forward(x):
+            if self.gamma_1 is None:
+                x = x + self.drop_path(
+                    self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias))
+                x = x + self.drop_path(self.mlp(self.norm2(x)))
+            else:
+                x = x + self.drop_path(self.gamma_1 * self.attn(
+                    self.norm1(x), rel_pos_bias=rel_pos_bias))
+                x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
+            return x
+
+        if self.with_cp and x.requires_grad:  # only when x tracks gradients
+            x = cp.checkpoint(_inner_forward, x)
+        else:
+            x = _inner_forward(x)
+        return x
+
+
+class PatchEmbed(nn.Module):
+    """Image to patch embedding: a strided Conv2d, then flatten to tokens.
+    `forward` returns (tokens, Hp, Wp); Hp x Wp is the conv output grid."""
+
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        num_patches = (img_size[1] // patch_size[1]) * (
+            img_size[0] // patch_size[0])
+        self.patch_shape = (img_size[0] // patch_size[0],
+                            img_size[1] // patch_size[1])
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.num_patches = num_patches
+        # kernel_size == stride == patch_size, so patches do not overlap
+        self.proj = nn.Conv2d(
+            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, x, **kwargs):  # extra keyword arguments are ignored
+        B, C, H, W = x.shape
+        # FIXME look at relaxing size constraints
+        # assert H == self.img_size[0] and W == self.img_size[1], \
+        #     f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+        x = self.proj(x)
+        Hp, Wp = x.shape[2], x.shape[3]
+        # (B, embed_dim, Hp, Wp) -> (B, Hp*Wp, embed_dim)
+        x = x.flatten(2).transpose(1, 2)
+        return x, Hp, Wp
+
+
+class HybridEmbed(nn.Module):
+    """CNN feature map embedding: take the backbone's last feature map,
+    flatten it and project to `embed_dim`. NOTE(review): unlike PatchEmbed,
+    no `patch_shape` is set and `forward` returns only the token tensor."""
+
+    def __init__(self,
+                 backbone,
+                 img_size=224,
+                 feature_size=None,
+                 in_chans=3,
+                 embed_dim=768):
+        super().__init__()
+        assert isinstance(backbone, nn.Module)
+        img_size = to_2tuple(img_size)
+        self.img_size = img_size
+        self.backbone = backbone
+        if feature_size is None:  # probe the backbone with a zero image
+            with torch.no_grad():
+                # FIXME this is hacky, but most reliable way of determining the exact dim of the output feature
+                # map for all networks, the feature metadata has reliable channel and stride info, but using
+                # stride to calc feature dim requires info about padding of each stage that isn't captured.
+                training = backbone.training
+                if training:
+                    backbone.eval()
+                o = self.backbone(
+                    torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1]
+                feature_size = o.shape[-2:]
+                feature_dim = o.shape[1]
+                backbone.train(training)  # restore the previous mode
+        else:
+            feature_size = to_2tuple(feature_size)
+            feature_dim = self.backbone.feature_info.channels()[-1]
+        self.num_patches = feature_size[0] * feature_size[1]
+        self.proj = nn.Linear(feature_dim, embed_dim)
+
+    def forward(self, x):
+        x = self.backbone(x)[-1]  # last element of the backbone output
+        x = x.flatten(2).transpose(1, 2)  # (B, C, H, W) -> (B, H*W, C)
+        x = self.proj(x)
+        return x
+
+
+class RelativePositionBias(nn.Module):
+    """Learned relative position bias table for a window plus a cls token."""
+    def __init__(self, window_size, num_heads):
+        super().__init__()
+        self.window_size = window_size  # indexed below as (Wh, Ww)
+        self.num_relative_distance = (2 * window_size[0]
+                                      - 1) * (2 * window_size[1] - 1) + 3
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros(self.num_relative_distance,
+                        num_heads))  # 2*Wh-1 * 2*Ww-1, nH
+        # the "+ 3" above: cls to token & token 2 cls & cls to cls
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(window_size[0])
+        coords_w = torch.arange(window_size[1])
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+        relative_coords = coords_flatten[:, :,
+                                         None] - coords_flatten[:,
+                                                                None, :]  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.permute(
+            1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+        relative_position_index = \
+            torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
+        relative_position_index[1:,
+                                1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+        relative_position_index[0, 0:] = self.num_relative_distance - 3
+        relative_position_index[0:, 0] = self.num_relative_distance - 2
+        relative_position_index[0, 0] = self.num_relative_distance - 1
+        # the index is registered as a buffer, not as an nn.Parameter
+        self.register_buffer('relative_position_index',
+                             relative_position_index)
+
+    def forward(self):  # returns (nH, N, N) with N = Wh*Ww + 1
+        relative_position_bias = \
+            self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+                self.window_size[0] * self.window_size[1] + 1,
+                self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
+        return relative_position_bias.permute(
+            2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+
+
+@BACKBONES.register_module()
+class BASEBEiT(nn.Module):
+    """Vision Transformer (BEiT) trunk with a patch or hybrid CNN input stage.
+    Builds the embedding and transformer blocks; defines no forward itself."""
+
+    def __init__(self,
+                 img_size=512,
+                 patch_size=16,
+                 in_chans=3,
+                 num_classes=80,
+                 embed_dim=768,
+                 depth=12,
+                 num_heads=12,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 hybrid_backbone=None,
+                 norm_layer=None,
+                 init_values=None,
+                 use_checkpoint=False,
+                 use_abs_pos_emb=False,
+                 use_rel_pos_bias=True,
+                 use_shared_rel_pos_bias=False,
+                 pretrained=None,
+                 with_cp=False):
+        super().__init__()
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+        self.norm_layer = norm_layer
+        self.num_classes = num_classes
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.drop_path_rate = drop_path_rate
+        if hybrid_backbone is not None:
+            self.patch_embed = HybridEmbed(
+                hybrid_backbone,
+                img_size=img_size,
+                in_chans=in_chans,
+                embed_dim=embed_dim)
+        else:
+            self.patch_embed = PatchEmbed(
+                img_size=img_size,
+                patch_size=patch_size,
+                in_chans=in_chans,
+                embed_dim=embed_dim)
+        num_patches = self.patch_embed.num_patches
+        # cls_token is trunc-normal initialised below; pos_embed is optional
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        if use_abs_pos_emb:
+            self.pos_embed = nn.Parameter(
+                torch.zeros(1, num_patches + 1, embed_dim))
+        else:
+            self.pos_embed = None
+        self.pos_drop = nn.Dropout(p=drop_rate)
+        # optional model-level relative position bias module
+        if use_shared_rel_pos_bias:
+            self.rel_pos_bias = RelativePositionBias(
+                window_size=self.patch_embed.patch_shape, num_heads=num_heads)
+        else:
+            self.rel_pos_bias = None
+
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
+               ]  # stochastic depth decay rule
+        self.use_rel_pos_bias = use_rel_pos_bias
+        self.use_checkpoint = use_checkpoint  # stored only; see with_cp
+        self.blocks = nn.ModuleList([
+            Block(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                with_cp=with_cp,
+                init_values=init_values,
+                window_size=self.patch_embed.patch_shape
+                if use_rel_pos_bias else None) for i in range(depth)
+        ])
+
+        trunc_normal_(self.cls_token, std=.02)
+        self.apply(self._init_weights)
+        self.init_weights(pretrained)  # loads weights if a path is given
+
+    def init_weights(self, pretrained=None):
+        """Load pre-trained weights non-strictly; no-op unless given a path.
+
+        Args:
+            pretrained (str, optional): Path to pre-trained weights.
+                Defaults to None.
+        """
+        if isinstance(pretrained, str):
+            checkpoint = _load_checkpoint(
+                pretrained, logger=get_root_logger(), map_location='cpu')
+            # resize_rel_pos_embed is an optional hook; this class has none.
+            resize = getattr(self, 'resize_rel_pos_embed', None)
+            state_dict = (resize(checkpoint) if callable(resize) else
+                          checkpoint.get('state_dict', checkpoint))
+            self.load_state_dict(state_dict, False)
+
+    def fix_init_weight(self):
+        """Divide attn.proj/mlp.fc2 weights by sqrt(2 * layer_no)."""
+        def rescale(param, layer_id):
+            param.div_(math.sqrt(2.0 * layer_id))
+
+        for layer_id, layer in enumerate(self.blocks):
+            rescale(layer.attn.proj.weight.data, layer_id + 1)
+            rescale(layer.mlp.fc2.weight.data, layer_id + 1)
+
+    def _init_weights(self, m):  # per-module initialiser for self.apply
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if m.bias is not None:  # m is already known to be nn.Linear
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    def get_num_layers(self):
+        return len(self.blocks)
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py
new file mode 100644
index 00000000..182fc0c1
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py
@@ -0,0 +1,168 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
+import logging
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmdet.models.builder import BACKBONES
+from mmdet.models.utils.transformer import MultiScaleDeformableAttention
+from timm.models.layers import DropPath, trunc_normal_
+from torch.nn.init import normal_
+
+from .adapter_modules import InteractionBlockWithCls as InteractionBlock
+from .adapter_modules import SpatialPriorModule, deform_inputs
+from .base.beit import BASEBEiT
+
+_logger = logging.getLogger(__name__)
+
+
+@BACKBONES.register_module()
+class BEiTAdapter(BASEBEiT):
+    """BASEBEiT plus a spatial prior module and interaction blocks."""
+    def __init__(self,
+                 pretrain_size=224,
+                 conv_inplane=64,
+                 n_points=4,
+                 deform_num_heads=6,
+                 init_values=0.,
+                 cffn_ratio=0.25,
+                 deform_ratio=1.0,
+                 with_cffn=True,
+                 interaction_indexes=None,
+                 add_vit_feature=True,
+                 with_cp=False,
+                 *args,
+                 **kwargs):
+        # NOTE(review): interaction_indexes must be a sequence; None fails below
+        super().__init__(
+            init_values=init_values, with_cp=with_cp, *args, **kwargs)
+
+        self.num_block = len(self.blocks)
+        self.pretrain_size = (pretrain_size, pretrain_size)
+        self.flags = [
+            i for i in range(-1, self.num_block, self.num_block // 4)
+        ][1:]  # NOTE(review): not read anywhere else in this class
+        self.interaction_indexes = interaction_indexes
+        self.add_vit_feature = add_vit_feature
+        embed_dim = self.embed_dim
+        # one learned embedding per SPM sequence level (c2, c3, c4)
+        self.level_embed = nn.Parameter(torch.zeros(3, embed_dim))
+        self.spm = SpatialPriorModule(
+            inplanes=conv_inplane, embed_dim=embed_dim, with_cp=False)
+        self.interactions = nn.Sequential(*[
+            InteractionBlock(
+                dim=embed_dim,
+                num_heads=deform_num_heads,
+                n_points=n_points,
+                init_values=init_values,
+                drop_path=self.drop_path_rate,
+                norm_layer=self.norm_layer,
+                with_cffn=with_cffn,
+                cffn_ratio=cffn_ratio,
+                deform_ratio=deform_ratio,
+                extra_extractor=True if i == len(interaction_indexes)
+                - 1 else False,
+                with_cp=with_cp) for i in range(len(interaction_indexes))
+        ])
+
+        self.up = nn.ConvTranspose2d(embed_dim, embed_dim, 2, 2)
+        self.norm1 = nn.BatchNorm2d(embed_dim)
+        self.norm2 = nn.BatchNorm2d(embed_dim)
+        self.norm3 = nn.BatchNorm2d(embed_dim)
+        self.norm4 = nn.BatchNorm2d(embed_dim)
+        # (re-)initialise the adapter-specific modules created above
+        self.up.apply(self._init_weights)
+        self.spm.apply(self._init_weights)
+        self.interactions.apply(self._init_weights)
+        self.apply(self._init_deform_weights)
+        normal_(self.level_embed)
+
+    def _init_weights(self, m):  # overrides BASEBEiT._init_weights
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm) or isinstance(m, nn.BatchNorm2d):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+        elif isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
+            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+            fan_out //= m.groups
+            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+            if m.bias is not None:
+                m.bias.data.zero_()
+
+    def _get_pos_embed(self, pos_embed, H, W):  # bicubic resize to (H, W)
+        pos_embed = pos_embed.reshape(1, self.pretrain_size[0] // 16,
+                                      self.pretrain_size[1] // 16,
+                                      -1).permute(0, 3, 1, 2)
+        pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False). \
+            reshape(1, -1, H * W).permute(0, 2, 1)
+        return pos_embed
+
+    def _init_deform_weights(self, m):
+        if isinstance(m, MultiScaleDeformableAttention):
+            m.init_weights()
+
+    def _add_level_embed(self, c2, c3, c4):
+        c2 = c2 + self.level_embed[0]
+        c3 = c3 + self.level_embed[1]
+        c4 = c4 + self.level_embed[2]
+        return c2, c3, c4
+
+    def forward(self, x):  # returns [f1, f2, f3, f4] feature maps
+        deform_inputs1, deform_inputs2 = deform_inputs(x)
+
+        # SPM forward
+        c1, c2, c3, c4 = self.spm(x)
+        c2, c3, c4 = self._add_level_embed(c2, c3, c4)
+        c = torch.cat([c2, c3, c4], dim=1)  # one sequence over all levels
+
+        # Patch Embedding forward
+        x, H, W = self.patch_embed(x)
+        bs, n, dim = x.shape
+        cls = self.cls_token.expand(
+            bs, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
+
+        if self.pos_embed is not None:
+            pos_embed = self._get_pos_embed(self.pos_embed, H, W)
+            x = x + pos_embed
+        x = self.pos_drop(x)
+
+        # Interaction
+        outs = list()
+        for i, layer in enumerate(self.interactions):
+            indexes = self.interaction_indexes[i]
+            x, c, cls = layer(x, c, cls,
+                              self.blocks[indexes[0]:indexes[-1] + 1],
+                              deform_inputs1, deform_inputs2, H, W)
+            outs.append(x.transpose(1, 2).view(bs, dim, H, W).contiguous())
+
+        # Split c back into its three levels & reshape to 2-D maps
+        c2 = c[:, 0:c2.size(1), :]
+        c3 = c[:, c2.size(1):c2.size(1) + c3.size(1), :]
+        c4 = c[:, c2.size(1) + c3.size(1):, :]
+        # c2/c3/c4 grids are 2x, 1x and 0.5x the patch grid (H, W)
+        c2 = c2.transpose(1, 2).view(bs, dim, H * 2, W * 2).contiguous()
+        c3 = c3.transpose(1, 2).view(bs, dim, H, W).contiguous()
+        c4 = c4.transpose(1, 2).view(bs, dim, H // 2, W // 2).contiguous()
+        c1 = self.up(c2) + c1
+        # optionally fuse the ViT outputs collected after each interaction
+        if self.add_vit_feature:
+            x1, x2, x3, x4 = outs  # requires exactly four interaction stages
+            x1 = F.interpolate(
+                x1, scale_factor=4, mode='bilinear', align_corners=False)
+            x2 = F.interpolate(
+                x2, scale_factor=2, mode='bilinear', align_corners=False)
+            x4 = F.interpolate(
+                x4, scale_factor=0.5, mode='bilinear', align_corners=False)
+            c1, c2, c3, c4 = c1 + x1, c2 + x2, c3 + x3, c4 + x4
+
+        # Final Norm
+        f1 = self.norm1(c1)
+        f2 = self.norm2(c2)
+        f3 = self.norm3(c3)
+        f4 = self.norm4(c4)
+        return [f1, f2, f3, f4]
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py
new file mode 100644
index 00000000..12bf2a21
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py
@@ -0,0 +1,5 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
+from .mask2former_head_from_mmseg import Mask2FormerHeadFromMMSeg
+
+__all__ = ['Mask2FormerHeadFromMMSeg']
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py
new file mode 100644
index 00000000..ae7a0416
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py
@@ -0,0 +1,266 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
+from abc import ABCMeta, abstractmethod
+
+import torch
+import torch.nn as nn
+from mmcv.runner import BaseModule, auto_fp16, force_fp32
+from mmdet.models.builder import build_loss
+from mmdet.models.losses import accuracy
+
+from ...utils import build_pixel_sampler, seg_resize
+
+
+class BaseDecodeHead(BaseModule, metaclass=ABCMeta):
+    """Base class for decode heads.
+
+    Args:
+        in_channels (int|Sequence[int]): Input channels.
+        channels (int): Channels after modules, before conv_seg.
+        num_classes (int): Number of classes.
+        dropout_ratio (float): Ratio of dropout layer. Default: 0.1.
+        conv_cfg (dict|None): Config of conv layers. Default: None.
+        norm_cfg (dict|None): Config of norm layers. Default: None.
+        act_cfg (dict): Config of activation layers.
+            Default: dict(type='ReLU')
+        in_index (int|Sequence[int]): Input feature index. Default: -1
+        input_transform (str|None): Transformation type of input features.
+            Options: 'resize_concat', 'multiple_select', None.
+            'resize_concat': Multiple feature maps will be resized to the
+                same size as the first one and then concatenated.
+                Usually used in FCN head of HRNet.
+            'multiple_select': Multiple feature maps will be bundled into
+                a list and passed into decode head.
+            None: Only one select feature map is allowed.
+            Default: None.
+        loss_decode (dict | Sequence[dict]): Config of decode loss.
+            The `loss_name` is property of corresponding loss function which
+            could be shown in training log. If you want this loss
+            item to be included into the backward graph, `loss_` must be the
+            prefix of the name. Defaults to 'loss_ce'.
+             e.g. dict(type='CrossEntropyLoss'),
+             [dict(type='CrossEntropyLoss', loss_name='loss_ce'),
+              dict(type='DiceLoss', loss_name='loss_dice')]
+            Default: dict(type='CrossEntropyLoss').
+        ignore_index (int | None): The label index to be ignored. When using
+            masked BCE loss, ignore_index should be set to None. Default: 255.
+        sampler (dict|None): The config of segmentation map sampler.
+            Default: None.
+        align_corners (bool): align_corners argument of F.interpolate.
+            Default: False.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 channels,
+                 *,
+                 num_classes,
+                 dropout_ratio=0.1,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 act_cfg=dict(type='ReLU'),
+                 in_index=-1,
+                 input_transform=None,
+                 loss_decode=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=False,
+                     loss_weight=1.0),
+                 ignore_index=255,
+                 sampler=None,
+                 align_corners=False,
+                 init_cfg=dict(
+                     type='Normal', std=0.01, override=dict(name='conv_seg'))):
+        super(BaseDecodeHead, self).__init__(init_cfg)
+        self._init_inputs(in_channels, in_index, input_transform)
+        self.channels = channels
+        self.num_classes = num_classes
+        self.dropout_ratio = dropout_ratio
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.in_index = in_index  # also set inside _init_inputs
+
+        self.ignore_index = ignore_index
+        self.align_corners = align_corners
+        # one loss config builds a single loss; a sequence builds a ModuleList
+        if isinstance(loss_decode, dict):
+            self.loss_decode = build_loss(loss_decode)
+        elif isinstance(loss_decode, (list, tuple)):
+            self.loss_decode = nn.ModuleList()
+            for loss in loss_decode:
+                self.loss_decode.append(build_loss(loss))
+        else:
+            raise TypeError(f'loss_decode must be a dict or sequence of dict,\
+                but got {type(loss_decode)}')
+
+        if sampler is not None:
+            self.sampler = build_pixel_sampler(sampler, context=self)
+        else:
+            self.sampler = None
+        # 1x1 conv mapping `channels` features to `num_classes` outputs
+        self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1)
+        if dropout_ratio > 0:
+            self.dropout = nn.Dropout2d(dropout_ratio)
+        else:
+            self.dropout = None
+        self.fp16_enabled = False
+
+    def extra_repr(self):
+        """Summarize input_transform, ignore_index and align_corners."""
+        s = f'input_transform={self.input_transform}, ' \
+            f'ignore_index={self.ignore_index}, ' \
+            f'align_corners={self.align_corners}'
+        return s
+
+    def _init_inputs(self, in_channels, in_index, input_transform):
+        """Check and initialize input transforms.
+
+        The in_channels, in_index and input_transform must match.
+        Specifically, when input_transform is None, only a single feature map
+        will be selected. So in_channels and in_index must be of type int.
+        When input_transform is set, both must be equal-length lists/tuples.
+
+        Args:
+            in_channels (int|Sequence[int]): Input channels.
+            in_index (int|Sequence[int]): Input feature index.
+            input_transform (str|None): Transformation type of input features.
+                Options: 'resize_concat', 'multiple_select', None.
+                'resize_concat': Multiple feature maps will be resized to the
+                    same size as the first one and then concatenated.
+                    Usually used in FCN head of HRNet.
+                'multiple_select': Multiple feature maps will be bundled into
+                    a list and passed into decode head.
+                None: Only one select feature map is allowed.
+        """
+
+        if input_transform is not None:
+            assert input_transform in ['resize_concat', 'multiple_select']
+        self.input_transform = input_transform
+        self.in_index = in_index
+        if input_transform is not None:
+            assert isinstance(in_channels, (list, tuple))
+            assert isinstance(in_index, (list, tuple))
+            assert len(in_channels) == len(in_index)
+            if input_transform == 'resize_concat':
+                self.in_channels = sum(in_channels)
+            else:
+                self.in_channels = in_channels
+        else:
+            assert isinstance(in_channels, int)
+            assert isinstance(in_index, int)
+            self.in_channels = in_channels
+
+    def _transform_inputs(self, inputs):
+        """Transform inputs for decoder.
+
+        Args:
+            inputs (list[Tensor]): List of multi-level img features.
+
+        Returns:
+            Tensor | list[Tensor]: The transformed inputs.
+        """
+
+        if self.input_transform == 'resize_concat':
+            inputs = [inputs[i] for i in self.in_index]
+            upsampled_inputs = [
+                seg_resize(
+                    input=x,
+                    size=inputs[0].shape[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners) for x in inputs
+            ]
+            inputs = torch.cat(upsampled_inputs, dim=1)
+        elif self.input_transform == 'multiple_select':
+            inputs = [inputs[i] for i in self.in_index]
+        else:
+            inputs = inputs[self.in_index]
+
+        return inputs
+
+    @auto_fp16()
+    @abstractmethod
+    def forward(self, inputs):
+        """Abstract forward pass; subclasses must override it."""
+        pass
+
+    def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg):
+        """Forward function for training.
+        Args:
+            inputs (list[Tensor]): List of multi-level img features.
+            img_metas (list[dict]): List of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:Collect`.
+            gt_semantic_seg (Tensor): Semantic segmentation masks
+                used if the architecture supports semantic segmentation task.
+            train_cfg (dict): The training config. Unused, like img_metas.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        seg_logits = self.forward(inputs)
+        losses = self.losses(seg_logits, gt_semantic_seg)
+        return losses
+
+    def forward_test(self, inputs, img_metas, test_cfg):
+        """Forward function for testing.
+
+        Args:
+            inputs (list[Tensor]): List of multi-level img features.
+            img_metas (list[dict]): List of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:Collect`.
+            test_cfg (dict): The testing config. Unused, like img_metas.
+
+        Returns:
+            Tensor: Output segmentation map.
+        """
+        return self.forward(inputs)
+
+    def cls_seg(self, feat):
+        """Classify each pixel: optional dropout, then the 1x1 conv_seg."""
+        if self.dropout is not None:
+            feat = self.dropout(feat)
+        output = self.conv_seg(feat)
+        return output
+
+    @force_fp32(apply_to=('seg_logit', ))
+    def losses(self, seg_logit, seg_label):
+        """Resize logits to the label size, then compute losses and acc_seg."""
+        loss = dict()
+        seg_logit = seg_resize(
+            input=seg_logit,
+            size=seg_label.shape[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+        if self.sampler is not None:
+            seg_weight = self.sampler.sample(seg_logit, seg_label)
+        else:
+            seg_weight = None
+        seg_label = seg_label.squeeze(1)
+        # treat a single loss module and a ModuleList uniformly
+        if not isinstance(self.loss_decode, nn.ModuleList):
+            losses_decode = [self.loss_decode]
+        else:
+            losses_decode = self.loss_decode
+        for loss_decode in losses_decode:
+            if loss_decode.loss_name not in loss:
+                loss[loss_decode.loss_name] = loss_decode(
+                    seg_logit,
+                    seg_label,
+                    weight=seg_weight,
+                    ignore_index=self.ignore_index)
+            else:
+                loss[loss_decode.loss_name] += loss_decode(
+                    seg_logit,
+                    seg_label,
+                    weight=seg_weight,
+                    ignore_index=self.ignore_index)
+
+        loss['acc_seg'] = accuracy(
+            seg_logit, seg_label, ignore_index=self.ignore_index)
+        return loss
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py
new file mode 100644
index 00000000..c0681d2b
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py
@@ -0,0 +1,580 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
+
+import copy
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import Conv2d, build_plugin_layer, caffe2_xavier_init
+from mmcv.cnn.bricks.transformer import (build_positional_encoding,
+                                         build_transformer_layer_sequence)
+from mmcv.ops import point_sample
+from mmcv.runner import ModuleList, force_fp32
+from mmdet.core import build_assigner, build_sampler, multi_apply, reduce_mean
+from mmdet.models.builder import HEADS, build_loss
+from mmdet.models.utils import get_uncertain_point_coords_with_randomness
+
+from .base_decode_head import BaseDecodeHead
+
+
+@HEADS.register_module()
+class Mask2FormerHeadFromMMSeg(BaseDecodeHead):
+    """Implements the Mask2Former head.
+
+    See `Masked-attention Mask Transformer for Universal Image
+    Segmentation <https://arxiv.org/pdf/2112.01527>`_ for details.
+
+    Args:
+        in_channels (list[int]): Number of channels in the input feature map.
+        feat_channels (int): Number of channels for features.
+        out_channels (int): Number of channels for output.
+        num_things_classes (int): Number of things.
+        num_stuff_classes (int): Number of stuff.
+        num_queries (int): Number of queries in the Transformer decoder.
+        pixel_decoder (:obj:`mmcv.ConfigDict` | dict): Config for pixel
+            decoder. Defaults to None.
+        enforce_decoder_input_project (bool, optional): Whether to add
+            a layer to change the embed_dim of transformer encoder in
+            pixel decoder to the embed_dim of transformer decoder.
+            Defaults to False.
+        transformer_decoder (:obj:`mmcv.ConfigDict` | dict): Config for
+            transformer decoder. Defaults to None.
+        positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for
+            transformer decoder position encoding. Defaults to None.
+        loss_cls (:obj:`mmcv.ConfigDict` | dict): Config of the classification
+            loss. Defaults to None.
+        loss_mask (:obj:`mmcv.ConfigDict` | dict): Config of the mask loss.
+            Defaults to None.
+        loss_dice (:obj:`mmcv.ConfigDict` | dict): Config of the dice loss.
+            Defaults to None.
+        train_cfg (:obj:`mmcv.ConfigDict` | dict): Training config of
+            Mask2Former head.
+        test_cfg (:obj:`mmcv.ConfigDict` | dict): Testing config of
+            Mask2Former head.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 feat_channels,
+                 out_channels,
+                 num_things_classes=80,
+                 num_stuff_classes=53,
+                 num_queries=100,
+                 num_transformer_feat_level=3,
+                 pixel_decoder=None,
+                 enforce_decoder_input_project=False,
+                 transformer_decoder=None,
+                 positional_encoding=None,
+                 loss_cls=None,
+                 loss_mask=None,
+                 loss_dice=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=None,
+                 **kwargs):
+        super(Mask2FormerHeadFromMMSeg, self).__init__(
+            in_channels=in_channels,
+            channels=feat_channels,
+            num_classes=(num_things_classes + num_stuff_classes),
+            init_cfg=init_cfg,
+            input_transform='multiple_select',
+            **kwargs)
+        self.num_things_classes = num_things_classes
+        self.num_stuff_classes = num_stuff_classes
+        self.num_classes = self.num_things_classes + self.num_stuff_classes
+        self.num_queries = num_queries
+        self.num_transformer_feat_level = num_transformer_feat_level
+        self.num_heads = transformer_decoder.transformerlayers. \
+            attn_cfgs.num_heads
+        self.num_transformer_decoder_layers = transformer_decoder.num_layers
+        assert pixel_decoder.encoder.transformerlayers.attn_cfgs.num_levels == num_transformer_feat_level
+        pixel_decoder_ = copy.deepcopy(pixel_decoder)
+        pixel_decoder_.update(
+            in_channels=in_channels,
+            feat_channels=feat_channels,
+            out_channels=out_channels)
+        self.pixel_decoder = build_plugin_layer(pixel_decoder_)[1]
+        self.transformer_decoder = build_transformer_layer_sequence(
+            transformer_decoder)
+        self.decoder_embed_dims = self.transformer_decoder.embed_dims
+
+        self.decoder_input_projs = ModuleList()
+        # from low resolution to high resolution
+        for _ in range(num_transformer_feat_level):
+            if (self.decoder_embed_dims != feat_channels
+                    or enforce_decoder_input_project):
+                self.decoder_input_projs.append(
+                    Conv2d(
+                        feat_channels, self.decoder_embed_dims, kernel_size=1))
+            else:
+                self.decoder_input_projs.append(nn.Identity())
+        self.decoder_positional_encoding = build_positional_encoding(
+            positional_encoding)
+        self.query_embed = nn.Embedding(self.num_queries, feat_channels)
+        self.query_feat = nn.Embedding(self.num_queries, feat_channels)
+        # from low resolution to high resolution
+        self.level_embed = nn.Embedding(self.num_transformer_feat_level,
+                                        feat_channels)
+        # per-query heads; cls_embed emits num_classes + 1 outputs
+        self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1)
+        self.mask_embed = nn.Sequential(
+            nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
+            nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
+            nn.Linear(feat_channels, out_channels))
+        self.conv_seg = None  # drop BaseDecodeHead's conv_seg (not used here)
+
+        self.test_cfg = test_cfg
+        self.train_cfg = train_cfg
+        if train_cfg:
+            self.assigner = build_assigner(self.train_cfg.assigner)
+            self.sampler = build_sampler(self.train_cfg.sampler, context=self)
+            self.num_points = self.train_cfg.get('num_points', 12544)
+            self.oversample_ratio = self.train_cfg.get('oversample_ratio', 3.0)
+            self.importance_sample_ratio = self.train_cfg.get(
+                'importance_sample_ratio', 0.75)
+        # class_weight is reused by loss_single to compute avg_factor
+        self.class_weight = loss_cls.class_weight
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_mask = build_loss(loss_mask)
+        self.loss_dice = build_loss(loss_dice)
+
+    def init_weights(self):
+        for m in self.decoder_input_projs:
+            if isinstance(m, Conv2d):
+                caffe2_xavier_init(m, bias=0)
+        # the pixel decoder runs its own init_weights()
+        self.pixel_decoder.init_weights()
+        # Xavier-normal init for decoder params with more than one dim
+        for p in self.transformer_decoder.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_normal_(p)
+
+    def get_targets(self, cls_scores_list, mask_preds_list, gt_labels_list,
+                    gt_masks_list, img_metas):
+        """Compute classification and mask targets for all images for a decoder
+        layer.
+
+        Args:
+            cls_scores_list (list[Tensor]): Mask score logits from a single
+                decoder layer for all images. Each with shape [num_queries,
+                cls_out_channels].
+            mask_preds_list (list[Tensor]): Mask logits from a single decoder
+                layer for all images. Each with shape [num_queries, h, w].
+            gt_labels_list (list[Tensor]): Ground truth class indices for all
+                images. Each with shape (n, ), n is the sum of number of stuff
+                types and number of instances in an image.
+            gt_masks_list (list[Tensor]): Ground truth mask for each image,
+                each with shape (n, h, w).
+            img_metas (list[dict]): List of image meta information.
+
+        Returns:
+            tuple[list[Tensor]]: a tuple containing the following targets.
+
+                - labels_list (list[Tensor]): Labels of all images.
+                    Each with shape [num_queries, ].
+                - label_weights_list (list[Tensor]): Label weights of all
+                    images. Each with shape [num_queries, ].
+                - mask_targets_list (list[Tensor]): Mask targets of all images.
+                    Each with shape [num_pos, h, w] (positive samples only).
+                - mask_weights_list (list[Tensor]): Mask weights of all images.
+                    Each with shape [num_queries, ].
+                - num_total_pos (int): Number of positive samples in all
+                    images.
+                - num_total_neg (int): Number of negative samples in all
+                    images.
+        """
+        (labels_list, label_weights_list, mask_targets_list, mask_weights_list,
+         pos_inds_list,
+         neg_inds_list) = multi_apply(self._get_target_single, cls_scores_list,
+                                      mask_preds_list, gt_labels_list,
+                                      gt_masks_list, img_metas)
+
+        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
+        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
+        return (labels_list, label_weights_list, mask_targets_list,
+                mask_weights_list, num_total_pos, num_total_neg)
+
+    def _get_target_single(self, cls_score, mask_pred, gt_labels, gt_masks,
+                           img_metas):
+        """Compute classification and mask targets for one image.
+
+        Args:
+            cls_score (Tensor): Mask score logits from a single decoder layer
+                for one image. Shape (num_queries, cls_out_channels).
+            mask_pred (Tensor): Mask logits for a single decoder layer for one
+                image. Shape (num_queries, h, w).
+            gt_labels (Tensor): Ground truth class indices for one image with
+                shape (num_gts, ).
+            gt_masks (Tensor): Ground truth mask for each image, each with
+                shape (num_gts, h, w).
+            img_metas (dict): Image information.
+
+        Returns:
+            tuple[Tensor]: A tuple containing the following for one image.
+
+                - labels (Tensor): Labels of each image. \
+                    shape (num_queries, ).
+                - label_weights (Tensor): Label weights of each image. \
+                    shape (num_queries, ).
+                - mask_targets (Tensor): Mask targets of each image. \
+                    shape (num_pos, h, w), one per positive sample.
+                - mask_weights (Tensor): Mask weights of each image. \
+                    shape (num_queries, ).
+                - pos_inds (Tensor): Sampled positive indices for each \
+                    image.
+                - neg_inds (Tensor): Sampled negative indices for each \
+                    image.
+        """
+        # sample points
+        num_queries = cls_score.shape[0]
+        num_gts = gt_labels.shape[0]
+
+        point_coords = torch.rand((1, self.num_points, 2),
+                                  device=cls_score.device)
+        # shape (num_queries, num_points)
+        mask_points_pred = point_sample(
+            mask_pred.unsqueeze(1), point_coords.repeat(num_queries, 1,
+                                                        1)).squeeze(1)
+        # shape (num_gts, num_points)
+        gt_points_masks = point_sample(
+            gt_masks.unsqueeze(1).float(), point_coords.repeat(num_gts, 1,
+                                                               1)).squeeze(1)
+
+        # assign and sample
+        assign_result = self.assigner.assign(cls_score, mask_points_pred,
+                                             gt_labels, gt_points_masks,
+                                             img_metas)
+        sampling_result = self.sampler.sample(assign_result, mask_pred,
+                                              gt_masks)
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+
+        # label target: default every query to index num_classes
+        labels = gt_labels.new_full((self.num_queries, ),
+                                    self.num_classes,
+                                    dtype=torch.long)
+        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
+        label_weights = gt_labels.new_ones((self.num_queries, ))
+
+        # mask target: only positive queries get weight 1
+        mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds]
+        mask_weights = mask_pred.new_zeros((self.num_queries, ))
+        mask_weights[pos_inds] = 1.0
+
+        return (labels, label_weights, mask_targets, mask_weights, pos_inds,
+                neg_inds)
+
+    def loss_single(self, cls_scores, mask_preds, gt_labels_list,
+                    gt_masks_list, img_metas):
+        """Loss function for outputs from a single decoder layer.
+
+        Args:
+            cls_scores (Tensor): Mask score logits from a single decoder layer
+                for all images. Shape (batch_size, num_queries,
+                cls_out_channels). Note `cls_out_channels` should include
+                background.
+            mask_preds (Tensor): Mask logits for a pixel decoder for all
+                images. Shape (batch_size, num_queries, h, w).
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image, each with shape (num_gts, ).
+            gt_masks_list (list[Tensor]): Ground truth mask for each image,
+                each with shape (num_gts, h, w).
+            img_metas (list[dict]): List of image meta information.
+
+        Returns:
+            tuple[Tensor]: Loss components for outputs from a single \
+                decoder layer.
+        """
+        num_imgs = cls_scores.size(0)
+        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
+        mask_preds_list = [mask_preds[i] for i in range(num_imgs)]
+        (labels_list, label_weights_list, mask_targets_list, mask_weights_list,
+         num_total_pos,
+         num_total_neg) = self.get_targets(cls_scores_list, mask_preds_list,
+                                           gt_labels_list, gt_masks_list,
+                                           img_metas)
+        # shape (batch_size, num_queries)
+        labels = torch.stack(labels_list, dim=0)
+        # shape (batch_size, num_queries)
+        label_weights = torch.stack(label_weights_list, dim=0)
+        # shape (num_total_gts, h, w)
+        mask_targets = torch.cat(mask_targets_list, dim=0)
+        # shape (batch_size, num_queries)
+        mask_weights = torch.stack(mask_weights_list, dim=0)
+
+        # classification loss
+        # shape (batch_size * num_queries, )
+        cls_scores = cls_scores.flatten(0, 1)
+        labels = labels.flatten(0, 1)
+        label_weights = label_weights.flatten(0, 1)
+
+        class_weight = cls_scores.new_tensor(self.class_weight)
+        loss_cls = self.loss_cls(
+            cls_scores,
+            labels,
+            label_weights,
+            avg_factor=class_weight[labels].sum())
+
+        num_total_masks = reduce_mean(cls_scores.new_tensor([num_total_pos]))
+        num_total_masks = max(num_total_masks, 1)
+
+        # extract positive ones
+        # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w)
+        mask_preds = mask_preds[mask_weights > 0]
+
+        if mask_targets.shape[0] == 0:
+            # zero match
+            loss_dice = mask_preds.sum()
+            loss_mask = mask_preds.sum()
+            return loss_cls, loss_mask, loss_dice
+
+        with torch.no_grad():
+            points_coords = get_uncertain_point_coords_with_randomness(
+                mask_preds.unsqueeze(1), None, self.num_points,
+                self.oversample_ratio, self.importance_sample_ratio)
+            # shape (num_total_gts, h, w) -> (num_total_gts, num_points)
+            mask_point_targets = point_sample(
+                mask_targets.unsqueeze(1).float(), points_coords).squeeze(1)
+        # shape (num_total_gts, h, w) -> (num_total_gts, num_points)
+        mask_point_preds = point_sample(
+            mask_preds.unsqueeze(1), points_coords).squeeze(1)
+
+        # dice loss
+        loss_dice = self.loss_dice(
+            mask_point_preds, mask_point_targets, avg_factor=num_total_masks)
+
+        # mask loss
+        # shape (num_total_gts, num_points) -> (num_total_gts * num_points, 1)
+        mask_point_preds = mask_point_preds.reshape(-1, 1)
+        # shape (num_total_gts, num_points) -> (num_total_gts * num_points, )
+        mask_point_targets = mask_point_targets.reshape(-1)
+        loss_mask = self.loss_mask(
+            mask_point_preds,
+            mask_point_targets,
+            avg_factor=num_total_masks * self.num_points)
+
+        return loss_cls, loss_mask, loss_dice
+
+    @force_fp32(apply_to=('all_cls_scores', 'all_mask_preds'))
+    def loss(self, all_cls_scores, all_mask_preds, gt_labels_list,
+             gt_masks_list, img_metas):
+        """Loss function.
+
+        Args:
+            all_cls_scores (Tensor): Classification scores for all decoder
+                layers with shape [num_decoder, batch_size, num_queries,
+                cls_out_channels].
+            all_mask_preds (Tensor): Mask scores for all decoder layers with
+                shape [num_decoder, batch_size, num_queries, h, w].
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (n, ). n is the sum of number of stuff types
+                and number of instances in an image.
+            gt_masks_list (list[Tensor]): Ground truth mask for each image with
+                shape (n, h, w).
+            img_metas (list[dict]): List of image meta information.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        num_dec_layers = len(all_cls_scores)
+        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
+        all_gt_masks_list = [gt_masks_list for _ in range(num_dec_layers)]
+        img_metas_list = [img_metas for _ in range(num_dec_layers)]
+        losses_cls, losses_mask, losses_dice = multi_apply(
+            self.loss_single, all_cls_scores, all_mask_preds,
+            all_gt_labels_list, all_gt_masks_list, img_metas_list)
+
+        loss_dict = dict()
+        # loss from the last decoder layer
+        loss_dict['loss_cls'] = losses_cls[-1]
+        loss_dict['loss_mask'] = losses_mask[-1]
+        loss_dict['loss_dice'] = losses_dice[-1]
+        # losses from the other decoder layers, keyed with a 'd{i}.' prefix
+        num_dec_layer = 0
+        for loss_cls_i, loss_mask_i, loss_dice_i in zip(
+                losses_cls[:-1], losses_mask[:-1], losses_dice[:-1]):
+            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
+            loss_dict[f'd{num_dec_layer}.loss_mask'] = loss_mask_i
+            loss_dict[f'd{num_dec_layer}.loss_dice'] = loss_dice_i
+            num_dec_layer += 1
+        return loss_dict
+
+    def forward_head(self, decoder_out, mask_feature, attn_mask_target_size):
+        """Forward for head part which is called after every decoder layer.
+
+        Args:
+            decoder_out (Tensor): in shape (num_queries, batch_size, c).
+            mask_feature (Tensor): in shape (batch_size, c, h, w).
+            attn_mask_target_size (tuple[int, int]): target attention
+                mask size.
+
+        Returns:
+            tuple: A tuple containing three elements.
+
+            - cls_pred (Tensor): Classification scores in shape \
+                (batch_size, num_queries, cls_out_channels). \
+                Note `cls_out_channels` should include background.
+            - mask_pred (Tensor): Mask scores in shape \
+                (batch_size, num_queries, h, w).
+            - attn_mask (Tensor): Attention mask in shape \
+                (batch_size * num_heads, num_queries, h * w).
+        """
+        decoder_out = self.transformer_decoder.post_norm(decoder_out)
+        decoder_out = decoder_out.transpose(0, 1)
+        # shape (batch_size, num_queries, num_classes + 1)
+        cls_pred = self.cls_embed(decoder_out)
+        # shape (batch_size, num_queries, out_channels)
+        mask_embed = self.mask_embed(decoder_out)
+        # shape (batch_size, num_queries, h, w)
+        mask_pred = torch.einsum('bqc,bchw->bqhw', mask_embed, mask_feature)
+        attn_mask = F.interpolate(
+            mask_pred,
+            attn_mask_target_size,
+            mode='bilinear',
+            align_corners=False)
+        # shape (batch_size, num_queries, h, w) ->
+        #   (batch_size * num_heads, num_queries, h * w)
+        attn_mask = attn_mask.flatten(2).unsqueeze(1).repeat(
+            (1, self.num_heads, 1, 1)).flatten(0, 1)
+        attn_mask = attn_mask.sigmoid() < 0.5
+        attn_mask = attn_mask.detach()
+
+        return cls_pred, mask_pred, attn_mask
+
+    def forward(self, feats, img_metas):
+        """Forward function.
+
+        Args:
+            feats (list[Tensor]): Multi scale Features from the
+                upstream network, each is a 4D-tensor.
+            img_metas (list[dict]): Image infos; only len() is used here.
+
+        Returns:
+            tuple: Two lists of length num_transformer_decoder_layers + 1.
+
+            - cls_pred_list (list[Tensor]): Classification logits \
+                for each decoder layer. Each is a 3D-tensor with shape \
+                (batch_size, num_queries, cls_out_channels). \
+                Note `cls_out_channels` should include background.
+            - mask_pred_list (list[Tensor]): Mask logits for each \
+                decoder layer. Each with shape (batch_size, num_queries, \
+                 h, w).
+        """
+        batch_size = len(img_metas)
+        mask_features, multi_scale_memorys = self.pixel_decoder(feats)
+        # multi_scale_memorys (from low resolution to high resolution)
+        decoder_inputs = []
+        decoder_positional_encodings = []
+        for i in range(self.num_transformer_feat_level):
+            decoder_input = self.decoder_input_projs[i](multi_scale_memorys[i])
+            # shape (batch_size, c, h, w) -> (h*w, batch_size, c)
+            decoder_input = decoder_input.flatten(2).permute(2, 0, 1)
+            level_embed = self.level_embed.weight[i].view(1, 1, -1)
+            decoder_input = decoder_input + level_embed
+            # all-False bool mask of shape (batch_size, h, w)
+            mask = decoder_input.new_zeros(
+                (batch_size, ) + multi_scale_memorys[i].shape[-2:],
+                dtype=torch.bool)
+            decoder_positional_encoding = self.decoder_positional_encoding(
+                mask)
+            decoder_positional_encoding = decoder_positional_encoding.flatten(
+                2).permute(2, 0, 1)
+            decoder_inputs.append(decoder_input)
+            decoder_positional_encodings.append(decoder_positional_encoding)
+        # shape (num_queries, c) -> (num_queries, batch_size, c)
+        query_feat = self.query_feat.weight.unsqueeze(1).repeat(
+            (1, batch_size, 1))
+        query_embed = self.query_embed.weight.unsqueeze(1).repeat(
+            (1, batch_size, 1))
+
+        cls_pred_list = []
+        mask_pred_list = []
+        cls_pred, mask_pred, attn_mask = self.forward_head(
+            query_feat, mask_features, multi_scale_memorys[0].shape[-2:])
+        cls_pred_list.append(cls_pred)
+        mask_pred_list.append(mask_pred)
+
+        for i in range(self.num_transformer_decoder_layers):
+            level_idx = i % self.num_transformer_feat_level
+            # if a mask is all True (all background), then set it all False.
+            attn_mask[torch.where(
+                attn_mask.sum(-1) == attn_mask.shape[-1])] = False
+
+            # cross_attn + self_attn
+            layer = self.transformer_decoder.layers[i]
+            attn_masks = [attn_mask, None]
+            query_feat = layer(
+                query=query_feat,
+                key=decoder_inputs[level_idx],
+                value=decoder_inputs[level_idx],
+                query_pos=query_embed,
+                key_pos=decoder_positional_encodings[level_idx],
+                attn_masks=attn_masks,
+                query_key_padding_mask=None,
+                # here we do not apply masking on padded region
+                key_padding_mask=None)
+            cls_pred, mask_pred, attn_mask = self.forward_head(
+                query_feat, mask_features, multi_scale_memorys[
+                    (i + 1) % self.num_transformer_feat_level].shape[-2:])
+
+            cls_pred_list.append(cls_pred)
+            mask_pred_list.append(mask_pred)
+
+        return cls_pred_list, mask_pred_list
+
+    def forward_train(self, x, img_metas, gt_semantic_seg, gt_labels,
+                      gt_masks):
+        """Forward function for training mode.
+
+        Args:
+            x (list[Tensor]): Multi-level features from the upstream network,
+                each is a 4D-tensor.
+            img_metas (list[Dict]): List of image information.
+            gt_semantic_seg (list[tensor]): Each element is the ground truth
+                of semantic segmentation with the shape (N, H, W).
+                Not read by this method; the loss is computed from
+                ``gt_labels`` and ``gt_masks``.
+            gt_labels (list[Tensor]): Each element is ground truth labels of
+                each box, shape (num_gts,).
+            gt_masks (list[BitmapMasks]): Each element is masks of instances
+                of a image, shape (num_gts, h, w).
+
+        Returns:
+            losses (dict[str, Tensor]): a dictionary of loss components
+        """
+
+        # forward
+        all_cls_scores, all_mask_preds = self(x, img_metas)
+
+        # loss
+        losses = self.loss(all_cls_scores, all_mask_preds, gt_labels, gt_masks,
+                           img_metas)
+
+        return losses
+
+    def forward_test(self, inputs, img_metas, test_cfg):
+        """Test segment without test-time augmentation.
+
+        Only the output of the last decoder layer is used.
+
+        Args:
+            inputs (list[Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            img_metas (list[dict]): List of image information.
+            test_cfg (dict): Testing config; not read by this method.
+
+        Returns:
+            seg_mask (Tensor): Predicted semantic segmentation logits.
+        """
+        all_cls_scores, all_mask_preds = self(inputs, img_metas)
+        cls_score, mask_pred = all_cls_scores[-1], all_mask_preds[-1]
+        ori_h, ori_w, _ = img_metas[0]['ori_shape']
+
+        # semantic inference; ori_h/ori_w above are unpacked but unused
+        cls_score = F.softmax(cls_score, dim=-1)[..., :-1]
+        mask_pred = mask_pred.sigmoid()
+        seg_mask = torch.einsum('bqc,bqhw->bchw', cls_score, mask_pred)
+        return seg_mask
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py
new file mode 100644
index 00000000..18bbce0d
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py
@@ -0,0 +1,5 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
+from .encoder_decoder_mask2former import EncoderDecoderMask2Former
+
+__all__ = ['EncoderDecoderMask2Former']
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py
new file mode 100644
index 00000000..311352c2
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py
@@ -0,0 +1,313 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
+import warnings
+from abc import ABCMeta, abstractmethod
+from collections import OrderedDict
+
+import mmcv
+import numpy as np
+import torch
+import torch.distributed as dist
+from mmcv.runner import BaseModule, auto_fp16
+
+
+class BaseSegmentor(BaseModule, metaclass=ABCMeta):
+    """Base class for segmentors."""
+
+    def __init__(self, init_cfg=None):
+        super(BaseSegmentor, self).__init__(init_cfg)
+        self.fp16_enabled = False  # presumably read by @auto_fp16 - confirm
+
+    @property
+    def with_neck(self):
+        """bool: whether a ``neck`` attribute exists and is not None"""
+        return hasattr(self, 'neck') and self.neck is not None
+
+    @property
+    def with_auxiliary_head(self):
+        """bool: whether ``auxiliary_head`` exists and is not None"""
+        return hasattr(self,
+                       'auxiliary_head') and self.auxiliary_head is not None
+
+    @property
+    def with_decode_head(self):
+        """bool: whether ``decode_head`` exists and is not None"""
+        return hasattr(self, 'decode_head') and self.decode_head is not None
+
+    @abstractmethod
+    def extract_feat(self, imgs):
+        """Placeholder for extracting features from images."""
+        pass
+
+    @abstractmethod
+    def encode_decode(self, img, img_metas):
+        """Placeholder for encoding images with a backbone and decoding into
+        a semantic segmentation map of the same size as the input."""
+        pass
+
+    @abstractmethod
+    def forward_train(self, imgs, img_metas, **kwargs):
+        """Placeholder for the forward function used in training."""
+        pass
+
+    @abstractmethod
+    def simple_test(self, img, img_meta, **kwargs):
+        """Placeholder for the test path used with a single augmentation."""
+        pass
+
+    @abstractmethod
+    def aug_test(self, imgs, img_metas, **kwargs):
+        """Placeholder for the test path used with several augmentations."""
+        pass
+
+    def forward_test(self, imgs, img_metas, **kwargs):
+        """Validate test inputs and dispatch to simple_test or aug_test.
+
+        Args:
+            imgs (List[Tensor]): one NxCxHxW batch tensor per test-time
+                augmentation.
+            img_metas (List[List[dict]]): the outer list indicates test-time
+                augs (multiscale, flip, etc.) and the inner list indicates
+                images in a batch.
+        """
+        for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]:
+            if not isinstance(var, list):
+                raise TypeError(f'{name} must be a list, but got '
+                                f'{type(var)}')
+
+        num_augs = len(imgs)
+        if num_augs != len(img_metas):
+            raise ValueError(f'num of augmentations ({len(imgs)}) != '
+                             f'num of image meta ({len(img_metas)})')
+
+        # all images in the same aug batch must share ori_shape, img_shape
+        # and pad_shape
+        def tensor_to_tuple(input_tensor):
+            return tuple(input_tensor.cpu().numpy())
+
+        for img_meta in img_metas:
+            ori_shapes = [_['ori_shape'] for _ in img_meta]
+            if isinstance(ori_shapes[0], torch.Tensor):
+                assert all(
+                    tensor_to_tuple(shape) == tensor_to_tuple(ori_shapes[0])
+                    for shape in ori_shapes)
+            else:
+                assert all(shape == ori_shapes[0] for shape in ori_shapes)
+
+            img_shapes = [_['img_shape'] for _ in img_meta]
+            if isinstance(img_shapes[0], torch.Tensor):
+                assert all(
+                    tensor_to_tuple(shape) == tensor_to_tuple(img_shapes[0])
+                    for shape in img_shapes)
+            else:
+                assert all(shape == img_shapes[0] for shape in img_shapes)
+
+            pad_shapes = [_['pad_shape'] for _ in img_meta]
+            if isinstance(pad_shapes[0], torch.Tensor):
+                assert all(
+                    tensor_to_tuple(shape) == tensor_to_tuple(pad_shapes[0])
+                    for shape in pad_shapes)
+            else:
+                assert all(shape == pad_shapes[0] for shape in pad_shapes)
+
+        if num_augs == 1:
+            return self.simple_test(imgs[0], img_metas[0], **kwargs)
+        else:
+            return self.aug_test(imgs, img_metas, **kwargs)
+
+    @auto_fp16(apply_to=('img', ))
+    def forward(self, img, img_metas, return_loss=True, **kwargs):
+        """Calls either :func:`forward_train` or :func:`forward_test` depending
+        on whether ``return_loss`` is ``True``.
+
+        Note this setting will change the expected inputs. When
+        ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor
+        and List[dict]), and when ``return_loss=False``, img and img_meta
+        should be double nested (i.e.  List[Tensor], List[List[dict]]), with
+        the outer list indicating test time augmentations.
+        """
+        if return_loss:
+            return self.forward_train(img, img_metas, **kwargs)
+        else:
+            return self.forward_test(img, img_metas, **kwargs)
+
+    def train_step(self, data_batch, optimizer, **kwargs):
+        """The iteration step during training.
+
+        This method defines an iteration step during training, except for the
+        back propagation and optimizer updating, which are done in an optimizer
+        hook. Note that in some complicated cases or models, the whole process
+        including back propagation and optimizer updating is also defined in
+        this method, such as GAN.
+
+        Args:
+            data_batch (dict): The output of dataloader.
+            optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of
+                runner is passed to ``train_step()``. This argument is unused
+                and reserved.
+
+        Returns:
+            dict: It should contain at least 3 keys: ``loss``, ``log_vars``,
+                ``num_samples``.
+                ``loss`` is a tensor for back propagation, which can be a
+                weighted sum of multiple losses.
+                ``log_vars`` contains all the variables to be sent to the
+                logger.
+                ``num_samples`` indicates the batch size (when the model is
+                DDP, it means the batch size on each GPU), which is used for
+                averaging the logs.
+        """
+        losses = self(**data_batch)
+        loss, log_vars = self._parse_losses(losses)
+
+        outputs = dict(
+            loss=loss,
+            log_vars=log_vars,
+            num_samples=len(data_batch['img_metas']))
+
+        return outputs
+
+    def val_step(self, data_batch, optimizer=None, **kwargs):
+        """The iteration step during validation.
+
+        This method shares the same signature as :func:`train_step` but is
+        used during val epochs; every log variable name gets a ``_val``
+        suffix. Evaluation after training epochs uses a hook, not this method.
+        """
+        losses = self(**data_batch)
+        loss, log_vars = self._parse_losses(losses)
+
+        log_vars_ = dict()
+        for loss_name, loss_value in log_vars.items():
+            k = loss_name + '_val'
+            log_vars_[k] = loss_value
+
+        outputs = dict(
+            loss=loss,
+            log_vars=log_vars_,
+            num_samples=len(data_batch['img_metas']))
+
+        return outputs
+
+    @staticmethod
+    def _parse_losses(losses):
+        """Parse the raw outputs (losses) of the network.
+
+        Args:
+            losses (dict): Raw output of the network, which usually contain
+                losses and other necessary information.
+
+        Returns:
+            tuple[Tensor, dict]: (loss, log_vars), loss is the sum of every
+                entry whose key contains 'loss'; log_vars maps each name
+                (plus 'loss') to a Python number obtained via ``.item()``.
+        """
+        log_vars = OrderedDict()
+        for loss_name, loss_value in losses.items():
+            if isinstance(loss_value, torch.Tensor):
+                log_vars[loss_name] = loss_value.mean()
+            elif isinstance(loss_value, list):
+                log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
+            else:
+                raise TypeError(
+                    f'{loss_name} is not a tensor or list of tensors')
+
+        loss = sum(_value for _key, _value in log_vars.items()
+                   if 'loss' in _key)
+
+        # If log_vars has a different length across ranks, raise an assertion
+        # error to prevent GPUs from waiting forever.
+        if dist.is_available() and dist.is_initialized():
+            log_var_length = torch.tensor(len(log_vars), device=loss.device)
+            dist.all_reduce(log_var_length)
+            message = (f'rank {dist.get_rank()}'
+                       + f' len(log_vars): {len(log_vars)}' + ' keys: '
+                       + ','.join(log_vars.keys()) + '\n')
+            assert log_var_length == len(log_vars) * dist.get_world_size(), \
+                'loss log variables are different across GPUs!\n' + message
+
+        log_vars['loss'] = loss
+        for loss_name, loss_value in log_vars.items():
+            # average the loss over ranks when training is distributed
+            if dist.is_available() and dist.is_initialized():
+                loss_value = loss_value.data.clone()
+                dist.all_reduce(loss_value.div_(dist.get_world_size()))
+            log_vars[loss_name] = loss_value.item()
+
+        return loss, log_vars
+
+    def show_result(self,
+                    img,
+                    result,
+                    palette=None,
+                    win_name='',
+                    show=False,
+                    wait_time=0,
+                    out_file=None,
+                    opacity=0.5):
+        """Draw `result` over `img`.
+
+        Args:
+            img (str or Tensor): The image to be displayed.
+            result (Tensor): The semantic segmentation results; only
+                `result[0]` is drawn over `img`.
+            palette (list[list[int]] | np.ndarray | None): The palette of
+                segmentation map. If None, `self.PALETTE` is used, or a
+                seeded random palette if that is also None. Default: None
+            win_name (str): The window name.
+            wait_time (int): Value of waitKey param.
+                Default: 0.
+            show (bool): Whether to show the image.
+                Default: False.
+            out_file (str or None): The filename to write the image.
+                Default: None.
+            opacity(float): Opacity of painted segmentation map.
+                Default 0.5.
+                Must be in (0, 1] range.
+        Returns:
+            img (np.ndarray): Only if neither `show` nor `out_file` is set.
+        """
+        img = mmcv.imread(img)
+        img = img.copy()
+        seg = result[0]
+        if palette is None:
+            if self.PALETTE is None:
+                # Get random state before set seed,
+                # and restore random state later.
+                # It will prevent loss of randomness, as the palette
+                # may be different in each iteration if not specified.
+                # See: https://github.com/open-mmlab/mmdetection/issues/5844
+                state = np.random.get_state()
+                np.random.seed(42)
+                # random palette
+                palette = np.random.randint(
+                    0, 255, size=(len(self.CLASSES), 3))
+                np.random.set_state(state)
+            else:
+                palette = self.PALETTE
+        palette = np.array(palette)
+        assert palette.shape[0] == len(self.CLASSES)
+        assert palette.shape[1] == 3
+        assert len(palette.shape) == 2
+        assert 0 < opacity <= 1.0
+        color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
+        for label, color in enumerate(palette):
+            color_seg[seg == label, :] = color
+        # convert to BGR
+        color_seg = color_seg[..., ::-1]
+
+        img = img * (1 - opacity) + color_seg * opacity
+        img = img.astype(np.uint8)
+        # if out_file specified, do not show image in window
+        if out_file is not None:
+            show = False
+
+        if show:
+            mmcv.imshow(img, win_name, wait_time)
+        if out_file is not None:
+            mmcv.imwrite(img, out_file)
+
+        if not (show or out_file):
+            warnings.warn('show==False and out_file is not specified, only '
+                          'result image will be returned')
+            return img
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py
new file mode 100644
index 00000000..50492374
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py
@@ -0,0 +1,302 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmdet.models import builder
+from mmdet.models.builder import DETECTORS
+
+from ...utils import add_prefix, seg_resize
+from .base_segmentor import BaseSegmentor
+
+
+@DETECTORS.register_module()
+class EncoderDecoderMask2Former(BaseSegmentor):
+    """Encoder Decoder segmentors.
+
+    EncoderDecoder typically consists of backbone, decode_head, auxiliary_head.
+    Note that auxiliary_head is only used for deep supervision during training,
+    which could be dumped during inference.
+    """
+
+    def __init__(self,
+                 backbone,
+                 decode_head,
+                 neck=None,
+                 auxiliary_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        super(EncoderDecoderMask2Former, self).__init__(init_cfg)
+        if pretrained is not None:
+            assert backbone.get('pretrained') is None, \
+                'both backbone and segmentor set pretrained weight'
+            backbone.pretrained = pretrained  # needs attribute-style cfg
+        self.backbone = builder.build_backbone(backbone)
+        if neck is not None:
+            self.neck = builder.build_neck(neck)
+        decode_head.update(train_cfg=train_cfg)  # forward cfgs to the head
+        decode_head.update(test_cfg=test_cfg)
+        self._init_decode_head(decode_head)
+        self._init_auxiliary_head(auxiliary_head)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        assert self.with_decode_head
+
+    def _init_decode_head(self, decode_head):
+        """Build ``decode_head`` and cache align_corners / num_classes."""
+        self.decode_head = builder.build_head(decode_head)
+        self.align_corners = self.decode_head.align_corners
+        self.num_classes = self.decode_head.num_classes
+
+    def _init_auxiliary_head(self, auxiliary_head):
+        """Build ``auxiliary_head`` (a ModuleList when given a list)."""
+        if auxiliary_head is not None:
+            if isinstance(auxiliary_head, list):
+                self.auxiliary_head = nn.ModuleList()
+                for head_cfg in auxiliary_head:
+                    self.auxiliary_head.append(builder.build_head(head_cfg))
+            else:
+                self.auxiliary_head = builder.build_head(auxiliary_head)
+
+    def extract_feat(self, img):
+        """Extract features with the backbone, then the neck if present."""
+        x = self.backbone(img)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
+
+    def encode_decode(self, img, img_metas):
+        """Encode images with backbone and decode into a semantic segmentation
+        map resized (bilinear) to the spatial size of ``img``."""
+        x = self.extract_feat(img)
+        out = self._decode_head_forward_test(x, img_metas)
+        out = seg_resize(
+            input=out,
+            size=img.shape[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+        return out
+
+    def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg,
+                                   **kwargs):
+        """Run the decode head's forward_train and return its losses with
+        the 'decode' prefix applied via add_prefix."""
+        losses = dict()
+        loss_decode = self.decode_head.forward_train(x, img_metas,
+                                                     gt_semantic_seg, **kwargs)
+
+        losses.update(add_prefix(loss_decode, 'decode'))
+        return losses
+
+    def _decode_head_forward_test(self, x, img_metas):
+        """Run the decode head's forward_test with ``self.test_cfg`` and
+        return its output (no loss is computed here)."""
+        seg_logits = self.decode_head.forward_test(x, img_metas, self.test_cfg)
+        return seg_logits
+
+    def _auxiliary_head_forward_train(self, x, img_metas, gt_semantic_seg):
+        """Run forward function and calculate loss for auxiliary head(s) in
+        training; loss keys get an 'aux' or 'aux_{idx}' prefix."""
+        losses = dict()
+        if isinstance(self.auxiliary_head, nn.ModuleList):
+            for idx, aux_head in enumerate(self.auxiliary_head):
+                loss_aux = aux_head.forward_train(x, img_metas,
+                                                  gt_semantic_seg,
+                                                  self.train_cfg)
+                losses.update(add_prefix(loss_aux, f'aux_{idx}'))
+        else:
+            loss_aux = self.auxiliary_head.forward_train(
+                x, img_metas, gt_semantic_seg, self.train_cfg)
+            losses.update(add_prefix(loss_aux, 'aux'))
+
+        return losses
+
+    def forward_dummy(self, img):
+        """Dummy forward function; calls encode_decode with img_metas=None."""
+        seg_logit = self.encode_decode(img, None)
+
+        return seg_logit
+
+    def forward_train(self, img, img_metas, gt_semantic_seg, **kwargs):
+        """Forward function for training.
+
+        Args:
+            img (Tensor): Input images.
+            img_metas (list[dict]): List of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:Collect`.
+            gt_semantic_seg (Tensor): Semantic segmentation masks
+                used if the architecture supports semantic segmentation task.
+
+        Returns:
+            dict[str, Tensor]: decode losses plus auxiliary losses if present
+        """
+
+        x = self.extract_feat(img)
+
+        losses = dict()
+
+        # extra keyword arguments are forwarded to the decode head only
+        loss_decode = self._decode_head_forward_train(x, img_metas,
+                                                      gt_semantic_seg,
+                                                      **kwargs)
+        losses.update(loss_decode)
+
+        if self.with_auxiliary_head:
+            loss_aux = self._auxiliary_head_forward_train(
+                x, img_metas, gt_semantic_seg)
+            losses.update(loss_aux)
+
+        return losses
+
+    # TODO refactor
+    def slide_inference(self, img, img_meta, rescale):
+        """Inference by sliding-window with overlap; overlaps are averaged.
+
+        If h_crop > h_img or w_crop > w_img, the smaller patch is decoded
+        without padding. With rescale, output is resized to ori_shape[:2].
+        """
+
+        h_stride, w_stride = self.test_cfg.stride
+        h_crop, w_crop = self.test_cfg.crop_size
+        batch_size, _, h_img, w_img = img.size()
+        num_classes = self.num_classes
+        h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1
+        w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1
+        preds = img.new_zeros((batch_size, num_classes, h_img, w_img))
+        count_mat = img.new_zeros((batch_size, 1, h_img, w_img))
+        for h_idx in range(h_grids):
+            for w_idx in range(w_grids):
+                y1 = h_idx * h_stride
+                x1 = w_idx * w_stride
+                y2 = min(y1 + h_crop, h_img)
+                x2 = min(x1 + w_crop, w_img)
+                y1 = max(y2 - h_crop, 0)
+                x1 = max(x2 - w_crop, 0)
+                crop_img = img[:, :, y1:y2, x1:x2]
+                crop_seg_logit = self.encode_decode(crop_img, img_meta)
+                preds += F.pad(crop_seg_logit,
+                               (int(x1), int(preds.shape[3] - x2), int(y1),
+                                int(preds.shape[2] - y2)))
+
+                count_mat[:, :, y1:y2, x1:x2] += 1
+        assert (count_mat == 0).sum() == 0
+        if torch.onnx.is_in_onnx_export():
+            # cast count_mat to constant while exporting to ONNX
+            count_mat = torch.from_numpy(
+                count_mat.cpu().detach().numpy()).to(device=img.device)
+        preds = preds / count_mat
+
+        def tensor_to_tuple(input_tensor):
+            return tuple(input_tensor.cpu().numpy())
+
+        if rescale:
+            preds = seg_resize(
+                preds,
+                size=tensor_to_tuple(img_meta[0]['ori_shape'])[:2]
+                if isinstance(img_meta[0]['ori_shape'], torch.Tensor) else
+                img_meta[0]['ori_shape'][:2],
+                mode='bilinear',
+                align_corners=self.align_corners,
+                warning=False)
+        return preds
+
+    def whole_inference(self, img, img_meta, rescale):
+        """Inference with full image; ori_shape may be a sequence or Tensor."""
+
+        seg_logit = self.encode_decode(img, img_meta)
+        if rescale:
+            # support dynamic shape for onnx
+            if torch.onnx.is_in_onnx_export():
+                size = img.shape[2:]
+            else:
+                size = tuple(int(s) for s in img_meta[0]['ori_shape'][:2])
+            seg_logit = seg_resize(
+                seg_logit,
+                size=size,
+                mode='bilinear',
+                align_corners=self.align_corners,
+                warning=False)
+
+        return seg_logit
+
+    def inference(self, img, img_meta, rescale):
+        """Inference with slide/whole style.
+
+        Args:
+            img (Tensor): The input image of shape (N, 3, H, W).
+            img_meta (list[dict]): Image info dicts, each has: 'img_shape',
+                'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:Collect`.
+            rescale (bool): Whether rescale back to original shape.
+
+        Returns:
+            Tensor: Softmax over dim 1 of the logits, flipped if `flip` is set.
+        """
+
+        assert self.test_cfg.mode in ['slide', 'whole']
+        ori_shape = img_meta[0]['ori_shape']
+
+        def tensor_to_tuple(input_tensor):
+            return tuple(input_tensor.cpu().numpy())
+
+        if isinstance(ori_shape, torch.Tensor):
+            assert all(
+                tensor_to_tuple(_['ori_shape']) == tensor_to_tuple(ori_shape)
+                for _ in img_meta)
+        else:
+            assert all(_['ori_shape'] == ori_shape for _ in img_meta)
+        if self.test_cfg.mode == 'slide':
+            seg_logit = self.slide_inference(img, img_meta, rescale)
+        else:
+            seg_logit = self.whole_inference(img, img_meta, rescale)
+        output = F.softmax(seg_logit, dim=1)
+        flip = img_meta[0]['flip']
+        if flip:
+            flip_direction = img_meta[0]['flip_direction']
+            assert flip_direction in ['horizontal', 'vertical']
+            if flip_direction == 'horizontal':
+                output = output.flip(dims=(3, ))
+            elif flip_direction == 'vertical':
+                output = output.flip(dims=(2, ))
+
+        return output
+
+    def simple_test(self, img, img_meta, rescale=True):
+        """Single-augmentation test; returns per-image argmax label maps."""
+        seg_logit = self.inference(img, img_meta, rescale)
+        seg_pred = seg_logit.argmax(dim=1)
+        if torch.onnx.is_in_onnx_export():
+            # our inference backend only support 4D output
+            seg_pred = seg_pred.unsqueeze(0)
+            return seg_pred
+        seg_pred = seg_pred.cpu().numpy()
+        # unravel batch dim
+        seg_pred = list(seg_pred)
+        return seg_pred
+
+    def aug_test(self, imgs, img_metas, rescale=True):
+        """Test with augmentations by averaging ``inference`` outputs.
+
+        Only rescale=True is supported.
+        """
+        # aug_test rescale all imgs back to ori_shape for now
+        assert rescale
+        # to save memory, accumulate into the first output in place
+        seg_logit = self.inference(imgs[0], img_metas[0], rescale)
+        for i in range(1, len(imgs)):
+            cur_seg_logit = self.inference(imgs[i], img_metas[i], rescale)
+            seg_logit += cur_seg_logit
+        seg_logit /= len(imgs)
+        seg_pred = seg_logit.argmax(dim=1)
+        seg_pred = seg_pred.cpu().numpy()
+        # unravel batch dim
+        seg_pred = list(seg_pred)
+        return seg_pred
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py
new file mode 100644
index 00000000..9c4d5c4c
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py
@@ -0,0 +1,9 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
+from .builder import build_pixel_sampler
+from .data_process_func import ResizeToMultiple
+from .seg_func import add_prefix, seg_resize
+
+__all__ = [
+    'seg_resize', 'add_prefix', 'build_pixel_sampler', 'ResizeToMultiple'
+]
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py
new file mode 100644
index 00000000..0603ef94
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py
@@ -0,0 +1,10 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
+from mmcv.utils import Registry, build_from_cfg
+
+PIXEL_SAMPLERS = Registry('pixel sampler')  # looked up by build_pixel_sampler
+
+
+def build_pixel_sampler(cfg, **default_args):
+    """Build a pixel sampler from ``cfg`` via the ``PIXEL_SAMPLERS`` registry."""
+    return build_from_cfg(cfg, PIXEL_SAMPLERS, default_args)
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/data_process_func.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/data_process_func.py
new file mode 100644
index 00000000..194361af
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/data_process_func.py
@@ -0,0 +1,60 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+from mmdet.datasets.builder import PIPELINES
+
+
+@PIPELINES.register_module()
+class ResizeToMultiple(object):
+    """Resize images & seg to multiple of divisor.
+
+    Args:
+        size_divisor (int): images and gt seg maps need to resize to multiple
+            of size_divisor. Default: 32.
+        interpolation (str, optional): The interpolation mode of image resize.
+            Default: None, in which case 'bilinear' is used.
+    """
+
+    def __init__(self, size_divisor=32, interpolation=None):
+        self.size_divisor = size_divisor
+        self.interpolation = interpolation
+
+    def __call__(self, results):
+        """Call function to resize images, semantic segmentation map to
+        multiple of size divisor.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Resized results, 'img_shape', 'pad_shape' keys are updated.
+        """
+        # Align image to multiple of size divisor.
+        img = results['img']
+        img = mmcv.imresize_to_multiple(
+            img,
+            self.size_divisor,
+            scale_factor=1,
+            interpolation=self.interpolation
+            if self.interpolation else 'bilinear')
+
+        results['img'] = img
+        results['img_shape'] = img.shape
+        results['pad_shape'] = img.shape  # set to the same value as img_shape
+
+        # Align segmentation map to multiple of size divisor.
+        for key in results.get('seg_fields', []):  # no-op when the key is absent
+            gt_seg = results[key]
+            gt_seg = mmcv.imresize_to_multiple(
+                gt_seg,
+                self.size_divisor,
+                scale_factor=1,
+                interpolation='nearest')
+            results[key] = gt_seg
+
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += (f'(size_divisor={self.size_divisor}, '
+                     f'interpolation={self.interpolation})')
+        return repr_str
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py
new file mode 100644
index 00000000..db564cca
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py
@@ -0,0 +1,47 @@
+# The implementation is adopted from VitAdapter,
+# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git
+
+import warnings
+
+import torch.nn.functional as F
+
+
+def seg_resize(input,
+               size=None,
+               scale_factor=None,
+               mode='nearest',
+               align_corners=None,
+               warning=True):
+    if warning:  # optional sanity check; it never changes the returned value
+        if size is not None and align_corners:
+            input_h, input_w = tuple(int(x) for x in input.shape[2:])
+            output_h, output_w = tuple(int(x) for x in size)
+            if output_h > input_h or output_w > input_w:  # upsampling only
+                if ((output_h > 1 and output_w > 1 and input_h > 1
+                     and input_w > 1) and (output_h - 1) % (input_h - 1)
+                        and (output_w - 1) % (input_w - 1)):
+                    warnings.warn(
+                        f'When align_corners={align_corners}, '
+                        'the output would more aligned if '
+                        f'input size {(input_h, input_w)} is `x+1` and '
+                        f'out size {(output_h, output_w)} is `nx+1`')
+    return F.interpolate(input, size, scale_factor, mode, align_corners)  # args forwarded as-is
+
+
+def add_prefix(inputs, prefix):
+    """Add prefix for dict.
+
+    Args:
+        inputs (dict): The input dict with str keys.
+        prefix (str): The prefix to add.
+
+    Returns:
+        dict: A new dict whose keys are ``'<prefix>.<name>'``; ``inputs``
+            itself is not modified.
+    """
+
+    outputs = dict()
+    for name, value in inputs.items():
+        outputs[f'{prefix}.{name}'] = value  # prefix and key joined by a dot
+
+    return outputs
diff --git a/modelscope/models/cv/image_to_image_generation/model.py b/modelscope/models/cv/image_to_image_generation/model.py
index 37479b43..94e5dd7b 100644
--- a/modelscope/models/cv/image_to_image_generation/model.py
+++ b/modelscope/models/cv/image_to_image_generation/model.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_generation/models/autoencoder.py b/modelscope/models/cv/image_to_image_generation/models/autoencoder.py
index 181472de..dce256f6 100644
--- a/modelscope/models/cv/image_to_image_generation/models/autoencoder.py
+++ b/modelscope/models/cv/image_to_image_generation/models/autoencoder.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_generation/models/clip.py b/modelscope/models/cv/image_to_image_generation/models/clip.py
index 35d9d882..d3dd22b4 100644
--- a/modelscope/models/cv/image_to_image_generation/models/clip.py
+++ b/modelscope/models/cv/image_to_image_generation/models/clip.py
@@ -1,3 +1,5 @@
+# Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_generation/ops/diffusion.py b/modelscope/models/cv/image_to_image_generation/ops/diffusion.py
index bcbb6402..b8ffbbbb 100644
--- a/modelscope/models/cv/image_to_image_generation/ops/diffusion.py
+++ b/modelscope/models/cv/image_to_image_generation/ops/diffusion.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_generation/ops/losses.py b/modelscope/models/cv/image_to_image_generation/ops/losses.py
index 23e8d246..46b9540a 100644
--- a/modelscope/models/cv/image_to_image_generation/ops/losses.py
+++ b/modelscope/models/cv/image_to_image_generation/ops/losses.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_translation/data/transforms.py b/modelscope/models/cv/image_to_image_translation/data/transforms.py
index 5376d813..29a25b4b 100644
--- a/modelscope/models/cv/image_to_image_translation/data/transforms.py
+++ b/modelscope/models/cv/image_to_image_translation/data/transforms.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 import random
 
diff --git a/modelscope/models/cv/image_to_image_translation/model_translation.py b/modelscope/models/cv/image_to_image_translation/model_translation.py
index 722b175d..f2a9e7db 100644
--- a/modelscope/models/cv/image_to_image_translation/model_translation.py
+++ b/modelscope/models/cv/image_to_image_translation/model_translation.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_translation/models/autoencoder.py b/modelscope/models/cv/image_to_image_translation/models/autoencoder.py
index 181472de..dce256f6 100644
--- a/modelscope/models/cv/image_to_image_translation/models/autoencoder.py
+++ b/modelscope/models/cv/image_to_image_translation/models/autoencoder.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_translation/models/clip.py b/modelscope/models/cv/image_to_image_translation/models/clip.py
index 35d9d882..d3dd22b4 100644
--- a/modelscope/models/cv/image_to_image_translation/models/clip.py
+++ b/modelscope/models/cv/image_to_image_translation/models/clip.py
@@ -1,3 +1,5 @@
+# Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_translation/ops/apps.py b/modelscope/models/cv/image_to_image_translation/ops/apps.py
index ee4be489..39d2e015 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/apps.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/apps.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 # APPs that facilitate the use of pretrained neural networks.
 
 import os.path as osp
diff --git a/modelscope/models/cv/image_to_image_translation/ops/degradation.py b/modelscope/models/cv/image_to_image_translation/ops/degradation.py
index c3b3d1df..9061e7be 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/degradation.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/degradation.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 import os
 import random
diff --git a/modelscope/models/cv/image_to_image_translation/ops/diffusion.py b/modelscope/models/cv/image_to_image_translation/ops/diffusion.py
index bcbb6402..5ff37dc3 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/diffusion.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/diffusion.py
@@ -1,3 +1,6 @@
+# Part of the implementation is borrowed and modified from latent-diffusion,
+# publicly available at https://github.com/CompVis/latent-diffusion.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_translation/ops/losses.py b/modelscope/models/cv/image_to_image_translation/ops/losses.py
index 23e8d246..46b9540a 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/losses.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/losses.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/cv/image_to_image_translation/ops/metrics.py b/modelscope/models/cv/image_to_image_translation/ops/metrics.py
index 4a63c51f..c1023fa0 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/metrics.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/metrics.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import numpy as np
 import scipy.linalg as linalg
 import torch
diff --git a/modelscope/models/cv/image_to_image_translation/ops/random_color.py b/modelscope/models/cv/image_to_image_translation/ops/random_color.py
index 97e2f848..75692836 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/random_color.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/random_color.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import colorsys
 import random
 
diff --git a/modelscope/models/cv/image_to_image_translation/ops/random_mask.py b/modelscope/models/cv/image_to_image_translation/ops/random_mask.py
index a6b55916..bda1ec11 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/random_mask.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/random_mask.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import cv2
 import numpy as np
 
diff --git a/modelscope/models/cv/image_to_image_translation/ops/svd.py b/modelscope/models/cv/image_to_image_translation/ops/svd.py
index c5173de1..96f7e825 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/svd.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/svd.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 r"""SVD of linear degradation matrices described in the paper
     ``Denoising Diffusion Restoration Models.''
     @article{kawar2022denoising,
diff --git a/modelscope/models/cv/image_to_image_translation/ops/utils.py b/modelscope/models/cv/image_to_image_translation/ops/utils.py
index 3e523f4c..c2aacedc 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/utils.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/utils.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import base64
 import binascii
 import hashlib
diff --git a/modelscope/models/cv/movie_scene_segmentation/__init__.py b/modelscope/models/cv/movie_scene_segmentation/__init__.py
new file mode 100644
index 00000000..25dcda96
--- /dev/null
+++ b/modelscope/models/cv/movie_scene_segmentation/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+
+    from .model import MovieSceneSegmentationModel
+    from .datasets import MovieSceneSegmentationDataset
+
+else:
+    _import_structure = {
+        'model': ['MovieSceneSegmentationModel'],
+        'datasets': ['MovieSceneSegmentationDataset'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/movie_scene_segmentation/get_model.py b/modelscope/models/cv/movie_scene_segmentation/get_model.py
new file mode 100644
index 00000000..5c66fc02
--- /dev/null
+++ b/modelscope/models/cv/movie_scene_segmentation/get_model.py
@@ -0,0 +1,45 @@
+# ------------------------------------------------------------------------------------
+# BaSSL
+# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# Github: https://github.com/kakaobrain/bassl
+# ------------------------------------------------------------------------------------
+
+from .utils.shot_encoder import resnet50
+from .utils.trn import TransformerCRN
+
+
+def get_shot_encoder(cfg):
+    name = cfg['model']['shot_encoder']['name']  # selects the encoder family
+    shot_encoder_args = cfg['model']['shot_encoder'][name]  # its sub-config
+    if name == 'resnet':
+        depth = shot_encoder_args['depth']
+        if depth == 50:
+            shot_encoder = resnet50(**shot_encoder_args['params'], )
+        else:
+            raise NotImplementedError  # only depth 50 is handled here
+    else:
+        raise NotImplementedError  # only the 'resnet' name is handled here
+
+    return shot_encoder
+
+
+def get_contextual_relation_network(cfg):
+    crn = None  # stays None when the network is disabled in cfg
+
+    if cfg['model']['contextual_relation_network']['enabled']:
+        name = cfg['model']['contextual_relation_network']['name']
+        crn_args = cfg['model']['contextual_relation_network']['params'][name]
+        if name == 'trn':
+            sampling_name = cfg['model']['loss']['sampling_method']['name']
+            crn_args['neighbor_size'] = (  # twice the sampler's neighbor_size
+                2 * cfg['model']['loss']['sampling_method']['params']
+                [sampling_name]['neighbor_size'])
+            crn = TransformerCRN(crn_args)
+        else:
+            raise NotImplementedError  # only the 'trn' name is handled here
+
+    return crn
+
+
+__all__ = ['get_shot_encoder', 'get_contextual_relation_network']
diff --git a/modelscope/models/cv/movie_scene_segmentation/model.py b/modelscope/models/cv/movie_scene_segmentation/model.py
new file mode 100644
index 00000000..1232d427
--- /dev/null
+++ b/modelscope/models/cv/movie_scene_segmentation/model.py
@@ -0,0 +1,195 @@
+# The implementation here is modified based on BaSSL,
+# originally Apache 2.0 License and publicly available at https://github.com/kakaobrain/bassl
+
+import os
+import os.path as osp
+from typing import Any, Dict
+
+import einops
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.transforms as TF
+from PIL import Image
+from shotdetect_scenedetect_lgss import shot_detect
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .get_model import get_contextual_relation_network, get_shot_encoder
+from .utils.save_op import get_pred_boundary, pred2scene, scene2video
+
+logger = get_logger()
+
+
+@MODELS.register_module(
+    Tasks.movie_scene_segmentation, module_name=Models.resnet50_bert)
+class MovieSceneSegmentationModel(TorchModel):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """Load the checkpoint and configuration stored under ``model_dir``."""
+        super().__init__(model_dir, *args, **kwargs)
+
+        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+        params = torch.load(model_path, map_location='cpu')
+
+        config_path = osp.join(model_dir, ModelFile.CONFIGURATION)
+        self.cfg = Config.from_file(config_path)
+
+        def load_param_with_prefix(prefix, model, src_params):
+            own_state = model.state_dict()
+            for name, param in own_state.items():
+                src_name = prefix + '.' + name  # checkpoint key of this tensor
+                own_state[name] = src_params[src_name]
+
+            model.load_state_dict(own_state)
+
+        self.shot_encoder = get_shot_encoder(self.cfg)
+        load_param_with_prefix('shot_encoder', self.shot_encoder, params)
+        self.crn = get_contextual_relation_network(self.cfg)
+        load_param_with_prefix('crn', self.crn, params)
+
+        crn_name = self.cfg.model.contextual_relation_network.name
+        hdim = self.cfg.model.contextual_relation_network.params[crn_name][
+            'hidden_size']
+        self.head_sbd = nn.Linear(hdim, 2)  # 2-way boundary classifier
+        load_param_with_prefix('head_sbd', self.head_sbd, params)
+
+        self.test_transform = TF.Compose([
+            TF.Resize(size=256, interpolation=Image.BICUBIC),
+            TF.CenterCrop(224),
+            TF.ToTensor(),
+            TF.Normalize(
+                mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        ])
+
+        self.infer_result = {'vid': [], 'sid': [], 'pred': []}
+        sampling_method = self.cfg.dataset.sampling_method.name
+        self.neighbor_size = self.cfg.dataset.sampling_method.params[
+            sampling_method].neighbor_size
+
+        self.eps = 1e-5  # keeps the class-weight denominators non-zero
+
+    def forward(self, inputs: Dict[str, Any]) -> Dict[str, torch.Tensor]:
+        data = inputs['video']
+        labels = inputs['label']
+        outputs = self.shared_step(data)
+
+        loss = F.cross_entropy(
+            outputs.squeeze(), labels.squeeze(), reduction='none')
+        lpos = labels == 1
+        lneg = labels == 0
+
+        pp, pn = 1, 1  # equal class weights; `pn` avoids shadowing module `nn`
+        wp = (pp / float(pp + pn)) * lpos / (lpos.sum() + self.eps)
+        wn = (pn / float(pp + pn)) * lneg / (lneg.sum() + self.eps)
+        w = wp + wn
+        loss = (w * loss).sum()
+
+        probs = torch.argmax(outputs, dim=1)  # predicted class ids
+
+        re = dict(pred=probs, loss=loss)
+        return re
+
+    def inference(self, batch):
+        logger.info('Begin scene detect ......')
+        bs = self.cfg.pipeline.batch_size_per_gpu
+        sids, inputs = batch['sid'], batch['shot_feat']
+        # start from a clean accumulator so repeated calls do not mix results
+        self.infer_result = {'vid': [], 'sid': [], 'pred': []}
+        shot_num = len(sids)
+        cnt = (shot_num + bs - 1) // bs  # ceil: no empty trailing mini-batch
+
+        for i in range(cnt):
+            start = i * bs
+            end = min(start + bs, shot_num)
+            input_ = inputs[start:end]
+            sid_ = sids[start:end]
+            input_ = torch.stack(input_)
+            outputs = self.shared_step(input_)  # shape [b,2]
+            prob = F.softmax(outputs, dim=1)
+            self.infer_result['sid'].extend(sid_.cpu().detach().numpy())
+            self.infer_result['pred'].extend(prob[:, 1].cpu().detach().numpy())
+        self.infer_result['pred'] = np.stack(self.infer_result['pred'])
+
+        assert len(self.infer_result['sid']) == len(sids)
+        assert len(self.infer_result['pred']) == len(inputs)
+        return self.infer_result
+
+    def shared_step(self, inputs):
+        with torch.no_grad():
+            # infer shot encoder
+            shot_repr = self.extract_shot_representation(inputs)
+            assert len(shot_repr.shape) == 3
+
+        # infer CRN
+        _, pooled = self.crn(shot_repr, mask=None)
+        # infer boundary score
+        pred = self.head_sbd(pooled)
+        return pred
+
+    def save_shot_feat(self, _repr):
+        feat = _repr.float().cpu().numpy()
+        pth = self.cfg.dataset.img_path + '/features'
+        os.makedirs(pth, exist_ok=True)  # tolerate an already existing folder
+
+        for idx in range(_repr.shape[0]):
+            name = f'shot_{str(idx).zfill(4)}.npy'
+            name = osp.join(pth, name)
+            np.save(name, feat[idx])
+
+    def extract_shot_representation(self,
+                                    inputs: torch.Tensor) -> torch.Tensor:
+        """ inputs [b s k c h w] -> output [b s d] (mean over k keyframes) """
+        assert len(inputs.shape) == 6  # (B Shot Keyframe C H W)
+        b, s, k, c, h, w = inputs.shape
+        inputs = einops.rearrange(inputs, 'b s k c h w -> (b s) k c h w', s=s)
+        keyframe_repr = [self.shot_encoder(inputs[:, _k]) for _k in range(k)]
+        # [k (b s) d] -> [(b s) d]
+        shot_repr = torch.stack(keyframe_repr).mean(dim=0)
+
+        shot_repr = einops.rearrange(shot_repr, '(b s) d -> b s d', s=s)
+        return shot_repr
+
+    def postprocess(self, inputs: Dict[str, Any], **kwargs):
+        logger.info('Generate scene .......')
+
+        pred_dict = inputs['feat']
+        thres = self.cfg.pipeline.save_threshold
+
+        anno_dict = get_pred_boundary(pred_dict, thres)
+        scene_dict_lst, scene_list = pred2scene(self.shot2keyf, anno_dict)  # needs preprocess() first
+        if self.cfg.pipeline.save_split_scene:
+            re_dir = scene2video(inputs['input_video_pth'], scene_list, thres)
+            print(f'Split scene video saved to {re_dir}')
+        return len(scene_list), scene_dict_lst
+
+    def preprocess(self, inputs):
+        logger.info('Begin shot detect......')
+        shot_keyf_lst, anno, shot2keyf = shot_detect(
+            inputs, **self.cfg.preprocessor.shot_detect)
+        logger.info('Shot detect done!')
+
+        single_shot_feat, sid = [], []
+        for idx, one_shot in enumerate(shot_keyf_lst):
+            one_shot = [
+                self.test_transform(one_frame) for one_frame in one_shot
+            ]
+            one_shot = torch.stack(one_shot, dim=0)
+            single_shot_feat.append(one_shot)
+            sid.append(idx)
+        single_shot_feat = torch.stack(single_shot_feat, dim=0)
+        shot_feat = []
+        for idx, one_shot in enumerate(anno):
+            shot_idx = int(one_shot['shot_id']) + np.arange(
+                -self.neighbor_size, self.neighbor_size + 1)
+            shot_idx = np.clip(shot_idx, 0, one_shot['num_shot'])  # NOTE(review): confirm num_shot is the last valid index
+            _one_shot = single_shot_feat[shot_idx]
+            shot_feat.append(_one_shot)
+        self.shot2keyf = shot2keyf  # kept for postprocess()
+        self.anno = anno
+        return shot_feat, sid
diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/__init__.py b/modelscope/models/cv/movie_scene_segmentation/utils/__init__.py
new file mode 100644
index 00000000..e5a929aa
--- /dev/null
+++ b/modelscope/models/cv/movie_scene_segmentation/utils/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .save_op import get_pred_boundary, pred2scene, scene2video
+from .shot_encoder import resnet50
+from .trn import TransformerCRN
diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/head.py b/modelscope/models/cv/movie_scene_segmentation/utils/head.py
new file mode 100644
index 00000000..d6468c53
--- /dev/null
+++ b/modelscope/models/cv/movie_scene_segmentation/utils/head.py
@@ -0,0 +1,25 @@
+# The implementation here is modified based on BaSSL,
+# originally Apache 2.0 License and publicly available at https://github.com/kakaobrain/bassl
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class MlpHead(nn.Module):
+
+    def __init__(self, input_dim=2048, hidden_dim=2048, output_dim=128):
+        super().__init__()
+        self.output_dim = output_dim
+        self.input_dim = input_dim
+        self.hidden_dim = hidden_dim
+
+        self.model = nn.Sequential(  # Linear -> ReLU -> Linear
+            nn.Linear(self.input_dim, self.hidden_dim, bias=True),
+            nn.ReLU(),
+            nn.Linear(self.hidden_dim, self.output_dim, bias=True),
+        )
+
+    def forward(self, x):
+        # x shape: [b t d] where t means the number of views
+        x = self.model(x)
+        return F.normalize(x, dim=-1)  # normalise along the last dimension
diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
new file mode 100644
index 00000000..b350ff13
--- /dev/null
+++ b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
@@ -0,0 +1,119 @@
+# The implementation here is modified based on SceneSeg,
+# originally Apache 2.0 License and publicly available at https://github.com/AnyiRao/SceneSeg
+import os
+import os.path as osp
+import subprocess
+
+import cv2
+import numpy as np
+from tqdm import tqdm
+
+
+def get_pred_boundary(pred_dict, threshold=0.5):
+    pred = pred_dict['pred']
+    tmp = (pred > threshold).astype(np.int32)  # 1 where pred exceeds threshold
+    anno_dict = {}
+    for idx in range(len(tmp)):
+        anno_dict.update({str(pred_dict['sid'][idx]).zfill(4): int(tmp[idx])})
+    return anno_dict  # zero-padded (width 4) sid string -> 0/1 label
+
+
+def pred2scene(shot2keyf, anno_dict):
+    scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict)
+
+    scene_dict_lst = []
+    assert len(scene_list) == len(pair_list)  # one pair_list entry per scene
+    for scene_ind, scene_item in enumerate(scene_list):
+        scene_dict_lst.append({
+            'shot': pair_list[scene_ind],  # pair_list entry of this scene
+            'frame': scene_item[0],  # first element of the scene item
+            'timestamp': scene_item[1]  # second element of the scene item
+        })
+
+    return scene_dict_lst, scene_list
+
+
+def scene2video(source_movie_fn, scene_list, thres):
+    """Cut ``source_movie_fn`` into one mp4 per scene with ffmpeg; return the 'pred_result' dir."""
+    vcap = cv2.VideoCapture(source_movie_fn)
+    fps = vcap.get(cv2.CAP_PROP_FPS)  # video.fps
+    vcap.release()  # only the frame rate is needed; free the capture handle
+    out_video_dir_fn = os.path.join(os.getcwd(),
+                                    f'pred_result/scene_video_{thres}')
+    os.makedirs(out_video_dir_fn, exist_ok=True)
+
+    for scene_ind, scene_item in tqdm(enumerate(scene_list)):
+        scene = str(scene_ind).zfill(4)
+        start_frame = int(scene_item[0][0])
+        end_frame = int(scene_item[0][1])
+        start_time, end_time = start_frame / fps, end_frame / fps
+        duration_time = end_time - start_time
+        out_video_fn = os.path.join(out_video_dir_fn,
+                                    'scene_{}.mp4'.format(scene))
+        if os.path.exists(out_video_fn):  # clips already on disk are kept
+            continue
+        call_list = ['ffmpeg', '-v', 'quiet']
+        call_list += [
+            '-y', '-ss',
+            str(start_time), '-t',
+            str(duration_time), '-i', source_movie_fn
+        ]
+        call_list += ['-map_chapters', '-1']
+        call_list += [out_video_fn]
+        subprocess.call(call_list)  # argv list, no shell involved
+    return osp.join(os.getcwd(), 'pred_result')
+
+
+def get_demo_scene_list(shot2keyf, anno_dict):
+    pair_list = get_pair_list(anno_dict)
+
+    scene_list = []
+    for pair in pair_list:
+        start_shot, end_shot = int(pair[0]), int(pair[-1])  # first/last id in pair
+        start_frame = shot2keyf[start_shot].split(' ')[0]  # 1st space-separated field
+        end_frame = shot2keyf[end_shot].split(' ')[1]  # 2nd field
+        start_timestamp = shot2keyf[start_shot].split(' ')[-2]  # next-to-last field
+        end_timestamp = shot2keyf[end_shot].split(' ')[-1]  # last field
+        scene_list.append([[start_frame, end_frame],
+                           [start_timestamp, end_timestamp]])
+    return scene_list, pair_list
+
+
+def get_pair_list(anno_dict):
+    """Group shot ids into scenes; a shot labelled 1 closes the current scene.
+
+    ``anno_dict`` maps shot-id strings to 0/1 boundary labels. Returns a list
+    of shot-id lists in sorted-key order, or ``[]`` when the dict is empty.
+    NOTE: keys are sorted as strings, so ids must be equally zero-padded.
+    """
+    sort_anno_dict_key = sorted(anno_dict.keys())
+    tmp = 0
+    tmp_list = []
+    anno_list = []
+    for key in sort_anno_dict_key:
+        value = anno_dict.get(key)
+        tmp += value
+        tmp_list.append(key)
+        if tmp == 1:
+            anno_list.append(tmp_list)
+            tmp = 0
+            tmp_list = []
+            continue
+        if key == sort_anno_dict_key[-1]:
+            if len(tmp_list) > 0:
+                anno_list.append(tmp_list)
+    if len(anno_list) == 0:
+        return []  # was None, which broke callers that iterate the result
+    while [] in anno_list:
+        anno_list.remove([])
+    tmp_anno_list = [anno_list[0]]
+    pair_list = []
+    for ind in range(len(anno_list) - 1):
+        cont_count = int(anno_list[ind + 1][0]) - int(anno_list[ind][-1])
+        if cont_count > 1:  # gap in ids: flush the groups collected so far
+            pair_list.extend(tmp_anno_list)
+            tmp_anno_list = [anno_list[ind + 1]]
+            continue
+        tmp_anno_list.append(anno_list[ind + 1])
+    pair_list.extend(tmp_anno_list)  # flushes keep the original group order
+    return pair_list
diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py b/modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py
new file mode 100644
index 00000000..11d20b13
--- /dev/null
+++ b/modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py
@@ -0,0 +1,329 @@
+# The implementation is adopted from torchvision
+
+from typing import Any, Callable, List, Optional, Type, Union
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+
+def conv3x3(in_planes: int,
+            out_planes: int,
+            stride: int = 1,
+            groups: int = 1,
+            dilation: int = 1) -> nn.Conv2d:
+    """Build a bias-free 3x3 convolution whose padding equals ``dilation``.
+
+    Args:
+        in_planes: number of input channels.
+        out_planes: number of output channels.
+        stride, groups, dilation: forwarded unchanged to ``nn.Conv2d``.
+    """
+    conv_options = dict(
+        kernel_size=3, stride=stride, padding=dilation,
+        groups=groups, dilation=dilation, bias=False)
+    return nn.Conv2d(in_planes, out_planes, **conv_options)
+
+
+def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
+    """Build a bias-free 1x1 (pointwise) convolution with the given stride."""
+    pointwise = nn.Conv2d(in_planes, out_planes, 1, stride=stride, bias=False)
+    return pointwise
+
+
+class BasicBlock(nn.Module):
+    """Two-layer 3x3 residual block (adopted from torchvision).
+
+    Args:
+        inplanes: channels of the block input.
+        planes: channels produced by both convolutions.
+        stride: stride of the first convolution.
+        downsample: optional module applied to the input for the shortcut.
+        groups, base_width, dilation: must be 1, 64 and at most 1.
+        norm_layer: norm factory; ``nn.BatchNorm2d`` when ``None``.
+    """
+    expansion: int = 1
+
+    def __init__(
+        self,
+        inplanes: int,
+        planes: int,
+        stride: int = 1,
+        downsample: Optional[nn.Module] = None,
+        groups: int = 1,
+        base_width: int = 64,
+        dilation: int = 1,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        super().__init__()
+        if (groups, base_width) != (1, 64):
+            raise ValueError(
+                'BasicBlock only supports groups=1 and base_width=64')
+        if dilation > 1:
+            raise NotImplementedError(
+                'Dilation > 1 not supported in BasicBlock')
+        make_norm = nn.BatchNorm2d if norm_layer is None else norm_layer
+        # Both self.conv1 and self.downsample downsample when stride != 1
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = make_norm(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = make_norm(planes)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Apply conv-norm-relu, conv-norm, add the shortcut, then ReLU."""
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+        # the shortcut is `x` itself unless a `downsample` module was given
+        shortcut = x if self.downsample is None else self.downsample(x)
+        out += shortcut
+        return self.relu(out)
+
+
+class Bottleneck(nn.Module):
+    """1x1 -> 3x3 -> 1x1 residual block; outputs ``planes * expansion``.
+
+    torchvision places the downsampling stride on the 3x3 convolution
+    (``conv2``) while the original implementation places it on the first 1x1
+    convolution (``conv1``) according to "Deep residual learning for image
+    recognition" https://arxiv.org/abs/1512.03385. This variant is also known
+    as ResNet V1.5 and improves accuracy according to
+    https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
+
+    Args:
+        inplanes: channels of the block input.
+        planes: base channel count of the block.
+        stride: stride of the 3x3 convolution.
+        downsample: optional module applied to the input for the shortcut.
+        groups, dilation: forwarded to the 3x3 convolution.
+        base_width: inner width is ``int(planes * base_width / 64) * groups``.
+        norm_layer: norm factory; ``nn.BatchNorm2d`` when ``None``.
+    """
+    expansion: int = 4
+
+    def __init__(
+        self,
+        inplanes: int,
+        planes: int,
+        stride: int = 1,
+        downsample: Optional[nn.Module] = None,
+        groups: int = 1,
+        base_width: int = 64,
+        dilation: int = 1,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        super().__init__()
+        make_norm = nn.BatchNorm2d if norm_layer is None else norm_layer
+        out_planes = planes * self.expansion
+        width = int(planes * (base_width / 64.0)) * groups
+        # Both self.conv2 and self.downsample downsample when stride != 1
+        self.conv1 = conv1x1(inplanes, width)
+        self.bn1 = make_norm(width)
+        self.conv2 = conv3x3(width, width, stride, groups, dilation)
+        self.bn2 = make_norm(width)
+        self.conv3 = conv1x1(width, out_planes)
+        self.bn3 = make_norm(out_planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Run the three conv/norm stages and add the shortcut before ReLU."""
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.relu(self.bn2(self.conv2(out)))
+        out = self.bn3(self.conv3(out))
+        # the shortcut is `x` itself unless a `downsample` module was given
+        shortcut = x if self.downsample is None else self.downsample(x)
+        out += shortcut
+        return self.relu(out)
+
+
+class ResNet(nn.Module):
+    """ResNet backbone (adopted from torchvision) whose forward pass can also
+    return detached copies of the layer3 / layer4 feature maps ("grids").
+
+    Args:
+        block: residual block class, ``BasicBlock`` or ``Bottleneck``.
+        layers: number of blocks in each of the four stages.
+        in_channel_dim: channel count expected by the first convolution.
+        zero_init_residual: zero the last norm weight of every block.
+        use_last_block_grid: stored on the instance; not read in this class.
+        groups: forwarded to every block.
+        width_per_group: forwarded to every block as ``base_width``.
+        replace_stride_with_dilation: three flags for layer2 to layer4 that
+            swap the stride of 2 for dilation; ``None`` means all false.
+        norm_layer: norm factory; ``nn.BatchNorm2d`` when ``None``.
+    """
+
+    def __init__(
+        self,
+        block: Type[Union[BasicBlock, Bottleneck]],
+        layers: List[int],
+        in_channel_dim: int = 3,
+        zero_init_residual: bool = False,
+        use_last_block_grid: bool = False,
+        groups: int = 1,
+        width_per_group: int = 64,
+        replace_stride_with_dilation: Optional[List[bool]] = None,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        super(ResNet, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        self._norm_layer = norm_layer
+
+        self.use_last_block_grid = use_last_block_grid
+        self.inplanes = 64
+        self.dilation = 1
+        if replace_stride_with_dilation is None:
+            # each element in the tuple indicates if we should replace
+            # the 2x2 stride with a dilated convolution instead
+            replace_stride_with_dilation = [False, False, False]
+        if len(replace_stride_with_dilation) != 3:
+            raise ValueError('replace_stride_with_dilation should be None '
+                             'or a 3-element tuple, got {}'.format(
+                                 replace_stride_with_dilation))
+        self.groups = groups
+        self.base_width = width_per_group
+        self.conv1 = nn.Conv2d(
+            in_channel_dim, self.inplanes,
+            kernel_size=7, stride=2, padding=3, bias=False)
+        self.bn1 = norm_layer(self.inplanes)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(
+            block, 128, layers[1], stride=2,
+            dilate=replace_stride_with_dilation[0])
+        self.layer3 = self._make_layer(
+            block, 256, layers[2], stride=2,
+            dilate=replace_stride_with_dilation[1])
+        self.layer4 = self._make_layer(
+            block, 512, layers[3], stride=2,
+            dilate=replace_stride_with_dilation[2])
+        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(
+                    m.weight, mode='fan_out', nonlinearity='relu')
+            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+
+        # Zero-initialize the last BN in each residual branch,
+        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
+        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
+        if zero_init_residual:
+            for m in self.modules():
+                if isinstance(m, Bottleneck):
+                    nn.init.constant_(m.bn3.weight,
+                                      0)  # type: ignore[arg-type]
+                elif isinstance(m, BasicBlock):
+                    nn.init.constant_(m.bn2.weight,
+                                      0)  # type: ignore[arg-type]
+
+    def _make_layer(
+        self,
+        block: Type[Union[BasicBlock, Bottleneck]],
+        planes: int,
+        blocks: int,
+        stride: int = 1,
+        dilate: bool = False,
+    ) -> nn.Sequential:
+        """Stack ``blocks`` residual blocks into one stage.
+
+        Only the first block receives ``stride`` and, when the shape changes,
+        a 1x1 projection shortcut; ``self.inplanes`` is then advanced to the
+        stage's output channels. ``dilate`` trades the stride for dilation.
+        """
+        norm_layer = self._norm_layer
+        downsample = None
+        previous_dilation = self.dilation
+        if dilate:
+            self.dilation *= stride
+            stride = 1
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                conv1x1(self.inplanes, planes * block.expansion, stride),
+                norm_layer(planes * block.expansion),
+            )
+
+        layers = [
+            block(self.inplanes, planes, stride, downsample, self.groups,
+                  self.base_width, previous_dilation, norm_layer)
+        ]
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(
+                block(self.inplanes, planes, groups=self.groups,
+                      base_width=self.base_width, dilation=self.dilation,
+                      norm_layer=norm_layer))
+
+        return nn.Sequential(*layers)
+
+    def _forward_impl(self, x: Tensor, grid: bool, level: List, both: bool,
+                      grid_only: bool) -> Tensor:
+        """Run the backbone and optionally collect intermediate feature maps.
+
+        Args:
+            x: input batch fed to ``conv1``.
+            grid: enables collection of feature maps; when false only the
+                pooled feature is returned and ``level`` is ignored.
+            level: stages to collect, any of ``3`` (layer3) and ``4`` (layer4).
+            both: return ``(pooled, grids)`` instead of a single value.
+            grid_only: return only the collected grids.
+
+        Returns:
+            The flattened pooled feature unless ``grid`` is set and ``level``
+            is non-empty; then the grid list (lone collected level without
+            ``both``, or ``grid_only``), ``(pooled, grids)`` for ``both``,
+            else the pooled feature.
+        """
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+
+        # Bind the list unconditionally and collect only when `grid` is set;
+        # it used to be unbound when `level` was given with grid=False.
+        x_grid = []
+        if grid and 3 in level:
+            x_grid.append(x.detach().clone())
+            if not both and len(level) == 1:
+                return x_grid
+
+        x = self.layer4(x)
+        if grid and 4 in level:
+            x_grid.append(x.detach().clone())
+            if not both and len(level) == 1:
+                return x_grid
+
+        x = self.avgpool(x)
+        x = torch.flatten(x, 1)
+
+        if not grid or len(level) == 0:
+            return x
+        if grid_only:
+            return x_grid
+        if both:
+            return x, x_grid
+        return x
+
+    def forward(self, x: Tensor, grid: bool = False, level: List = [],
+                both: bool = False, grid_only: bool = False) -> Tensor:
+        """Forward pass; see ``_forward_impl`` for the meaning of the flags."""
+        return self._forward_impl(x, grid, level, both, grid_only)
+
+
+def resnet50(**kwargs: Any) -> ResNet:
+    """Build ResNet-50 ("Deep Residual Learning for Image Recognition",
+    https://arxiv.org/pdf/1512.03385.pdf): Bottleneck stages of 3, 4, 6, 3."""
+    stage_depths = [3, 4, 6, 3]
+    return ResNet(Bottleneck, stage_depths, **kwargs)
diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/trn.py b/modelscope/models/cv/movie_scene_segmentation/utils/trn.py
new file mode 100644
index 00000000..769e9ee4
--- /dev/null
+++ b/modelscope/models/cv/movie_scene_segmentation/utils/trn.py
@@ -0,0 +1,132 @@
+# ------------------------------------------------------------------------------------
+# BaSSL
+# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# Github: https://github.com/kakaobrain/bassl
+# ------------------------------------------------------------------------------------
+
+import torch
+import torch.nn as nn
+from transformers.models.bert.modeling_bert import BertEncoder
+
+
+class ShotEmbedding(nn.Module):
+    """Embed a window of shot features behind a prepended [CLS] token.
+
+    ``cfg`` must provide ``neighbor_size``, ``input_dim``, ``hidden_size``
+    and ``hidden_dropout_prob``.
+    """
+
+    def __init__(self, cfg):
+        super().__init__()
+        nn_size = cfg.neighbor_size + 2  # +1 for center shot, +1 for cls
+        self.shot_embedding = nn.Linear(cfg.input_dim, cfg.hidden_size)
+        self.position_embedding = nn.Embedding(nn_size, cfg.hidden_size)
+        self.mask_embedding = nn.Embedding(2, cfg.input_dim, padding_idx=0)
+
+        # tf naming convention for layer norm
+        self.LayerNorm = nn.LayerNorm(cfg.hidden_size, eps=1e-12)
+        self.dropout = nn.Dropout(cfg.hidden_dropout_prob)
+
+        self.register_buffer('pos_ids',
+                             torch.arange(nn_size, dtype=torch.long))
+
+    def forward(self, shot_emb: torch.Tensor, mask: torch.Tensor = None,
+                pos_ids: torch.Tensor = None) -> torch.Tensor:
+        """Embed ``shot_emb`` (a 3-D tensor) and return the [CLS]-prefixed
+        sequence; ``mask`` flags shots to replace by the mask embedding and
+        may be bool or numeric, ``pos_ids`` defaults to the stored buffer."""
+        assert len(shot_emb.size()) == 3
+        if pos_ids is None:
+            pos_ids = self.pos_ids
+
+        # this for mask embedding (un-masked ones remain unchanged)
+        if mask is not None:
+            self.mask_embedding.weight.data[0, :].fill_(0)
+            mask_emb = self.mask_embedding(mask.long())
+            # cast before subtracting: `1 - mask` raises on bool tensors
+            keep = (1 - mask.float())[:, :, None]
+            shot_emb = shot_emb * keep + mask_emb
+
+        # we set [CLS] token to averaged feature
+        cls_emb = shot_emb.mean(dim=1)
+        shot_emb = torch.cat([cls_emb[:, None, :], shot_emb], dim=1)
+        shot_emb = self.shot_embedding(shot_emb)
+        pos_emb = self.position_embedding(pos_ids)
+        embeddings = shot_emb + pos_emb[None, :]
+        return self.dropout(self.LayerNorm(embeddings))
+
+
+class TransformerCRN(nn.Module):
+    """Shot sequence encoder: ``ShotEmbedding`` followed by ``BertEncoder``.
+
+    ``cfg`` is passed to both sub-modules and must also provide
+    ``pooling_method`` and ``neighbor_size``.
+    """
+
+    def __init__(self, cfg):
+        super().__init__()
+        self.pooling_method = cfg.pooling_method
+        self.shot_embedding = ShotEmbedding(cfg)
+        self.encoder = BertEncoder(cfg)
+
+        nn_size = cfg.neighbor_size + 2  # +1 for center shot, +1 for cls
+        full_mask = torch.ones((1, nn_size)).float()
+        self.register_buffer(
+            'attention_mask', self._get_extended_attention_mask(full_mask))
+
+    def forward(self, shot: torch.Tensor, mask: torch.Tensor = None,
+                pos_ids: torch.Tensor = None, pooling_method: str = None):
+        """Encode ``shot`` and return ``(encoded sequence, pooled feature)``."""
+        n_shot = shot.shape[1] + 1  # +1 for CLS token
+        # The extended mask is 4-D with the sequence on its LAST axis; axis 1
+        # is always 1, so testing it never allowed the cached mask to be used.
+        if self.attention_mask.shape[-1] != n_shot:
+            attention_mask = self._get_extended_attention_mask(
+                torch.ones((1, n_shot), dtype=torch.float, device=shot.device))
+        else:
+            attention_mask = self.attention_mask
+
+        shot_emb = self.shot_embedding(shot, mask=mask, pos_ids=pos_ids)
+        encoded_emb = self.encoder(
+            shot_emb, attention_mask=attention_mask).last_hidden_state
+        return encoded_emb, self.pooler(
+            encoded_emb, pooling_method=pooling_method)
+
+    def pooler(self, sequence_output, pooling_method=None):
+        """Reduce the encoded sequence with 'cls', 'avg', 'max' or 'center'.
+
+        Falls back to ``self.pooling_method`` when ``pooling_method`` is None;
+        'avg' and 'max' skip position 0 (the [CLS] token).
+        """
+        if pooling_method is None:
+            pooling_method = self.pooling_method
+        if pooling_method == 'cls':
+            return sequence_output[:, 0, :]
+        elif pooling_method == 'avg':
+            return sequence_output[:, 1:].mean(dim=1)
+        elif pooling_method == 'max':
+            return sequence_output[:, 1:].max(dim=1)[0]
+        elif pooling_method == 'center':
+            cidx = sequence_output.shape[1] // 2
+            return sequence_output[:, cidx, :]
+        else:
+            raise ValueError(f'Unknown pooling_method: {pooling_method!r}')
+
+    def _get_extended_attention_mask(self, attention_mask):
+        """Turn a 1/0 keep-mask into an additive mask broadcastable over heads.
+
+        A 3-D ``[batch, from_seq, to_seq]`` or 2-D ``[batch, seq]`` mask gains
+        the missing singleton axes; kept positions (1.0) become 0.0 and masked
+        positions (0.0) become -10000.0, which after being added to the raw
+        scores before the softmax effectively removes them.
+        """
+        if attention_mask.dim() == 3:
+            extended_attention_mask = attention_mask[:, None, :, :]
+        elif attention_mask.dim() == 2:
+            extended_attention_mask = attention_mask[:, None, None, :]
+        else:
+            raise ValueError(
+                f'Wrong shape for attention_mask (shape {attention_mask.shape})'
+            )
+        return (1.0 - extended_attention_mask) * -10000.0
diff --git a/modelscope/models/cv/object_detection/__init__.py b/modelscope/models/cv/object_detection/__init__.py
index fa73686d..974375ce 100644
--- a/modelscope/models/cv/object_detection/__init__.py
+++ b/modelscope/models/cv/object_detection/__init__.py
@@ -5,10 +5,12 @@ from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
     from .mmdet_model import DetectionModel
+    from .yolox_pai import YOLOX
 
 else:
     _import_structure = {
         'mmdet_model': ['DetectionModel'],
+        'yolox_pai': ['YOLOX']
     }
 
     import sys
diff --git a/modelscope/models/cv/object_detection/mmdet_model.py b/modelscope/models/cv/object_detection/mmdet_model.py
index 7bf81349..485d440a 100644
--- a/modelscope/models/cv/object_detection/mmdet_model.py
+++ b/modelscope/models/cv/object_detection/mmdet_model.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 
 import numpy as np
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/__init__.py
index 2e47ce76..3a1fdd0b 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/__init__.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/__init__.py
@@ -1,3 +1,5 @@
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly available at https://github.com/ViTAE-Transformer/ViTDet
 from .backbones import ViT
 from .dense_heads import AnchorNHead, RPNNHead
 from .necks import FPNF
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/backbones/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/backbones/__init__.py
index 3b34dad6..c0697d48 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/backbones/__init__.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/backbones/__init__.py
@@ -1,3 +1,5 @@
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly available at https://github.com/ViTAE-Transformer/ViTDet
 from .vit import ViT
 
 __all__ = ['ViT']
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/__init__.py
index 0fba8c00..0d34e996 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/__init__.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/__init__.py
@@ -1,3 +1,5 @@
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly available at https://github.com/ViTAE-Transformer/ViTDet
 from .anchor_head import AnchorNHead
 from .rpn_head import RPNNHead
 
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/anchor_head.py b/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/anchor_head.py
index b4114652..d4ea5282 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/anchor_head.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/anchor_head.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly available at https://github.com/ViTAE-Transformer/ViTDet
 from mmdet.models.builder import HEADS
 from mmdet.models.dense_heads import AnchorHead
 
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/rpn_head.py b/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/rpn_head.py
index f53368ce..8e934a5c 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/rpn_head.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/rpn_head.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly available at https://github.com/ViTAE-Transformer/ViTDet
 import copy
 
 import torch
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/necks/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/necks/__init__.py
index 5b0b6210..d164987e 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/necks/__init__.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/necks/__init__.py
@@ -1,3 +1,5 @@
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly available at https://github.com/ViTAE-Transformer/ViTDet
 from .fpn import FPNF
 
 __all__ = ['FPNF']
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/necks/fpn.py b/modelscope/models/cv/object_detection/mmdet_ms/necks/fpn.py
index 52529b28..5f8648ce 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/necks/fpn.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/necks/fpn.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly available at https://github.com/ViTAE-Transformer/ViTDet
 import torch.nn as nn
 import torch.nn.functional as F
 from mmcv.runner import BaseModule, auto_fp16
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/__init__.py
index a6be3775..658280df 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/__init__.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/__init__.py
@@ -1,3 +1,5 @@
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly available at https://github.com/ViTAE-Transformer/ViTDet
 from .bbox_heads import (ConvFCBBoxNHead, Shared2FCBBoxNHead,
                          Shared4Conv1FCBBoxNHead)
 from .mask_heads import FCNMaskNHead
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/__init__.py
index 0d4d5b6b..61d93503 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/__init__.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/__init__.py
@@ -1,3 +1,5 @@
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly available at https://github.com/ViTAE-Transformer/ViTDet
 from .convfc_bbox_head import (ConvFCBBoxNHead, Shared2FCBBoxNHead,
                                Shared4Conv1FCBBoxNHead)
 
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/convfc_bbox_head.py b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/convfc_bbox_head.py
index d2e04b80..726329a1 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/convfc_bbox_head.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/convfc_bbox_head.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly available at https://github.com/ViTAE-Transformer/ViTDet
 import torch.nn as nn
 from mmdet.models.builder import HEADS
 from mmdet.models.roi_heads.bbox_heads.bbox_head import BBoxHead
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/__init__.py
index 8f816850..043e62a0 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/__init__.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/__init__.py
@@ -1,3 +1,5 @@
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly available at https://github.com/ViTAE-Transformer/ViTDet
 from .fcn_mask_head import FCNMaskNHead
 
 __all__ = ['FCNMaskNHead']
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/fcn_mask_head.py b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/fcn_mask_head.py
index e5aedc98..335f6b8f 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/fcn_mask_head.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/fcn_mask_head.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly available at https://github.com/ViTAE-Transformer/ViTDet
 from warnings import warn
 
 import numpy as np
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/utils/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/utils/__init__.py
index 971a0232..34f240c6 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/utils/__init__.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/utils/__init__.py
@@ -1,3 +1,5 @@
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly available at https://github.com/ViTAE-Transformer/ViTDet
 from .checkpoint import load_checkpoint
 from .convModule_norm import ConvModule_Norm
 
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/utils/checkpoint.py b/modelscope/models/cv/object_detection/mmdet_ms/utils/checkpoint.py
index 593af1cc..7833f592 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/utils/checkpoint.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/utils/checkpoint.py
@@ -1,5 +1,6 @@
 # Copyright (c) Open-MMLab. All rights reserved.
-# Implementation adopted from ViTAE-Transformer, source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly available at https://github.com/ViTAE-Transformer/ViTDet
 import io
 import os
 import os.path as osp
diff --git a/modelscope/models/cv/object_detection/mmdet_ms/utils/convModule_norm.py b/modelscope/models/cv/object_detection/mmdet_ms/utils/convModule_norm.py
index d81c24e1..a15780f7 100644
--- a/modelscope/models/cv/object_detection/mmdet_ms/utils/convModule_norm.py
+++ b/modelscope/models/cv/object_detection/mmdet_ms/utils/convModule_norm.py
@@ -1,5 +1,5 @@
-# Implementation adopted from ViTAE-Transformer, source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
-
+# Implementation in this file is modified based on ViTAE-Transformer
+# Originally Apache 2.0 License and publicly available at https://github.com/ViTAE-Transformer/ViTDet
 from mmcv.cnn import ConvModule
 
 
diff --git a/modelscope/models/cv/object_detection/yolox_pai.py b/modelscope/models/cv/object_detection/yolox_pai.py
new file mode 100644
index 00000000..985cc136
--- /dev/null
+++ b/modelscope/models/cv/object_detection/yolox_pai.py
@@ -0,0 +1,16 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.models.detection.detectors import YOLOX as _YOLOX
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.easycv_base import EasyCVBaseModel
+from modelscope.utils.constant import Tasks
+
+
+@MODELS.register_module(
+    group_key=Tasks.image_object_detection, module_name=Models.yolox)
+class YOLOX(EasyCVBaseModel, _YOLOX):
+    """EasyCV ``YOLOX`` detector registered for image object detection."""
+    def __init__(self, model_dir=None, *args, **kwargs):
+        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
+        _YOLOX.__init__(self, *args, **kwargs)
diff --git a/modelscope/models/cv/product_segmentation/__init__.py b/modelscope/models/cv/product_segmentation/__init__.py
new file mode 100644
index 00000000..e87c8db1
--- /dev/null
+++ b/modelscope/models/cv/product_segmentation/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .seg_infer import F3NetProductSegmentation
+
+else:
+    _import_structure = {'seg_infer': ['F3NetProductSegmentation']}
+    # maps a submodule name to the names it exports; handed to LazyImportModule
+    import sys
+    # the sys.modules entry of this package is replaced by the lazy wrapper
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/product_segmentation/net.py b/modelscope/models/cv/product_segmentation/net.py
new file mode 100644
index 00000000..454c99d8
--- /dev/null
+++ b/modelscope/models/cv/product_segmentation/net.py
@@ -0,0 +1,197 @@
+# The implementation here is modified based on F3Net,
+# originally Apache 2.0 License and publicly available at https://github.com/weijun88/F3Net
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Bottleneck(nn.Module):
+    """Residual bottleneck: 1x1 -> 3x3 -> 1x1 convs, emitting planes * 4.
+
+    Args:
+        inplanes: channel count of the input feature map.
+        planes: width of the first two convs; the output has planes * 4.
+        stride: stride of the 3x3 conv.
+        downsample: optional module applied to the input for the skip add.
+        dilation: dilation of the 3x3 conv.
+    """
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None,
+                 dilation=1):
+        super(Bottleneck, self).__init__()
+        pad = (3 * dilation - 1) // 2
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(
+            planes, planes, kernel_size=3, stride=stride, padding=pad,
+            bias=False, dilation=dilation)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * 4)
+        self.downsample = downsample
+
+    def forward(self, x):
+        y = F.relu(self.bn1(self.conv1(x)), inplace=True)
+        y = F.relu(self.bn2(self.conv2(y)), inplace=True)
+        y = self.bn3(self.conv3(y))
+        skip = x if self.downsample is None else self.downsample(x)
+        return F.relu(y + skip, inplace=True)
+
+
+class ResNet(nn.Module):
+    """Bottleneck backbone with 3/4/6/3 blocks; returns four feature maps."""
+
+    def __init__(self):
+        super(ResNet, self).__init__()
+        self.inplanes = 64  # input width of the next stage, see make_layer
+        self.conv1 = nn.Conv2d(
+            3, 64, kernel_size=7, stride=2, padding=3, bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+        self.layer1 = self.make_layer(64, 3, stride=1, dilation=1)
+        self.layer2 = self.make_layer(128, 4, stride=2, dilation=1)
+        self.layer3 = self.make_layer(256, 6, stride=2, dilation=1)
+        self.layer4 = self.make_layer(512, 3, stride=2, dilation=1)
+
+    def make_layer(self, planes, blocks, stride, dilation):
+        """Stack `blocks` bottlenecks; only the first strides and projects."""
+        out_planes = planes * 4
+        projection = nn.Sequential(
+            nn.Conv2d(
+                self.inplanes, out_planes, kernel_size=1, stride=stride,
+                bias=False), nn.BatchNorm2d(out_planes))
+        first = Bottleneck(
+            self.inplanes, planes, stride, projection, dilation=dilation)
+        self.inplanes = out_planes
+        rest = [
+            Bottleneck(self.inplanes, planes, dilation=dilation)
+            for _ in range(1, blocks)
+        ]
+        return nn.Sequential(first, *rest)
+
+    def forward(self, x):
+        # Hard-coded: the input is viewed as a single 3x448x448 image.
+        x = x.reshape(1, 3, 448, 448)
+        stem = F.relu(self.bn1(self.conv1(x)), inplace=True)
+        stem = F.max_pool2d(stem, kernel_size=3, stride=2, padding=1)
+        c2 = self.layer1(stem)
+        c3 = self.layer2(c2)
+        c4 = self.layer3(c3)
+        return c2, c3, c4, self.layer4(c4)
+
+
+class CFM(nn.Module):
+    """Cross feature module fusing a `left` and a `down` feature map.
+
+    Both branches run two conv-bn-relu steps, are multiplied element-wise,
+    and the product is refined per branch with a residual from step one.
+    Every conv is 3x3 with 64 input and 64 output channels.
+    """
+
+    def __init__(self):
+        super(CFM, self).__init__()
+        # Registers conv{1..4}{h,v} / bn{1..4}{h,v}: the h branch first, then
+        # the v branch, in the same order as spelling them out one by one.
+        for branch in ('h', 'v'):
+            for idx in range(1, 5):
+                conv = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
+                setattr(self, 'conv{}{}'.format(idx, branch), conv)
+                setattr(self, 'bn{}{}'.format(idx, branch),
+                        nn.BatchNorm2d(64))
+
+    def forward(self, left, down):
+        """Return the refined (left-branch, down-branch) feature pair."""
+        target = left.size()[2:]
+        if down.size()[2:] != target:
+            # Bring `down` to the spatial size of `left` before fusing.
+            down = F.interpolate(down, size=target, mode='bilinear')
+        h1 = F.relu(self.bn1h(self.conv1h(left)), inplace=True)
+        h2 = F.relu(self.bn2h(self.conv2h(h1)), inplace=True)
+        v1 = F.relu(self.bn1v(self.conv1v(down)), inplace=True)
+        v2 = F.relu(self.bn2v(self.conv2v(v1)), inplace=True)
+        fused = h2 * v2
+        h3 = F.relu(self.bn3h(self.conv3h(fused)), inplace=True) + h1
+        h4 = F.relu(self.bn4h(self.conv4h(h3)), inplace=True)
+        v3 = F.relu(self.bn3v(self.conv3v(fused)), inplace=True) + v1
+        v4 = F.relu(self.bn4v(self.conv4v(v3)), inplace=True)
+        return h4, v4
+
+
+class Decoder(nn.Module):
+    """Top-down decoder chaining three CFMs from level 5 down to level 2.
+
+    When `fback` is given it is bilinearly resized to each level and added
+    to that level's input before the CFM chain runs.
+    """
+
+    def __init__(self):
+        super(Decoder, self).__init__()
+        self.cfm45 = CFM()
+        self.cfm34 = CFM()
+        self.cfm23 = CFM()
+
+    def forward(self, out2h, out3h, out4h, out5v, fback=None):
+        """Return (out2h, out3h, out4h, out5v, pred) after the CFM chain."""
+        if fback is not None:
+            # Inject the feedback map into every pyramid level.
+            out5v, out4h, out3h, out2h = [
+                feat + F.interpolate(
+                    fback, size=feat.size()[2:], mode='bilinear')
+                for feat in (out5v, out4h, out3h, out2h)
+            ]
+        # Each CFM hands its second output down to the next finer level.
+        out4h, out4v = self.cfm45(out4h, out5v)
+        out3h, out3v = self.cfm34(out3h, out4v)
+        out2h, pred = self.cfm23(out2h, out3v)
+        return out2h, out3h, out4h, out5v, pred
+
+
+class F3Net(nn.Module):
+    """F3Net: ResNet backbone, 64-channel squeeze heads, two decoders.
+
+    The second decoder receives the first decoder's prediction as feedback.
+    """
+
+    def __init__(self):
+        super(F3Net, self).__init__()
+        self.bkbone = ResNet()
+        # squeeze5 .. squeeze2: 1x1 conv-bn-relu reducing a stage to 64 maps.
+        for level, width in ((5, 2048), (4, 1024), (3, 512), (2, 256)):
+            setattr(self, 'squeeze{}'.format(level), nn.Sequential(
+                nn.Conv2d(width, 64, 1), nn.BatchNorm2d(64),
+                nn.ReLU(inplace=True)))
+
+        self.decoder1 = Decoder()
+        self.decoder2 = Decoder()
+        # linearp1/2 score the decoder predictions, linearr2..5 the levels.
+        for suffix in ('p1', 'p2', 'r2', 'r3', 'r4', 'r5'):
+            setattr(self, 'linear' + suffix,
+                    nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1))
+
+    def forward(self, x, shape=None):
+        """Run the network on one image.
+
+        Args:
+            x: tensor that is reshaped to (1, 3, 448, 448).
+            shape: output (H, W); defaults to the spatial size of `x`.
+
+        Returns:
+            Tuple (pred1, pred2, out2h, out3h, out4h, out5h) of
+            single-channel maps resized to `shape`.
+        """
+        x = x.reshape(1, 3, 448, 448)
+        squeezers = (self.squeeze2, self.squeeze3, self.squeeze4,
+                     self.squeeze5)
+        feats = [sq(f) for sq, f in zip(squeezers, self.bkbone(x))]
+        out2h, out3h, out4h, out5v, pred1 = self.decoder1(*feats)
+        out2h, out3h, out4h, out5v, pred2 = self.decoder2(
+            out2h, out3h, out4h, out5v, pred1)
+
+        if shape is None:
+            shape = x.size()[2:]
+        heads = (self.linearp1, self.linearp2, self.linearr2, self.linearr3,
+                 self.linearr4, self.linearr5)
+        maps = (pred1, pred2, out2h, out3h, out4h, out5v)
+        return tuple(
+            F.interpolate(head(feat), size=shape, mode='bilinear')
+            for head, feat in zip(heads, maps))
diff --git a/modelscope/models/cv/product_segmentation/seg_infer.py b/modelscope/models/cv/product_segmentation/seg_infer.py
new file mode 100644
index 00000000..876fac66
--- /dev/null
+++ b/modelscope/models/cv/product_segmentation/seg_infer.py
@@ -0,0 +1,77 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .net import F3Net
+
+logger = get_logger()
+
+
+def load_state_dict(model_dir, device):
+    """Load the checkpoint in `model_dir`, dropping 'module.' key prefixes."""
+    prefix = 'module.'  # presumably left by a DataParallel-wrapped save
+    path = '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
+    raw = torch.load(path, map_location=device)
+    # Match the full prefix (with the dot) so other 'module*' keys survive.
+    return {
+        (k[len(prefix):] if k.startswith(prefix) else k): v
+        for k, v in raw.items()
+    }
+
+
+@MODELS.register_module(
+    Tasks.product_segmentation, module_name=Models.product_segmentation)
+class F3NetForProductSegmentation(TorchModel):
+    """F3Net wrapper registered for the product segmentation task."""
+
+    def __init__(self, model_dir, device_id=0, *args, **kwargs):
+        """Build F3Net and load the weights stored in `model_dir`.
+
+        The device is picked from CUDA availability alone; `device_id` is
+        only forwarded to the base class.
+        """
+        super().__init__(
+            model_dir=model_dir, device_id=device_id, *args, **kwargs)
+
+        self.model = F3Net()
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        logger.info('Use GPU' if self.device == 'cuda' else 'Use CPU')
+
+        self.params = load_state_dict(model_dir, self.device)
+        self.model.load_state_dict(self.params)
+        # A single transfer suffices; eval() does not move parameters.
+        self.model.to(self.device)
+        self.model.eval()
+
+    def forward(self, x):
+        return self.model(x)
+
+
+# Per-channel RGB mean / std on the 0-255 scale, used to normalise inputs.
+mean = np.array([[[124.55, 118.90, 102.94]]])
+std = np.array([[[56.77, 55.97, 57.50]]])
+
+
+def inference(model, device, input_path):
+    """Segment the image at `input_path`; returns an (H, W, 1) 0-255 mask."""
+    with Image.open(input_path) as src:  # close the file handle promptly
+        img = np.array(src.convert('RGB')).astype(np.float32)
+    img = cv2.resize(
+        (img - mean) / std, dsize=(448, 448), interpolation=cv2.INTER_LINEAR)
+    img = torch.from_numpy(img).permute(2, 0, 1).to(device).float()
+    # no_grad: otherwise the output tracks gradients and .numpy() raises.
+    with torch.no_grad():
+        out = model(img)[0]
+    pred = (torch.sigmoid(out[0, 0]) * 255).cpu().numpy()
+    pred[pred < 20] = 0  # zero out values below 20 on the 0-255 scale
+    pred = np.round(pred[:, :, np.newaxis])
+    logger.info('Inference Done')
+    return pred
diff --git a/modelscope/models/cv/realtime_object_detection/__init__.py b/modelscope/models/cv/realtime_object_detection/__init__.py
new file mode 100644
index 00000000..aed13cec
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .realtime_detector import RealtimeDetector  # type checkers only
+else:
+    _import_structure = {  # presumably: submodule -> names it exports
+        'realtime_detector': ['RealtimeDetector'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(  # replaces this module object
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/realtime_object_detection/realtime_detector.py b/modelscope/models/cv/realtime_object_detection/realtime_detector.py
new file mode 100644
index 00000000..2b4b3f8c
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/realtime_detector.py
@@ -0,0 +1,90 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import argparse
+import logging as logger
+import os
+import os.path as osp
+import time
+
+import cv2
+import json
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from .yolox.data.data_augment import ValTransform
+from .yolox.exp import get_exp_by_name
+from .yolox.utils import postprocess
+
+
+@MODELS.register_module(
+    group_key=Tasks.image_object_detection,
+    module_name=Models.realtime_object_detection)
+class RealtimeDetector(TorchModel):
+    """YOLOX based detector: preprocess -> forward -> postprocess."""
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """Build the configured YOLOX variant and load its checkpoint."""
+        super().__init__(model_dir, *args, **kwargs)
+        self.config = Config.from_file(
+            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
+
+        # The class count is set before get_model() so that the head is
+        # built with the configured number of classes, not the default.
+        self.exp = get_exp_by_name(self.config.model_type)
+        self.exp.num_classes = self.config.num_classes
+
+        self.model = self.exp.get_model()
+        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
+        ckpt = torch.load(model_path, map_location='cpu')
+        self.model.load_state_dict(ckpt['model'])
+        self.model.eval()
+
+        # Thresholds and sizes used by preprocess() / postprocess().
+        self.confthre = self.config.conf_thr
+        self.num_classes = self.exp.num_classes
+        self.nmsthre = self.exp.nmsthre
+        self.test_size = self.exp.test_size
+        self.preproc = ValTransform(legacy=False)
+        self.label_mapping = self.config['labels']
+
+    def inference(self, img):
+        """Run the network without tracking gradients."""
+        with torch.no_grad():
+            return self.model(img)
+
+    def forward(self, inputs):
+        return self.inference(inputs)
+
+    def preprocess(self, img):
+        """Convert `img` to a float tensor with a leading batch axis.
+
+        Also stores the resize ratio in `self.ratio` for postprocess().
+        """
+        img = LoadImage.convert_to_ndarray(img)
+        self.ratio = min(self.test_size[0] / img.shape[0],
+                         self.test_size[1] / img.shape[1])
+        img, _ = self.preproc(img, None, self.test_size)
+        return torch.from_numpy(img).unsqueeze(0).float()
+
+    def postprocess(self, input):
+        """Turn raw network output into (bboxes, scores, label names).
+
+        Boxes are divided by the ratio stored by preprocess(). Empty
+        results are returned when the single image has no detections.
+        """
+        outputs = postprocess(
+            input, self.num_classes, self.confthre, self.nmsthre,
+            class_agnostic=True)
+        if len(outputs) != 1:
+            raise ValueError('postprocess expects a single-image batch')
+        dets = outputs[0]
+        if dets is None:  # presumably: no box survived filtering / NMS
+            return torch.zeros((0, 4)).numpy(), torch.zeros(0).numpy(), []
+        bboxes = dets[:, 0:4].cpu().numpy() / self.ratio
+        scores = dets[:, 5].cpu().numpy()
+        labels = dets[:, 6].cpu().int().numpy()
+        return bboxes, scores, [self.label_mapping[lab] for lab in labels]
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/data/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/data/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/data/data_augment.py b/modelscope/models/cv/realtime_object_detection/yolox/data/data_augment.py
new file mode 100644
index 00000000..b52a65fe
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/data/data_augment.py
@@ -0,0 +1,69 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+"""
+Data augmentation functionality. Passed as callable transformations to
+Dataset classes.
+
+The data augmentation procedures were interpreted from @weiliu89's SSD paper
+http://arxiv.org/abs/1512.02325
+"""
+
+import math
+import random
+
+import cv2
+import numpy as np
+
+from ..utils import xyxy2cxcywh
+
+
+def preproc(img, input_size, swap=(2, 0, 1)):
+    """Letterbox `img` into `input_size`, padding with the value 114.
+
+    The image is scaled by r = min(H_in / h, W_in / w), pasted at the
+    top-left corner, transposed by `swap` and returned as float32 with r.
+    """
+    is_color = len(img.shape) == 3
+    shape = (input_size[0], input_size[1], 3) if is_color else input_size
+    padded_img = np.ones(shape, dtype=np.uint8) * 114
+
+    r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
+    new_h, new_w = int(img.shape[0] * r), int(img.shape[1] * r)
+    resized_img = cv2.resize(
+        img, (new_w, new_h), interpolation=cv2.INTER_LINEAR).astype(np.uint8)
+    padded_img[:new_h, :new_w] = resized_img
+    padded_img = np.ascontiguousarray(
+        padded_img.transpose(swap), dtype=np.float32)
+    return padded_img, r
+
+
+class ValTransform:
+    """Test-time transform: letterbox via `preproc`, optional legacy norm.
+
+    Arguments:
+        swap ((int,int,int)): final order of channels
+        legacy (bool): if True, additionally reverse axis 0, divide by 255
+            and normalise with a fixed per-channel mean / std
+
+    Returns:
+        tuple: the transformed image and a (1, 5) array of zeros
+    """
+
+    def __init__(self, swap=(2, 0, 1), legacy=False):
+        self.swap = swap
+        self.legacy = legacy
+
+    # assume input is cv2 img for now
+    def __call__(self, img, res, input_size):
+        # `res` is not used by this transform.
+        img, _ = preproc(img, input_size, self.swap)
+        if not self.legacy:
+            return img, np.zeros((1, 5))
+        channel_mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
+        channel_std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
+        # Reverse axis 0, the channel axis when the default swap is used.
+        img = img[::-1, :, :].copy()
+        # In-place arithmetic keeps the dtype of the array from preproc.
+        img /= 255.0
+        img -= channel_mean
+        img /= channel_std
+        return img, np.zeros((1, 5))
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/__init__.py
new file mode 100644
index 00000000..e8e3be15
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/__init__.py
@@ -0,0 +1,5 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from .base_exp import BaseExp
+from .build import get_exp_by_name
+from .yolox_base import Exp
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/base_exp.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/base_exp.py
new file mode 100644
index 00000000..a4278cbf
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/base_exp.py
@@ -0,0 +1,12 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from abc import ABCMeta, abstractmethod
+
+from torch.nn import Module
+
+
+class BaseExp(metaclass=ABCMeta):  # abstract base of all experiments
+
+    @abstractmethod
+    def get_model(self) -> Module:
+        """Return the torch module described by this experiment."""
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py
new file mode 100644
index 00000000..4858100c
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py
@@ -0,0 +1,18 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import os
+import sys
+
+
+def get_exp_by_name(exp_name):
+    """Return a fresh Exp for 'yolox-s', 'yolox-nano' or 'yolox-tiny'."""
+    exp = exp_name.replace('-', '_')  # "yolox-s" and "yolox_s" both work
+    if exp == 'yolox_s':
+        from .default import YoloXSExp as YoloXExp
+    elif exp == 'yolox_nano':
+        from .default import YoloXNanoExp as YoloXExp
+    elif exp == 'yolox_tiny':
+        from .default import YoloXTinyExp as YoloXExp
+    else:
+        raise ValueError('Unsupported exp name: {}'.format(exp_name))
+    return YoloXExp()
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py
new file mode 100644
index 00000000..552bbccd
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py
@@ -0,0 +1,5 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from .yolox_nano import YoloXNanoExp
+from .yolox_s import YoloXSExp
+from .yolox_tiny import YoloXTinyExp
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py
new file mode 100644
index 00000000..7bada485
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py
@@ -0,0 +1,47 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import os
+
+import torch.nn as nn
+
+from ..yolox_base import Exp as YoloXExp
+
+
+class YoloXNanoExp(YoloXExp):
+    """YOLOX-Nano experiment.
+
+    Uses depth 0.33, width 0.25, a 416x416 input / test size and
+    depthwise convolutions in both backbone and head.
+    """
+
+    def __init__(self):
+        super(YoloXNanoExp, self).__init__()
+        self.depth = 0.33
+        self.width = 0.25
+        self.input_size = (416, 416)
+        self.test_size = (416, 416)
+
+    def get_model(self, sublinear=False):
+        """Build the model on first use and return it.
+
+        `sublinear` is accepted but not used. The batch-norm settings and
+        head biases are re-applied on every call, not only on creation.
+        """
+        if 'model' not in self.__dict__:
+            from ...models import YOLOX, YOLOPAFPN, YOLOXHead
+            in_channels = [256, 512, 1024]
+            # NANO model use depthwise = True, which is main difference.
+            backbone = YOLOPAFPN(
+                self.depth, self.width, in_channels=in_channels,
+                act=self.act, depthwise=True)
+            head = YOLOXHead(
+                self.num_classes, self.width, in_channels=in_channels,
+                act=self.act, depthwise=True)
+            self.model = YOLOX(backbone, head)
+
+        for module in self.model.modules():
+            if isinstance(module, nn.BatchNorm2d):
+                module.eps = 1e-3
+                module.momentum = 0.03
+        self.model.head.initialize_biases(1e-2)
+        return self.model
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_s.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_s.py
new file mode 100644
index 00000000..5a123b37
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_s.py
@@ -0,0 +1,13 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import os
+
+from ..yolox_base import Exp as YoloXExp
+
+
+class YoloXSExp(YoloXExp):
+    """YOLOX-S: the base experiment with depth 0.33 and width 0.50."""
+
+    def __init__(self):
+        super().__init__()
+        self.depth, self.width = 0.33, 0.50
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_tiny.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_tiny.py
new file mode 100644
index 00000000..a80d0f2d
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_tiny.py
@@ -0,0 +1,20 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import os
+
+from ..yolox_base import Exp as YoloXExp
+
+
+class YoloXTinyExp(YoloXExp):
+    """YOLOX-Tiny: depth 0.33, width 0.375, 416x416 input and test size."""
+
+    def __init__(self):
+        super().__init__()
+        self.depth = 0.33
+        self.width = 0.375
+        self.input_size = self.test_size = (416, 416)
+        self.mosaic_scale = (0.5, 1.5)
+        self.random_size = (10, 20)
+        this_file = os.path.basename(os.path.realpath(__file__))
+        self.exp_name = this_file.split('.')[0]  # text before the first dot
+        self.enable_mixup = False
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py
new file mode 100644
index 00000000..a2a41535
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py
@@ -0,0 +1,59 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import os
+import random
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+
+from .base_exp import BaseExp
+
+
+class Exp(BaseExp):
+    """Default YOLOX experiment: model hyper-parameters and test settings."""
+
+    def __init__(self):
+        super().__init__()
+
+        # ---------------- model config ---------------- #
+        # detect classes number of model
+        self.num_classes = 80
+        # factor of model depth
+        self.depth = 1.00
+        # factor of model width
+        self.width = 1.00
+        # activation name passed to the backbone and head in get_model()
+        self.act = 'silu'
+        # -----------------  testing config ------------------ #
+        # output image size during evaluation/test
+        self.test_size = (640, 640)
+        # boxes whose scores are less than test_conf will be filtered
+        self.test_conf = 0.01
+        # nms threshold
+        self.nmsthre = 0.65
+
+    def get_model(self):
+        """Create the YOLOX model on first call and return it in train mode.
+
+        Every call re-applies the batch-norm eps / momentum and the head
+        bias initialisation before switching the model to train mode.
+        """
+        from ..models import YOLOX, YOLOPAFPN, YOLOXHead
+
+        if getattr(self, 'model', None) is None:
+            in_channels = [256, 512, 1024]
+            backbone = YOLOPAFPN(
+                self.depth, self.width, in_channels=in_channels, act=self.act)
+            head = YOLOXHead(
+                self.num_classes, self.width, in_channels=in_channels,
+                act=self.act)
+            self.model = YOLOX(backbone, head)
+
+        for module in self.model.modules():
+            if isinstance(module, nn.BatchNorm2d):
+                module.eps = 1e-3
+                module.momentum = 0.03
+        self.model.head.initialize_biases(1e-2)
+        self.model.train()
+        return self.model
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py
new file mode 100644
index 00000000..20b1a0d1
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py
@@ -0,0 +1,7 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from .darknet import CSPDarknet, Darknet
+from .yolo_fpn import YOLOFPN
+from .yolo_head import YOLOXHead
+from .yolo_pafpn import YOLOPAFPN
+from .yolox import YOLOX
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/darknet.py b/modelscope/models/cv/realtime_object_detection/yolox/models/darknet.py
new file mode 100644
index 00000000..8ece2a1e
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/models/darknet.py
@@ -0,0 +1,189 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from torch import nn
+
+from .network_blocks import (BaseConv, CSPLayer, DWConv, Focus, ResLayer,
+                             SPPBottleneck)
+
+
+class Darknet(nn.Module):
+    """Plain Darknet backbone with the Darknet-21 / Darknet-53 layouts.
+
+    The network is a stem followed by four stages `dark2` .. `dark5`;
+    `forward` returns the stages listed in `out_features` as a dict.
+    Every conv block is built with the 'lrelu' activation.
+    """
+    # number of blocks from dark2 to dark5.
+    depth2blocks = {21: [1, 2, 2, 1], 53: [2, 8, 8, 4]}
+
+    def __init__(
+            self,
+            depth,
+            in_channels=3,
+            stem_out_channels=32,
+            out_features=('dark3', 'dark4', 'dark5'),
+    ):
+        """
+        Args:
+            depth (int): key into `depth2blocks`, i.e. 21 or 53.
+            in_channels (int): number of input channels, e.g. 3 for RGB.
+            stem_out_channels (int): output channels of the first stem
+                conv; every later width is derived from it.
+            out_features (Tuple[str]): names of the stages to return.
+        """
+        super().__init__()
+        assert out_features, 'please provide output features of Darknet'
+        self.out_features = out_features
+        self.stem = nn.Sequential(
+            BaseConv(
+                in_channels, stem_out_channels, ksize=3, stride=1,
+                act='lrelu'),
+            *self.make_group_layer(stem_out_channels, num_blocks=1, stride=2),
+        )
+        width = stem_out_channels * 2  # 64 with the default stem
+        blocks = Darknet.depth2blocks[depth]
+
+        # dark2 .. dark4: each stage uses stride 2 and doubles the width.
+        for name, count in zip(('dark2', 'dark3', 'dark4'), blocks):
+            stage = self.make_group_layer(width, count, stride=2)
+            setattr(self, name, nn.Sequential(*stage))
+            width *= 2
+
+        # dark5 additionally ends with the SPP block.
+        self.dark5 = nn.Sequential(
+            *self.make_group_layer(width, blocks[3], stride=2),
+            *self.make_spp_block([width, width * 2], width * 2),
+        )
+
+    def make_group_layer(self,
+                         in_channels: int,
+                         num_blocks: int,
+                         stride: int = 1):
+        """Channel-doubling conv followed by `num_blocks` ResLayers.
+
+        Args:
+            in_channels: channels entering the conv; it emits twice as many.
+            num_blocks: number of ResLayers appended after the conv.
+            stride: stride of the leading conv.
+
+        Returns:
+            list of modules, ready to be unpacked into nn.Sequential.
+        """
+        out_channels = in_channels * 2
+        first = BaseConv(
+            in_channels, out_channels, ksize=3, stride=stride, act='lrelu')
+        return [first] + [ResLayer(out_channels) for _ in range(num_blocks)]
+
+    def make_spp_block(self, filters_list, in_filters):
+        """Build the conv / SPPBottleneck / conv stack closing `dark5`.
+
+        Args:
+            filters_list: pair (narrow, wide) of channel counts the stack
+                alternates between; the output has `narrow` channels.
+            in_filters: channels of the incoming feature map.
+        """
+        narrow, wide = filters_list[0], filters_list[1]
+        return nn.Sequential(
+            BaseConv(in_filters, narrow, 1, stride=1, act='lrelu'),
+            BaseConv(narrow, wide, 3, stride=1, act='lrelu'),
+            SPPBottleneck(
+                in_channels=wide, out_channels=narrow, activation='lrelu'),
+            BaseConv(narrow, wide, 3, stride=1, act='lrelu'),
+            BaseConv(wide, narrow, 1, stride=1, act='lrelu'),
+        )
+
+    def forward(self, x):
+        """Run all stages in order; keep those named in `out_features`."""
+        outputs = {}
+        for name in ('stem', 'dark2', 'dark3', 'dark4', 'dark5'):
+            x = getattr(self, name)(x)
+            if name in self.out_features:
+                outputs[name] = x
+        return outputs
+
+
+class CSPDarknet(nn.Module):
+    """CSP Darknet backbone: a Focus stem followed by `dark2` .. `dark5`.
+
+    Every stage starts with a stride-2 conv that widens the feature map
+    and ends with a CSPLayer; `dark5` inserts an SPPBottleneck in between.
+
+    Args:
+        dep_mul: depth multiplier; the base CSPLayer depth is
+            max(round(dep_mul * 3), 1).
+        wid_mul: width multiplier; the stem emits int(wid_mul * 64) maps.
+        out_features (Tuple[str]): names of the stages `forward` returns.
+        depthwise: use DWConv instead of BaseConv for the strided convs and
+            forward the flag to every CSPLayer.
+        act: activation name handed to all building blocks.
+    """
+
+    def __init__(
+        self,
+        dep_mul,
+        wid_mul,
+        out_features=('dark3', 'dark4', 'dark5'),
+        depthwise=False,
+        act='silu',
+    ):
+        super().__init__()
+        assert out_features, 'please provide output features of Darknet'
+        self.out_features = out_features
+        Conv = DWConv if depthwise else BaseConv
+
+        # 64 channels and depth 3 when both multipliers are 1.
+        base_channels = int(wid_mul * 64)
+        base_depth = max(round(dep_mul * 3), 1)
+
+        # Channel widths of dark2 .. dark5.
+        c2, c3, c4, c5 = (base_channels * mul for mul in (2, 4, 8, 16))
+
+        # stem
+        self.stem = Focus(3, base_channels, ksize=3, act=act)
+
+        # dark2
+        self.dark2 = nn.Sequential(
+            Conv(base_channels, c2, 3, 2, act=act),
+            CSPLayer(
+                c2, c2, n=base_depth, depthwise=depthwise, act=act),
+        )
+
+        # dark3 (three times the base depth)
+        self.dark3 = nn.Sequential(
+            Conv(c2, c3, 3, 2, act=act),
+            CSPLayer(
+                c3, c3, n=base_depth * 3, depthwise=depthwise, act=act),
+        )
+
+        # dark4 (three times the base depth)
+        self.dark4 = nn.Sequential(
+            Conv(c3, c4, 3, 2, act=act),
+            CSPLayer(
+                c4, c4, n=base_depth * 3, depthwise=depthwise, act=act),
+        )
+
+        # dark5: the only stage whose CSPLayer is built with shortcut=False.
+        self.dark5 = nn.Sequential(
+            Conv(c4, c5, 3, 2, act=act),
+            SPPBottleneck(c5, c5, activation=act),
+            CSPLayer(
+                c5, c5, n=base_depth, shortcut=False, depthwise=depthwise,
+                act=act),
+        )
+
+    def forward(self, x):
+        """Run the stem and the four stages in order.
+
+        Args:
+            x: input tensor fed to the stem.
+
+        Returns:
+            dict mapping each stage name listed in `out_features` to its
+            feature map.
+        """
+        outputs = {}
+        for name in ('stem', 'dark2', 'dark3', 'dark4', 'dark5'):
+            x = getattr(self, name)(x)
+            if name in self.out_features:
+                outputs[name] = x
+        return outputs
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py b/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py
new file mode 100644
index 00000000..fd15c1c1
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py
@@ -0,0 +1,213 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import torch
+import torch.nn as nn
+
+
+def get_activation(name='silu', inplace=True):
+    """Build the 'silu', 'relu' or 'lrelu' (slope 0.1) activation module."""
+    acts = {'silu': nn.SiLU, 'relu': nn.ReLU, 'lrelu': nn.LeakyReLU}
+    if name not in acts:
+        raise AttributeError('Unsupported act type: {}'.format(name))
+    return acts[name](*([0.1] if name == 'lrelu' else []), inplace=inplace)
+
+
+class BaseConv(nn.Module):
+    """Conv2d -> BatchNorm2d -> activation block (activation set by `act`)."""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize,
+                 stride,
+                 groups=1,
+                 bias=False,
+                 act='silu'):
+        super(BaseConv, self).__init__()
+        # "same"-style padding: (ksize - 1) // 2 on each side
+        pad = (ksize - 1) // 2
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=ksize,
+            stride=stride,
+            padding=pad,
+            groups=groups,
+            bias=bias,
+        )
+        self.bn = nn.BatchNorm2d(out_channels)
+        self.act = get_activation(act, inplace=True)  # always in-place
+
+    def forward(self, x):
+        return self.act(self.bn(self.conv(x)))  # conv -> bn -> act
+
+    def fuseforward(self, x):
+        return self.act(self.conv(x))  # bypasses self.bn (presumably fused)
+
+
+class DWConv(nn.Module):
+    """Depthwise conv (groups=in_channels) followed by a 1x1 pointwise conv."""
+
+    def __init__(self, in_channels, out_channels, ksize, stride=1, act='silu'):
+        super(DWConv, self).__init__()
+        self.dconv = BaseConv(
+            in_channels,
+            in_channels,
+            ksize=ksize,
+            stride=stride,
+            groups=in_channels,  # one filter group per input channel
+            act=act,
+        )
+        self.pconv = BaseConv(
+            in_channels, out_channels, ksize=1, stride=1, groups=1, act=act)
+
+    def forward(self, x):
+        x = self.dconv(x)  # spatial conv, channel count unchanged
+        return self.pconv(x)  # 1x1 conv maps to out_channels
+
+
+class Bottleneck(nn.Module):
+    # 1x1 conv to hidden_channels, then 3x3 conv, with an optional skip add
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        shortcut=True,
+        expansion=0.5,
+        depthwise=False,
+        act='silu',
+    ):
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)
+        Conv = DWConv if depthwise else BaseConv  # only conv2 is switched
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, 1, stride=1, act=act)
+        self.conv2 = Conv(hidden_channels, out_channels, 3, stride=1, act=act)
+        self.use_add = shortcut and in_channels == out_channels
+
+    def forward(self, x):
+        y = self.conv2(self.conv1(x))
+        if self.use_add:  # needs shortcut and equal in/out channels
+            y = y + x
+        return y
+
+
+class ResLayer(nn.Module):
+    'Residual layer: 1x1 conv to in_channels // 2, 3x3 conv back, plus skip.'
+
+    def __init__(self, in_channels: int):
+        super().__init__()
+        mid_channels = in_channels // 2
+        self.layer1 = BaseConv(
+            in_channels, mid_channels, ksize=1, stride=1, act='lrelu')
+        self.layer2 = BaseConv(
+            mid_channels, in_channels, ksize=3, stride=1, act='lrelu')
+
+    def forward(self, x):
+        out = self.layer2(self.layer1(x))
+        return x + out  # identity skip; layer2 restores in_channels
+
+
+class SPPBottleneck(nn.Module):
+    """SPP layer (as in YOLOv3-SPP): 1x1 conv, max-pools, concat, 1x1 conv."""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_sizes=(5, 9, 13),
+                 activation='silu'):
+        super().__init__()
+        hidden_channels = in_channels // 2  # conv1 halves the channels
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, 1, stride=1, act=activation)
+        self.m = nn.ModuleList([
+            nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
+            for ks in kernel_sizes  # one stride-1 pool per kernel size
+        ])
+        conv2_channels = hidden_channels * (len(kernel_sizes) + 1)
+        self.conv2 = BaseConv(
+            conv2_channels, out_channels, 1, stride=1, act=activation)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = torch.cat([x] + [m(x) for m in self.m], dim=1)  # x + pooled x
+        x = self.conv2(x)
+        return x
+
+
+class CSPLayer(nn.Module):
+    """C3 in yolov5, CSP Bottleneck with 3 convolutions"""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        n=1,
+        shortcut=True,
+        expansion=0.5,
+        depthwise=False,
+        act='silu',
+    ):
+        """
+        Args:
+            in_channels (int): input channels.
+            out_channels (int): output channels.
+            n (int): number of Bottlenecks. Default value: 1.
+        """
+        # conv1 and conv2 are two parallel 1x1 branches; conv3 merges them
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)  # width per branch
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, 1, stride=1, act=act)
+        self.conv2 = BaseConv(
+            in_channels, hidden_channels, 1, stride=1, act=act)
+        self.conv3 = BaseConv(
+            2 * hidden_channels, out_channels, 1, stride=1, act=act)
+        module_list = [
+            Bottleneck(
+                hidden_channels,
+                hidden_channels,
+                shortcut,
+                1.0,  # expansion: bottleneck keeps the branch width
+                depthwise,
+                act=act) for _ in range(n)
+        ]
+        self.m = nn.Sequential(*module_list)
+
+    def forward(self, x):
+        x_1 = self.conv1(x)
+        x_2 = self.conv2(x)
+        x_1 = self.m(x_1)  # bottleneck stack runs on branch 1 only
+        x = torch.cat((x_1, x_2), dim=1)  # 2 * hidden_channels
+        return self.conv3(x)
+
+
+class Focus(nn.Module):
+    """Focus width and height information into channel space."""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize=1,
+                 stride=1,
+                 act='silu'):
+        super().__init__()
+        self.conv = BaseConv(
+            in_channels * 4, out_channels, ksize, stride, act=act)
+
+    def forward(self, x):
+        # stride-2 slicing halves the last two dims; dim 1 grows 4x
+        patch_top_left = x[..., ::2, ::2]
+        patch_top_right = x[..., ::2, 1::2]
+        patch_bot_left = x[..., 1::2, ::2]
+        patch_bot_right = x[..., 1::2, 1::2]
+        x = torch.cat(
+            (
+                patch_top_left,
+                patch_bot_left,
+                patch_top_right,
+                patch_bot_right,
+            ),
+            dim=1,
+        )
+        return self.conv(x)
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_fpn.py b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_fpn.py
new file mode 100644
index 00000000..0cbebb09
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_fpn.py
@@ -0,0 +1,80 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import torch
+import torch.nn as nn
+
+from .darknet import Darknet
+from .network_blocks import BaseConv
+
+
+class YOLOFPN(nn.Module):
+    """
+    YOLOFPN module. Darknet 53 is the default backbone of this model.
+    """
+
+    def __init__(
+        self,
+        depth=53,
+        in_features=['dark3', 'dark4', 'dark5'],  # shared default: no mutation
+    ):
+        super(YOLOFPN, self).__init__()
+
+        self.backbone = Darknet(depth)
+        self.in_features = in_features
+
+        # branch 1: reduce the last map (512 -> 256), fuse with the middle one
+        self.out1_cbl = self._make_cbl(512, 256, 1)
+        self.out1 = self._make_embedding([256, 512], 512 + 256)
+
+        # branch 2: reduce branch 1 output (256 -> 128), fuse with first map
+        self.out2_cbl = self._make_cbl(256, 128, 1)
+        self.out2 = self._make_embedding([128, 256], 256 + 128)
+
+        # 2x nearest-neighbour upsampling shared by both branches
+        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
+
+    def _make_cbl(self, _in, _out, ks):
+        return BaseConv(_in, _out, ks, stride=1, act='lrelu')
+
+    def _make_embedding(self, filters_list, in_filters):
+        m = nn.Sequential(*[
+            self._make_cbl(in_filters, filters_list[0], 1),
+            self._make_cbl(filters_list[0], filters_list[1], 3),
+            self._make_cbl(filters_list[1], filters_list[0], 1),
+            self._make_cbl(filters_list[0], filters_list[1], 3),
+            self._make_cbl(filters_list[1], filters_list[0], 1),
+        ])
+        return m  # alternating 1x1 / 3x3 convs ending at filters_list[0]
+
+    def load_pretrained_model(self, filename='./weights/darknet53.mix.pth'):
+        with open(filename, 'rb') as f:
+            state_dict = torch.load(f, map_location='cpu')
+        print('loading pretrained weights...')
+        self.backbone.load_state_dict(state_dict)  # backbone weights only
+
+    def forward(self, inputs):
+        """
+        Args:
+            inputs (Tensor): input image batch, passed to the backbone.
+
+        Returns:
+            Tuple[Tensor]: (out_dark3, out_dark4, x0), finest map first.
+        """
+        #  backbone
+        out_features = self.backbone(inputs)
+        x2, x1, x0 = [out_features[f] for f in self.in_features]
+
+        #  yolo branch 1
+        x1_in = self.out1_cbl(x0)
+        x1_in = self.upsample(x1_in)
+        x1_in = torch.cat([x1_in, x1], 1)
+        out_dark4 = self.out1(x1_in)
+
+        #  yolo branch 2
+        x2_in = self.out2_cbl(out_dark4)
+        x2_in = self.upsample(x2_in)
+        x2_in = torch.cat([x2_in, x2], 1)
+        out_dark3 = self.out2(x2_in)
+
+        outputs = (out_dark3, out_dark4, x0)  # x0 is returned unmodified
+        return outputs
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_head.py b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_head.py
new file mode 100644
index 00000000..1eef93a4
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_head.py
@@ -0,0 +1,182 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..utils import bboxes_iou, meshgrid
+from .network_blocks import BaseConv, DWConv
+
+
+class YOLOXHead(nn.Module):
+
+    def __init__(
+        self,
+        num_classes,
+        width=1.0,
+        strides=[8, 16, 32],
+        in_channels=[256, 512, 1024],
+        act='silu',
+        depthwise=False,
+    ):
+        """
+        Args:
+            act (str): activation type of conv. Default value: "silu".
+            depthwise (bool): whether to apply depthwise conv in conv branch. Default value: False.
+        """
+        super(YOLOXHead, self).__init__()
+
+        self.n_anchors = 1
+        self.num_classes = num_classes
+        self.decode_in_inference = True  # for deploy, set to False
+
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        self.cls_preds = nn.ModuleList()
+        self.reg_preds = nn.ModuleList()
+        self.obj_preds = nn.ModuleList()
+        self.stems = nn.ModuleList()
+        Conv = DWConv if depthwise else BaseConv
+
+        for i in range(len(in_channels)):
+            self.stems.append(
+                BaseConv(
+                    in_channels=int(in_channels[i] * width),
+                    out_channels=int(256 * width),
+                    ksize=1,
+                    stride=1,
+                    act=act,
+                ))
+            self.cls_convs.append(
+                nn.Sequential(*[
+                    Conv(
+                        in_channels=int(256 * width),
+                        out_channels=int(256 * width),
+                        ksize=3,
+                        stride=1,
+                        act=act,
+                    ),
+                    Conv(
+                        in_channels=int(256 * width),
+                        out_channels=int(256 * width),
+                        ksize=3,
+                        stride=1,
+                        act=act,
+                    ),
+                ]))
+            self.reg_convs.append(
+                nn.Sequential(*[
+                    Conv(
+                        in_channels=int(256 * width),
+                        out_channels=int(256 * width),
+                        ksize=3,
+                        stride=1,
+                        act=act,
+                    ),
+                    Conv(
+                        in_channels=int(256 * width),
+                        out_channels=int(256 * width),
+                        ksize=3,
+                        stride=1,
+                        act=act,
+                    ),
+                ]))
+            self.cls_preds.append(
+                nn.Conv2d(
+                    in_channels=int(256 * width),
+                    out_channels=self.n_anchors * self.num_classes,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                ))
+            self.reg_preds.append(
+                nn.Conv2d(
+                    in_channels=int(256 * width),
+                    out_channels=4,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                ))
+            self.obj_preds.append(
+                nn.Conv2d(
+                    in_channels=int(256 * width),
+                    out_channels=self.n_anchors * 1,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                ))
+
+        self.use_l1 = False
+        self.l1_loss = nn.L1Loss(reduction='none')
+        self.bcewithlog_loss = nn.BCEWithLogitsLoss(reduction='none')
+        # self.iou_loss = IOUloss(reduction="none")
+        self.strides = strides
+        self.grids = [torch.zeros(1)] * len(in_channels)
+
+    def initialize_biases(self, prior_prob):
+        for conv in self.cls_preds:
+            b = conv.bias.view(self.n_anchors, -1)
+            b.data.fill_(-math.log((1 - prior_prob) / prior_prob))  # logit
+            conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
+
+        for conv in self.obj_preds:
+            b = conv.bias.view(self.n_anchors, -1)
+            b.data.fill_(-math.log((1 - prior_prob) / prior_prob))  # logit
+            conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
+
+    def forward(self, xin, labels=None, imgs=None):
+        """Predict per-anchor boxes and scores from the levels in `xin`.
+
+        Inference only: training mode raises NotImplementedError because no
+        loss branch exists here. `labels` and `imgs` are accepted but unused.
+        """
+        if self.training:
+            # fail fast: there is no training branch to build outputs from
+            raise NotImplementedError('Training is not supported yet!')
+
+        outputs = []
+        for k, (cls_conv, reg_conv, _stride, x) in enumerate(
+                zip(self.cls_convs, self.reg_convs, self.strides, xin)):
+            x = self.stems[k](x)
+            cls_feat = cls_conv(x)
+            cls_output = self.cls_preds[k](cls_feat)
+
+            # box and objectness predictions share the regression tower
+            reg_feat = reg_conv(x)
+            reg_output = self.reg_preds[k](reg_feat)
+            obj_output = self.obj_preds[k](reg_feat)
+
+            # channel layout: 4 box values, objectness, class scores
+            outputs.append(
+                torch.cat([
+                    reg_output,
+                    obj_output.sigmoid(),
+                    cls_output.sigmoid()
+                ], 1))
+
+        self.hw = [x.shape[-2:] for x in outputs]
+        # [batch, n_anchors_all, 4 + 1 + num_classes]
+        outputs = torch.cat([x.flatten(start_dim=2) for x in outputs],
+                            dim=2).permute(0, 2, 1)
+        if self.decode_in_inference:
+            return self.decode_outputs(outputs, dtype=xin[0].type())
+        return outputs
+
+    def decode_outputs(self, outputs, dtype):
+        grids = []  # per level: (1, h * w, 2) cell coordinates as (x, y)
+        strides = []  # per level: (1, h * w, 1) filled with the stride
+        for (hsize, wsize), stride in zip(self.hw, self.strides):
+            yv, xv = meshgrid([torch.arange(hsize), torch.arange(wsize)])
+            grid = torch.stack((xv, yv), 2).view(1, -1, 2)
+            grids.append(grid)
+            shape = grid.shape[:2]
+            strides.append(torch.full((*shape, 1), stride))
+
+        grids = torch.cat(grids, dim=1).type(dtype)
+        strides = torch.cat(strides, dim=1).type(dtype)
+
+        outputs[..., :2] = (outputs[..., :2] + grids) * strides  # centers
+        outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides  # sizes
+        return outputs  # same tensor, decoded in place
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_pafpn.py b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_pafpn.py
new file mode 100644
index 00000000..cd4258bf
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_pafpn.py
@@ -0,0 +1,126 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import torch
+import torch.nn as nn
+
+from .darknet import CSPDarknet
+from .network_blocks import BaseConv, CSPLayer, DWConv
+
+
+class YOLOPAFPN(nn.Module):
+    """
+    Top-down plus bottom-up feature fusion on a CSPDarknet backbone.
+    """
+
+    def __init__(
+        self,
+        depth=1.0,
+        width=1.0,
+        in_features=('dark3', 'dark4', 'dark5'),
+        in_channels=[256, 512, 1024],
+        depthwise=False,
+        act='silu',
+    ):
+        super(YOLOPAFPN, self).__init__()
+        self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act)
+        self.in_features = in_features
+        self.in_channels = in_channels
+        Conv = DWConv if depthwise else BaseConv  # used by bu_conv1/2 only
+
+        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
+        self.lateral_conv0 = BaseConv(
+            int(in_channels[2] * width),
+            int(in_channels[1] * width),
+            1,
+            1,
+            act=act)
+        self.C3_p4 = CSPLayer(
+            int(2 * in_channels[1] * width),
+            int(in_channels[1] * width),
+            round(3 * depth),  # n: number of Bottlenecks
+            False,  # shortcut
+            depthwise=depthwise,
+            act=act,
+        )  # input: concat of upsampled lateral_conv0 output and x1
+
+        self.reduce_conv1 = BaseConv(
+            int(in_channels[1] * width),
+            int(in_channels[0] * width),
+            1,
+            1,
+            act=act)
+        self.C3_p3 = CSPLayer(
+            int(2 * in_channels[0] * width),
+            int(in_channels[0] * width),
+            round(3 * depth),  # n: number of Bottlenecks
+            False,  # shortcut
+            depthwise=depthwise,
+            act=act,
+        )
+
+        # bottom-up conv (stride 2) from the first output level
+        self.bu_conv2 = Conv(
+            int(in_channels[0] * width),
+            int(in_channels[0] * width),
+            3,
+            2,
+            act=act)
+        self.C3_n3 = CSPLayer(
+            int(2 * in_channels[0] * width),
+            int(in_channels[1] * width),
+            round(3 * depth),  # n: number of Bottlenecks
+            False,  # shortcut
+            depthwise=depthwise,
+            act=act,
+        )
+
+        # bottom-up conv (stride 2) from the second output level
+        self.bu_conv1 = Conv(
+            int(in_channels[1] * width),
+            int(in_channels[1] * width),
+            3,
+            2,
+            act=act)
+        self.C3_n4 = CSPLayer(
+            int(2 * in_channels[1] * width),
+            int(in_channels[2] * width),
+            round(3 * depth),  # n: number of Bottlenecks
+            False,  # shortcut
+            depthwise=depthwise,
+            act=act,
+        )
+
+    def forward(self, input):
+        """
+        Args:
+            input: input images.
+
+        Returns:
+            Tuple[Tensor]: (pan_out2, pan_out1, pan_out0) fused features.
+        """
+
+        #  backbone
+        out_features = self.backbone(input)
+        features = [out_features[f] for f in self.in_features]
+        [x2, x1, x0] = features
+
+        fpn_out0 = self.lateral_conv0(x0)  # 1024->512/32
+        f_out0 = self.upsample(fpn_out0)  # 512/16
+        f_out0 = torch.cat([f_out0, x1], 1)  # 512->1024/16
+        f_out0 = self.C3_p4(f_out0)  # 1024->512/16
+
+        fpn_out1 = self.reduce_conv1(f_out0)  # 512->256/16
+        f_out1 = self.upsample(fpn_out1)  # 256/8
+        f_out1 = torch.cat([f_out1, x2], 1)  # 256->512/8
+        pan_out2 = self.C3_p3(f_out1)  # 512->256/8
+
+        p_out1 = self.bu_conv2(pan_out2)  # 256->256/16
+        p_out1 = torch.cat([p_out1, fpn_out1], 1)  # 256->512/16
+        pan_out1 = self.C3_n3(p_out1)  # 512->512/16
+
+        p_out0 = self.bu_conv1(pan_out1)  # 512->512/32
+        p_out0 = torch.cat([p_out0, fpn_out0], 1)  # 512->1024/32
+        pan_out0 = self.C3_n4(p_out0)  # 1024->1024/32
+
+        outputs = (pan_out2, pan_out1, pan_out0)
+        return outputs
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/yolox.py b/modelscope/models/cv/realtime_object_detection/yolox/models/yolox.py
new file mode 100644
index 00000000..181c368b
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/models/yolox.py
@@ -0,0 +1,33 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import torch.nn as nn
+
+from .yolo_head import YOLOXHead
+from .yolo_pafpn import YOLOPAFPN
+
+
+class YOLOX(nn.Module):
+    """
+    YOLOX model module: a feature-pyramid `backbone` followed by a `head`.
+    Only inference is implemented: forward raises NotImplementedError when
+    the module is in training mode and returns the head output otherwise.
+    """
+
+    def __init__(self, backbone=None, head=None):
+        super(YOLOX, self).__init__()
+        if backbone is None:
+            backbone = YOLOPAFPN()  # default neck + backbone
+        if head is None:
+            head = YOLOXHead(80)  # default head: 80 classes
+
+        self.backbone = backbone
+        self.head = head
+
+    def forward(self, x, targets=None):
+        fpn_outs = self.backbone(x)  # runs before the training check below
+        if self.training:
+            raise NotImplementedError('Training is not supported yet!')
+        else:
+            outputs = self.head(fpn_outs)  # `targets` is never used
+
+        return outputs
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/utils/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/utils/__init__.py
new file mode 100644
index 00000000..2c1ea489
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/utils/__init__.py
@@ -0,0 +1,5 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from .boxes import *  # noqa
+
+__all__ = ['bboxes_iou', 'meshgrid', 'postprocess', 'xyxy2cxcywh', 'xyxy2xywh']
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/utils/boxes.py b/modelscope/models/cv/realtime_object_detection/yolox/utils/boxes.py
new file mode 100644
index 00000000..b29a3a04
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/utils/boxes.py
@@ -0,0 +1,107 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import torch
+import torchvision
+
+_TORCH_VER = [int(x) for x in torch.__version__.split('.')[:2]]
+
+
+def meshgrid(*tensors):
+    # the `indexing` keyword is only passed from torch 1.10 on
+    if _TORCH_VER < [1, 10]:
+        return torch.meshgrid(*tensors)
+    return torch.meshgrid(*tensors, indexing='ij')
+
+
+def xyxy2xywh(bboxes):
+    # in place: (x1, y1, x2, y2) -> (x1, y1, w, h); returns the same object
+    bboxes[:, 2:4] = bboxes[:, 2:4] - bboxes[:, 0:2]
+    return bboxes
+
+
+def xyxy2cxcywh(bboxes):
+    # in place: (x1, y1, x2, y2) -> (cx, cy, w, h); returns the same object
+    wh = bboxes[:, 2:4] - bboxes[:, 0:2]
+    bboxes[:, 2:4] = wh
+    bboxes[:, 0:2] = bboxes[:, 0:2] + wh * 0.5
+    return bboxes
+
+
+def postprocess(prediction,
+                num_classes,
+                conf_thre=0.7,
+                nms_thre=0.45,
+                class_agnostic=False):
+    """Filter raw predictions by confidence and apply NMS per image.
+
+    Args:
+        prediction (Tensor): (batch, n_boxes, 5 + num_classes) rows of
+            (cx, cy, w, h, obj_conf, class scores...). The box columns are
+            converted to (x1, y1, x2, y2) IN PLACE.
+        num_classes (int): number of class score columns to consider.
+        conf_thre (float): minimum obj_conf * class_conf to keep a box.
+        nms_thre (float): IoU threshold passed to NMS.
+        class_agnostic (bool): run NMS across classes instead of per class.
+
+    Returns:
+        list: per image, None when nothing is kept, else an (n, 7) tensor
+            of (x1, y1, x2, y2, obj_conf, class_conf, class_pred) rows.
+    """
+    half_wh = prediction[:, :, 2:4] / 2
+    center = prediction[:, :, :2].clone()
+    prediction[:, :, :2] = center - half_wh
+    prediction[:, :, 2:4] = center + half_wh
+
+    output = [None for _ in range(len(prediction))]
+    for i, image_pred in enumerate(prediction):
+        # If none are remaining => process next image
+        if not image_pred.size(0):
+            continue
+        # Get score and class with highest confidence
+        class_conf, class_pred = torch.max(
+            image_pred[:, 5:5 + num_classes], 1, keepdim=True)
+
+        # squeeze only dim 1 so a single-row image keeps a 1-D mask
+        conf_mask = image_pred[:, 4] * class_conf.squeeze(1) >= conf_thre
+        detections = torch.cat(
+            (image_pred[:, :5], class_conf, class_pred.float()), 1)
+        detections = detections[conf_mask]
+        if not detections.size(0):
+            continue
+
+        scores = detections[:, 4] * detections[:, 5]
+        if class_agnostic:
+            nms_out_index = torchvision.ops.nms(detections[:, :4], scores,
+                                                nms_thre)
+        else:
+            nms_out_index = torchvision.ops.batched_nms(
+                detections[:, :4], scores, detections[:, 6], nms_thre)
+        output[i] = detections[nms_out_index]
+
+    return output
+
+
+def bboxes_iou(bboxes_a, bboxes_b, xyxy=True):
+    """Pairwise IoU of (N, 4) boxes against (M, 4) boxes, shaped (N, M).
+
+    Boxes are (x1, y1, x2, y2) if `xyxy` is true, else (cx, cy, w, h).
+    Raises IndexError when either input does not have exactly 4 columns.
+    """
+    if not (bboxes_a.shape[1] == 4 and bboxes_b.shape[1] == 4):
+        raise IndexError
+    if xyxy:
+        tl_a, br_a = bboxes_a[:, :2], bboxes_a[:, 2:]
+        tl_b, br_b = bboxes_b[:, :2], bboxes_b[:, 2:]
+        area_a = torch.prod(br_a - tl_a, 1)
+        area_b = torch.prod(br_b - tl_b, 1)
+    else:
+        half_a, half_b = bboxes_a[:, 2:] / 2, bboxes_b[:, 2:] / 2
+        tl_a, br_a = bboxes_a[:, :2] - half_a, bboxes_a[:, :2] + half_a
+        tl_b, br_b = bboxes_b[:, :2] - half_b, bboxes_b[:, :2] + half_b
+        area_a = torch.prod(bboxes_a[:, 2:], 1)
+        area_b = torch.prod(bboxes_b[:, 2:], 1)
+    tl = torch.max(tl_a[:, None], tl_b)
+    br = torch.min(br_a[:, None], br_b)
+    en = (tl < br).type(tl.type()).prod(dim=2)  # 1 only where boxes overlap
+    area_i = torch.prod(br - tl, 2) * en
+    return area_i / (area_a[:, None] + area_b - area_i)
diff --git a/modelscope/models/cv/salient_detection/models/__init__.py b/modelscope/models/cv/salient_detection/models/__init__.py
index 0850c33d..8ea7a5d3 100644
--- a/modelscope/models/cv/salient_detection/models/__init__.py
+++ b/modelscope/models/cv/salient_detection/models/__init__.py
@@ -1 +1,3 @@
+# The implementation is adopted from U-2-Net, made publicly available under the Apache 2.0 License
+# source code available via https://github.com/xuebinqin/U-2-Net
 from .u2net import U2NET
diff --git a/modelscope/models/cv/salient_detection/models/u2net.py b/modelscope/models/cv/salient_detection/models/u2net.py
index 0a0a4511..05dbf7ad 100644
--- a/modelscope/models/cv/salient_detection/models/u2net.py
+++ b/modelscope/models/cv/salient_detection/models/u2net.py
@@ -1,4 +1,5 @@
-# Implementation in this file is modifed from source code avaiable via https://github.com/xuebinqin/U-2-Net
+# The implementation is adopted from U-2-Net, made publicly available under the Apache 2.0 License
+# source code available via https://github.com/xuebinqin/U-2-Net
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/modelscope/models/cv/salient_detection/salient_model.py b/modelscope/models/cv/salient_detection/salient_model.py
index 539d1f24..73c3c3fb 100644
--- a/modelscope/models/cv/salient_detection/salient_model.py
+++ b/modelscope/models/cv/salient_detection/salient_model.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 
 import cv2
@@ -13,7 +14,8 @@ from modelscope.utils.constant import ModelFile, Tasks
 from .models import U2NET
 
 
-@MODELS.register_module(Tasks.image_segmentation, module_name=Models.detection)
+@MODELS.register_module(
+    Tasks.semantic_segmentation, module_name=Models.detection)
 class SalientDetection(TorchModel):
 
     def __init__(self, model_dir: str, *args, **kwargs):
diff --git a/modelscope/models/cv/shop_segmentation/__init__.py b/modelscope/models/cv/shop_segmentation/__init__.py
new file mode 100644
index 00000000..072628bd
--- /dev/null
+++ b/modelscope/models/cv/shop_segmentation/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .shop_seg_base import SHOPSEG
+
+else:
+    _import_structure = {'shop_seg_base': ['SHOPSEG']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/shop_segmentation/common.py b/modelscope/models/cv/shop_segmentation/common.py
new file mode 100644
index 00000000..8cb940a5
--- /dev/null
+++ b/modelscope/models/cv/shop_segmentation/common.py
@@ -0,0 +1,57 @@
+# Base modules are adapted from https://github.com/open-mmlab/mmcv/,
+# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+# https://github.com/open-mmlab/mmsegmentation/,
+# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
+# and adapted from https://github.com/raoyongming/DenseCLIP/,
+# originally MIT License, Copyright (c) 2022 Rao, Yongming.
+
+import warnings
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def resize(input,
+           size=None,
+           scale_factor=None,
+           mode='nearest',
+           align_corners=None,
+           warning=True):
+    """F.interpolate wrapper that warns on align_corners size mismatches."""
+    if warning and size is not None and align_corners:
+        in_h, in_w = tuple(int(d) for d in input.shape[2:])
+        out_h, out_w = tuple(int(d) for d in size)
+        grows = out_h > in_h or out_w > in_w
+        # min() > 1 keeps the modulo below away from a zero divisor
+        if (grows and min(out_h, out_w, in_h, in_w) > 1
+                and (out_h - 1) % (in_h - 1) and (out_w - 1) % (in_w - 1)):
+            warnings.warn(
+                f'When align_corners={align_corners}, '
+                'the output would more aligned if '
+                f'input size {(in_h, in_w)} is `x+1` and '
+                f'out size {(out_h, out_w)} is `nx+1`')
+    return F.interpolate(input, size, scale_factor, mode, align_corners)
+
+
+class Upsample(nn.Module):
+    """Resize to `size` if set, else scale the last two dims by scale_factor."""
+    def __init__(self,
+                 size=None,
+                 scale_factor=None,
+                 mode='nearest',
+                 align_corners=None):
+        super(Upsample, self).__init__()
+        self.size = size
+        if isinstance(scale_factor, tuple):
+            self.scale_factor = tuple(float(factor) for factor in scale_factor)
+        else:
+            self.scale_factor = float(scale_factor) if scale_factor else None
+        self.mode = mode
+        self.align_corners = align_corners
+
+    def forward(self, x):
+        size, sf = self.size, self.scale_factor
+        if not size:
+            sf = sf if isinstance(sf, tuple) else (sf, sf)  # scalar: both dims
+            size = [int(t * f) for t, f in zip(x.shape[-2:], sf)]
+        return resize(x, size, None, self.mode, self.align_corners)
diff --git a/modelscope/models/cv/shop_segmentation/head_fpn.py b/modelscope/models/cv/shop_segmentation/head_fpn.py
new file mode 100644
index 00000000..cad389c7
--- /dev/null
+++ b/modelscope/models/cv/shop_segmentation/head_fpn.py
@@ -0,0 +1,120 @@
+# Base modules are adapted from https://github.com/open-mmlab/mmcv/,
+# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+# https://github.com/open-mmlab/mmsegmentation/,
+# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
+# and adapted from https://github.com/raoyongming/DenseCLIP/,
+# originally MIT License, Copyright (c) 2022 Rao, Yongming.
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from timm.models.layers import drop, drop_path, trunc_normal_
+
+from .common import Upsample, resize
+
+
+class FPNHead(nn.Module):
+    """Panoptic Feature Pyramid Networks.
+    This head is the implementation of `Semantic FPN
+    <https://arxiv.org/abs/1901.02446>`_.
+    Args:
+        channels (int): Channels of every input level and of the fused map.
+        num_classes (int): Output channels of the 1x1 classifier conv.
+        dropout_ratio (float): Dropout2d ratio before the classifier;
+            values <= 0 disable dropout.
+        feature_strides (list[int]): The strides for input feature maps.
+            All strides are supposed to be powers of 2. The first one is
+            of the largest resolution.
+        align_corners (bool): Passed to every bilinear upsampling call.
+    """
+
+    def __init__(self, channels, num_classes, dropout_ratio=0.1,
+                 feature_strides=[4, 8, 16, 32], align_corners=False,
+                 **kwargs):
+        super(FPNHead, self).__init__()
+        self.act_cfg = dict(type='ReLU')
+        self.channels = channels
+        self.conv_cfg = None
+        self.norm_cfg = dict(type='BN2d', requires_grad=True)
+        self.align_corners = align_corners
+        self.dropout_ratio = dropout_ratio
+        self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1)
+        if dropout_ratio > 0:
+            self.dropout = nn.Dropout2d(dropout_ratio)
+        else:
+            self.dropout = None
+        self.in_index = [0, 1, 2, 3]
+        assert min(feature_strides) == feature_strides[0]
+        # Copy so the shared default list is never aliased by an instance.
+        self.feature_strides = list(feature_strides)
+        self.scale_heads = nn.ModuleList()
+        for i in range(len(feature_strides)):
+            # One conv(+2x upsample) stage per octave above the first stride.
+            head_length = max(
+                1,
+                int(np.log2(feature_strides[i]) - np.log2(feature_strides[0])))
+            scale_head = []
+            for _ in range(head_length):
+                scale_head.append(
+                    ConvModule(
+                        self.channels,
+                        self.channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg))
+                if feature_strides[i] != feature_strides[0]:
+                    scale_head.append(
+                        Upsample(
+                            scale_factor=2,
+                            mode='bilinear',
+                            align_corners=self.align_corners))
+            self.scale_heads.append(nn.Sequential(*scale_head))
+
+        self.apply(self._init_weights)
+
+    def _transform_inputs(self, inputs):
+        """Select the feature levels listed in ``self.in_index``.
+
+        Args:
+            inputs (list[Tensor]): List of multi-level img features.
+        Returns:
+            list[Tensor]: The selected levels, in ``in_index`` order.
+        """
+        return [inputs[i] for i in self.in_index]
+
+    def cls_seg(self, feat):
+        """Classify each pixel."""
+        if self.dropout is not None:
+            feat = self.dropout(feat)
+        return self.conv_seg(feat)
+
+    def forward(self, inputs):
+        """Sum all scale-head outputs at the first level's size, classify."""
+        x = self._transform_inputs(inputs)
+        output = self.scale_heads[0](x[0])
+        for i in range(1, len(self.feature_strides)):
+            # non inplace
+            output = output + resize(
+                self.scale_heads[i](x[i]),
+                size=output.shape[2:],
+                mode='bilinear',
+                align_corners=self.align_corners)
+
+        return self.cls_seg(output)
+
+    def _init_weights(self, m):
+        """Per-module initializer, applied recursively via ``self.apply``."""
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+        elif isinstance(m, nn.Conv2d):
+            nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu')
+            if m.bias is not None:
+                nn.init.constant_(m.bias.data, 0)
diff --git a/modelscope/models/cv/shop_segmentation/models.py b/modelscope/models/cv/shop_segmentation/models.py
new file mode 100644
index 00000000..3880d074
--- /dev/null
+++ b/modelscope/models/cv/shop_segmentation/models.py
@@ -0,0 +1,899 @@
+# Base modules are adapted from https://github.com/open-mmlab/mmcv/,
+# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+# https://github.com/open-mmlab/mmsegmentation/,
+# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
+# and adapted from https://github.com/raoyongming/DenseCLIP/,
+# originally MIT License, Copyright (c) 2022 Rao, Yongming.
+
+import math
+from collections import OrderedDict
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+from timm.models.layers import drop, drop_path, trunc_normal_
+from torch import nn
+
+
+class Bottleneck(nn.Module):
+    """Residual bottleneck block: 1x1 conv -> 3x3 conv -> 1x1 conv.
+
+    The block outputs ``planes * expansion`` channels.  Spatial reduction
+    for ``stride > 1`` is done by average pooling, never by a strided conv.
+    """
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1):
+        super().__init__()
+        widened = planes * self.expansion
+
+        # Main path; the pool sits after the second conv when stride > 1.
+        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+
+        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+
+        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
+        self.conv3 = nn.Conv2d(planes, widened, 1, bias=False)
+        self.bn3 = nn.BatchNorm2d(widened)
+
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = None
+        self.stride = stride
+
+        if stride > 1 or inplanes != planes * Bottleneck.expansion:
+            # Shortcut branch: average pool first, then a stride-1 1x1 conv
+            # and batch norm so its output matches the main path.
+            shortcut_layers = OrderedDict()
+            shortcut_layers['-1'] = nn.AvgPool2d(stride)
+            shortcut_layers['0'] = nn.Conv2d(
+                inplanes, widened, 1, stride=1, bias=False)
+            shortcut_layers['1'] = nn.BatchNorm2d(widened)
+            self.downsample = nn.Sequential(shortcut_layers)
+
+    def forward(self, x: torch.Tensor):
+        """Return ``relu(main_path(x) + shortcut(x))``."""
+        y = self.relu(self.bn1(self.conv1(x)))
+        y = self.relu(self.bn2(self.conv2(y)))
+        y = self.avgpool(y)
+        y = self.bn3(self.conv3(y))
+
+        # Identity shortcut unless a projection was built in __init__.
+        shortcut = x if self.downsample is None else self.downsample(x)
+
+        y += shortcut
+        return self.relu(y)
+
+
+class AttentionPool2d(nn.Module):
+    """Attention pooling over a 2-D feature map with a prepended mean token.
+
+    A token holding the spatial mean is prepended to the flattened map, a
+    learned positional embedding (bilinearly resized to the input grid) is
+    added, and one multi-head self-attention pass is run over all tokens.
+    """
+
+    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int,
+                 output_dim: int = None):
+        super().__init__()
+        init_pos = torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5
+        self.positional_embedding = nn.Parameter(init_pos)
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
+        self.num_heads = num_heads
+        self.embed_dim = embed_dim
+        self.spacial_dim = spacial_dim
+
+    def forward(self, x):
+        """Return (mean-token feature, (B, -1, H, W) map) for NCHW input."""
+        B, C, H, W = x.shape
+        tokens = x.reshape(B, C, H * W).permute(2, 0, 1)  # NCHW -> (HW)NC
+        mean_token = tokens.mean(dim=0, keepdim=True)
+        tokens = torch.cat([mean_token, tokens], dim=0)  # (HW+1)NC
+
+        # Resize the learned spatial grid of embeddings to the input's H x W.
+        side = self.spacial_dim
+        grid = self.positional_embedding[1:, ].reshape(
+            1, side, side, self.embed_dim).permute(0, 3, 1, 2)
+        grid = F.interpolate(grid, size=(H, W), mode='bilinear')
+        spatial_pos = grid.reshape(self.embed_dim, H * W).permute(1, 0)
+        cls_pos = self.positional_embedding[0:1, :]
+        positional_embedding = torch.cat([cls_pos, spatial_pos], dim=0)
+
+        tokens = tokens + positional_embedding[:, None, :]
+        qkv_bias = torch.cat(
+            [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias])
+        attended, _ = F.multi_head_attention_forward(
+            query=tokens,
+            key=tokens,
+            value=tokens,
+            embed_dim_to_check=tokens.shape[-1],
+            num_heads=self.num_heads,
+            use_separate_proj_weight=True,
+            q_proj_weight=self.q_proj.weight,
+            k_proj_weight=self.k_proj.weight,
+            v_proj_weight=self.v_proj.weight,
+            in_proj_weight=None,
+            in_proj_bias=qkv_bias,
+            bias_k=None,
+            bias_v=None,
+            add_zero_attn=False,
+            dropout_p=0,
+            out_proj_weight=self.c_proj.weight,
+            out_proj_bias=self.c_proj.bias,
+            training=self.training,
+            need_weights=False)
+
+        attended = attended.permute(1, 2, 0)  # (HW+1)NC -> NC(HW+1)
+        return attended[:, :, 0], attended[:, :, 1:].reshape(B, -1, H, W)
+
+
+class CLIPResNet(nn.Module):
+    """
+    ResNet backbone that differs from torchvision's in these ways:
+    - The stem has 3 convolutions instead of 1 and ends with an average
+      pool instead of a max pool.
+    - Stages with stride > 1 downsample by average pooling (see Bottleneck).
+    - There is no final pooling layer: forward returns the four stage
+      feature maps.
+    """
+
+    def __init__(self,
+                 layers,
+                 output_dim=512,
+                 input_resolution=224,
+                 width=64,
+                 pretrained=None,
+                 **kwargs):
+        super().__init__()
+        self.pretrained = pretrained
+        self.output_dim = output_dim
+        self.input_resolution = input_resolution
+        half = width // 2
+
+        # Stem: three 3x3 conv/BN pairs; only the first conv is strided.
+        self.conv1 = nn.Conv2d(
+            3, half, kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(half)
+        self.conv2 = nn.Conv2d(
+            half, half, kernel_size=3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(half)
+        self.conv3 = nn.Conv2d(
+            half, width, kernel_size=3, padding=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(width)
+        self.avgpool = nn.AvgPool2d(2)
+        self.relu = nn.ReLU(inplace=True)
+
+        # Residual stages; _make_layer advances self._inplanes as it goes.
+        self._inplanes = width
+        self.layer1 = self._make_layer(width, layers[0])
+        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
+        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
+        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
+
+    def init_weights(self, pretrained=None):
+        """Load the ``visual.*`` weights of a TorchScript checkpoint, if any.
+
+        Does nothing unless ``pretrained`` (or ``self.pretrained``) is a str.
+        """
+        pretrained = pretrained or self.pretrained
+        if not isinstance(pretrained, str):
+            return
+        source = torch.jit.load(
+            pretrained, map_location='cpu').float().state_dict()
+
+        # Keep only the visual tower and drop its key prefix.
+        state_dict = {
+            name.replace('visual.', ''): tensor
+            for name, tensor in source.items() if name.startswith('visual.')
+        }
+
+        # Non-strict load: report mismatched keys instead of raising.
+        u, w = self.load_state_dict(state_dict, False)
+        print(u, w, 'are misaligned params in CLIPResNet')
+
+    def _make_layer(self, planes, blocks, stride=1):
+        """Build one stage of ``blocks`` Bottlenecks; only the first strides."""
+        stage = [Bottleneck(self._inplanes, planes, stride)]
+        self._inplanes = planes * Bottleneck.expansion
+        stage.extend(
+            Bottleneck(self._inplanes, planes) for _ in range(1, blocks))
+
+        return nn.Sequential(*stage)
+
+    def forward(self, x):
+        """Return a tuple with the outputs of layer1..layer4, in order."""
+        x = x.type(self.conv1.weight.dtype)
+        # Stem: conv/BN/ReLU three times, then a 2x2 average pool.
+        for conv, bn in ((self.conv1, self.bn1), (self.conv2, self.bn2),
+                         (self.conv3, self.bn3)):
+            x = self.relu(bn(conv(x)))
+        x = self.avgpool(x)
+
+        outs = []
+        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
+            x = stage(x)
+            outs.append(x)
+
+        return tuple(outs)
+
+
+class CLIPResNetWithAttention(nn.Module):
+    """
+    A ResNet class that is similar to torchvision's but contains the following changes:
+    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
+    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
+    - The final pooling layer is a QKV attention (AttentionPool2d); forward appends its [global, local] pair to the four stage maps
+    """
+
+    def __init__(self,
+                 layers,
+                 output_dim=1024,
+                 input_resolution=224,
+                 width=64,
+                 pretrained=None,
+                 **kwargs):
+        super().__init__()
+        self.pretrained = pretrained
+        self.output_dim = output_dim
+        self.input_resolution = input_resolution
+
+        # the 3-layer stem (only conv1 is strided; avgpool is applied after it in forward)
+        self.conv1 = nn.Conv2d(
+            3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(width // 2)
+        self.conv2 = nn.Conv2d(
+            width // 2, width // 2, kernel_size=3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(width // 2)
+        self.conv3 = nn.Conv2d(
+            width // 2, width, kernel_size=3, padding=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(width)
+        self.avgpool = nn.AvgPool2d(2)
+        self.relu = nn.ReLU(inplace=True)
+
+        # residual layers
+        self._inplanes = width  # this is a *mutable* variable used during construction
+        self.layer1 = self._make_layer(width, layers[0])
+        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
+        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
+        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
+        # attention pool over layer4's output: grid side input_resolution // 32, 32 heads
+        embed_dim = width * 32  # the ResNet feature dimension
+        self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, 32,
+                                        output_dim)
+
+    def init_weights(self, pretrained=None):
+        pretrained = pretrained or self.pretrained
+        if isinstance(pretrained, str):
+            checkpoint = torch.jit.load(
+                pretrained, map_location='cpu').float().state_dict()
+
+            state_dict = {}
+            # Keep only the 'visual.' entries, stored without that prefix.
+            for k in checkpoint.keys():
+                if k.startswith('visual.'):
+                    new_k = k.replace('visual.', '')
+                    state_dict[new_k] = checkpoint[k]
+                    # Resize a positional embedding whose shape differs from attnpool's.
+                    if 'positional_embedding' in new_k:
+                        if self.attnpool.positional_embedding.shape != state_dict[
+                                new_k].shape:
+                            print(
+                                f'Resize the pos_embed shape from {state_dict[new_k].shape}'
+                                f' to {self.attnpool.positional_embedding.shape}'
+                            )
+                            cls_pos = state_dict[new_k][0:1, :]
+                            H = W = self.input_resolution // 32
+                            old_h = int(
+                                math.sqrt(state_dict[new_k][1:, ].shape[0]))
+                            spatial_pos = F.interpolate(
+                                state_dict[new_k][1:, ].reshape(
+                                    1, old_h, old_h,
+                                    cls_pos.shape[1]).permute(0, 3, 1, 2),
+                                size=(H, W),
+                                mode='bilinear')
+                            spatial_pos = spatial_pos.reshape(
+                                cls_pos.shape[1], H * W).permute(1, 0)
+                            positional_embedding = torch.cat(
+                                [cls_pos, spatial_pos], dim=0)
+                            state_dict[new_k] = positional_embedding
+                            assert self.attnpool.positional_embedding.shape == state_dict[
+                                new_k].shape
+            # Non-strict load: mismatched keys are returned (and printed), not raised.
+            u, w = self.load_state_dict(state_dict, False)
+            print(u, w, 'are misaligned params in CLIPResNet')
+
+    def _make_layer(self, planes, blocks, stride=1):
+        layers = [Bottleneck(self._inplanes, planes, stride)]
+        # the remaining blocks take the expanded width and the default stride of 1
+        self._inplanes = planes * Bottleneck.expansion
+        for _ in range(1, blocks):
+            layers.append(Bottleneck(self._inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+
+        def stem(x):
+            for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2),
+                             (self.conv3, self.bn3)]:
+                x = self.relu(bn(conv(x)))
+            x = self.avgpool(x)
+            return x
+
+        x = x.type(self.conv1.weight.dtype)
+        x = stem(x)
+        # collect the output of every residual stage
+        outs = []
+        x = self.layer1(x)
+        outs.append(x)
+        x = self.layer2(x)
+        outs.append(x)
+        x = self.layer3(x)
+        outs.append(x)
+        x = self.layer4(x)
+        outs.append(x)
+        # attention-pool layer4's output into a global feature and a feature map
+        x_global, x_local = self.attnpool(x)
+        outs.append([x_global, x_local])
+
+        return tuple(outs)
+
+
+class LayerNorm(nn.LayerNorm):
+    """LayerNorm computed in float32 and returned in the input's dtype."""
+
+    def forward(self, x: torch.Tensor):
+        # Upcast for the normalisation, then cast back (e.g. for fp16 input).
+        normed = super().forward(x.type(torch.float32))
+        return normed.type(x.dtype)
+
+
+class QuickGELU(nn.Module):
+
+    def forward(self, x: torch.Tensor):
+        return torch.sigmoid(x * 1.702) * x  # x gated by sigmoid(1.702 * x)
+
+
+class DropPath(nn.Module):
+    """Module wrapper around ``drop_path`` (stochastic depth per sample),
+    meant for the main path of residual blocks."""
+
+    def __init__(self, drop_prob=None):
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        return 'p={}'.format(self.drop_prob)
+
+
+class ResidualAttentionBlock(nn.Module):
+    """Pre-norm transformer block: self-attention then MLP, both residual."""
+
+    def __init__(self, d_model: int, n_head: int,
+                 attn_mask: torch.Tensor = None, drop_path=0.):
+        super().__init__()
+
+        self.attn = nn.MultiheadAttention(d_model, n_head)
+        self.ln_1 = LayerNorm(d_model)
+        hidden = d_model * 4
+        mlp_layers = OrderedDict()
+        mlp_layers['c_fc'] = nn.Linear(d_model, hidden)
+        mlp_layers['gelu'] = QuickGELU()
+        mlp_layers['c_proj'] = nn.Linear(hidden, d_model)
+        self.mlp = nn.Sequential(mlp_layers)
+        self.ln_2 = LayerNorm(d_model)
+        self.attn_mask = attn_mask
+
+        # Stochastic depth on both residual branches; identity at rate 0.
+        self.drop_path = (DropPath(drop_path)
+                          if drop_path > 0. else nn.Identity())
+
+    def attention(self, x: torch.Tensor):
+        """Self-attention on ``x`` with the mask cast to x's dtype/device."""
+        if self.attn_mask is not None:
+            self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device)
+        out = self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)
+        return out[0]
+
+    def forward(self, x: torch.Tensor):
+        x = x + self.drop_path(self.attention(self.ln_1(x)))
+        return x + self.drop_path(self.mlp(self.ln_2(x)))
+
+
+class Transformer(nn.Module):
+    """Stack of ``layers`` ResidualAttentionBlocks whose drop-path rates
+    are spaced linearly from 0 to ``drop_path_rate``."""
+
+    def __init__(self, width: int, layers: int, heads: int,
+                 attn_mask: torch.Tensor = None, drop_path_rate=0.):
+        super().__init__()
+        self.width = width
+        self.layers = layers
+        # stochastic depth decay rule
+        rates = torch.linspace(0, drop_path_rate, layers)
+        blocks = []
+        for rate in rates:
+            blocks.append(
+                ResidualAttentionBlock(width, heads, attn_mask, rate.item()))
+        self.resblocks = nn.Sequential(*blocks)
+
+    def forward(self, x: torch.Tensor):
+        """Run ``x`` through every block in order."""
+        return self.resblocks(x)
+
+
+class Attention(nn.Module):
+    """Multi-head attention with separate q/k/v projections.
+
+    Args:
+        dim: channel size of q, k and v; split evenly across heads.
+        num_heads: number of attention heads.
+        qkv_bias: whether the q/k/v projections have a bias.
+        qk_scale: score multiplier; defaults to ``head_dim ** -0.5``.
+        attn_drop: dropout rate on the attention weights.
+        proj_drop: dropout rate on the projected output.
+    """
+
+    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None,
+                 attn_drop=0., proj_drop=0.):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+
+        self.q_proj = nn.Linear(dim, dim, bias=qkv_bias)
+        self.k_proj = nn.Linear(dim, dim, bias=qkv_bias)
+        self.v_proj = nn.Linear(dim, dim, bias=qkv_bias)
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, q, k, v):
+        """Attend q (B, N, C) over k/v (B, M, C); returns (B, N, C)."""
+        B, N, C = q.shape
+        assert k.shape == v.shape
+        B, M, C = k.shape
+        q = self.q_proj(q).reshape(B, N, self.num_heads, C // self.num_heads)
+        k = self.k_proj(k).reshape(B, M, self.num_heads, C // self.num_heads)
+        v = self.v_proj(v).reshape(B, M, self.num_heads, C // self.num_heads)
+        attn = torch.einsum('bnkc,bmkc->bknm', q, k) * self.scale
+        # Drop attention weights (a no-op at the default rate of 0).
+        attn = self.attn_drop(attn.softmax(dim=-1))
+        x = torch.einsum('bknm,bmkc->bnkc', attn, v).reshape(B, N, C)
+        return self.proj_drop(self.proj(x))
+
+
+class TransformerDecoderLayer(nn.Module):
+    """Pre-norm decoder layer: self-attention, cross-attention to ``mem``,
+    then an MLP, each added back to the input residually."""
+
+    def __init__(self, d_model, nhead, dropout=0.1):
+        super().__init__()
+        self.self_attn = Attention(d_model, nhead, proj_drop=dropout)
+        self.cross_attn = Attention(d_model, nhead, proj_drop=dropout)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(dropout)
+
+        hidden = d_model * 4
+        self.mlp = nn.Sequential(
+            nn.Linear(d_model, hidden),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden, d_model),
+        )
+
+    def forward(self, x, mem):
+        normed = self.norm1(x)
+        x = x + self.self_attn(normed, normed, normed)
+        # Queries come from x; keys and values come from mem.
+        x = x + self.cross_attn(self.norm2(x), mem, mem)
+        return x + self.dropout(self.mlp(self.norm3(x)))
+
+
+class CLIPVisionTransformer(nn.Module):
+    """CLIP ViT backbone that returns multi-scale feature maps."""
+
+    def __init__(self, input_resolution=224, patch_size=32, width=768,
+                 layers=12, heads=12, output_dim=512, drop_path_rate=0.0,
+                 out_indices=[3, 5, 7, 11], pretrained=None,
+                 get_embeddings=False, **kwargs):
+        super().__init__()
+        self.pretrained = pretrained
+        self.input_resolution = input_resolution
+        self.output_dim = output_dim
+        self.conv1 = nn.Conv2d(
+            in_channels=3,
+            out_channels=width,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias=False)
+
+        scale = width**-0.5
+        self.class_embedding = nn.Parameter(scale * torch.randn(width))
+        self.positional_embedding = nn.Parameter(scale * torch.randn(
+            (input_resolution // patch_size)**2 + 1, width))
+        self.spatial_size = input_resolution // patch_size
+        self.ln_pre = LayerNorm(width)
+        self.get_embeddings = get_embeddings
+
+        self.transformer = Transformer(
+            width, layers, heads, drop_path_rate=drop_path_rate)
+
+        self.out_indices = out_indices
+
+        if get_embeddings:
+            self.ln_post = LayerNorm(width)
+            self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
+
+        embed_dim = width
+        # NOTE: fpn1..fpn4 (used by forward) are only built for patch_size 16 or 8.
+        if patch_size == 16:
+            self.fpn1 = nn.Sequential(
+                nn.GroupNorm(1, embed_dim),
+                nn.ConvTranspose2d(
+                    embed_dim, embed_dim, kernel_size=2, stride=2),
+                nn.BatchNorm2d(embed_dim),
+                nn.GELU(),
+                nn.ConvTranspose2d(
+                    embed_dim, embed_dim, kernel_size=2, stride=2),
+            )
+
+            self.fpn2 = nn.Sequential(
+                nn.GroupNorm(1, embed_dim),
+                nn.ConvTranspose2d(
+                    embed_dim, embed_dim, kernel_size=2, stride=2),
+            )
+
+            self.fpn3 = nn.GroupNorm(1, embed_dim)
+
+            self.fpn4 = nn.Sequential(
+                nn.GroupNorm(1, embed_dim),
+                nn.MaxPool2d(kernel_size=2, stride=2))
+
+        elif patch_size == 8:
+            self.fpn1 = nn.Sequential(
+                nn.GroupNorm(1, embed_dim),
+                nn.ConvTranspose2d(
+                    embed_dim, embed_dim, kernel_size=2, stride=2),
+            )
+
+            self.fpn2 = nn.GroupNorm(1, embed_dim)
+
+            self.fpn3 = nn.Sequential(
+                nn.GroupNorm(1, embed_dim),
+                nn.MaxPool2d(kernel_size=2, stride=2),
+            )
+
+            self.fpn4 = nn.Sequential(
+                nn.GroupNorm(1, embed_dim),
+                nn.MaxPool2d(kernel_size=4, stride=4),
+            )
+
+    def init_weights(self, pretrained=None):
+        """Load ``visual.*`` weights from a TorchScript checkpoint path.
+
+        A positional embedding of a different shape is resized bilinearly
+        to this model's grid.  Does nothing if no str path is available.
+        """
+        pretrained = pretrained or self.pretrained
+        if isinstance(pretrained, str):
+            checkpoint = torch.jit.load(
+                pretrained, map_location='cpu').float().state_dict()
+
+            state_dict = {}
+
+            for k in checkpoint.keys():
+                if k.startswith('visual.'):
+                    new_k = k.replace('visual.', '')
+                    state_dict[new_k] = checkpoint[k]
+
+            old_pos = state_dict.get('positional_embedding')
+            if (old_pos is not None
+                    and self.positional_embedding.shape != old_pos.shape):
+                print(f'Resize the pos_embed shape from {old_pos.shape} to'
+                      f' {self.positional_embedding.shape}')
+                # Infer the checkpoint's grid side and width from the tensor
+                # itself rather than assuming a 14x14 grid of width 768.
+                old_size = int(math.sqrt(old_pos.shape[0] - 1))
+                width = old_pos.shape[1]
+                new_size = self.spatial_size
+                spatial_pos = F.interpolate(
+                    old_pos[1:].reshape(1, old_size, old_size,
+                                        width).permute(0, 3, 1, 2),
+                    size=(new_size, new_size),
+                    mode='bilinear')
+                spatial_pos = spatial_pos.reshape(
+                    width, new_size * new_size).permute(1, 0)
+                new_pos = torch.cat([old_pos[0:1, :], spatial_pos], dim=0)
+                state_dict['positional_embedding'] = new_pos
+                assert self.positional_embedding.shape == new_pos.shape
+
+            u, w = self.load_state_dict(state_dict, False)
+            print(u, w, 'are misaligned params in vision transformer')
+
+    def forward(self, x: torch.Tensor):
+        """Return a tuple of fpn1..fpn4 outputs for the blocks listed in
+        out_indices, plus [global, visual] embeddings if get_embeddings."""
+        x = self.conv1(x)  # shape = [*, width, grid, grid]
+        B, C, H, W = x.shape
+        x = x.reshape(x.shape[0], x.shape[1],
+                      -1)  # shape = [*, width, grid ** 2]
+        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+        x1 = self.class_embedding.to(x.dtype)
+        x2 = torch.zeros(
+            x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
+        x = torch.cat([x1 + x2, x], dim=1)
+        pos = self.positional_embedding.to(x.dtype)
+        cls_pos = pos[0, :] + self.class_embedding.to(x.dtype)
+        spatial_pos = F.interpolate(
+            pos[1:, ].reshape(1, self.spatial_size, self.spatial_size,
+                              C).permute(0, 3, 1, 2),
+            size=(H, W),
+            mode='bilinear')
+        spatial_pos = spatial_pos.reshape(1, C, H * W).permute(0, 2, 1)
+        pos = torch.cat([cls_pos.reshape(1, 1, C), spatial_pos], dim=1)
+        x = x + pos
+        x = self.ln_pre(x)
+        x = x.permute(1, 0, 2)  # NLD -> LND
+
+        gradientcheckpoint = False
+
+        features = []
+        for i, blk in enumerate(self.transformer.resblocks):
+            if gradientcheckpoint:
+                x = checkpoint.checkpoint(blk, x)
+            else:
+                x = blk(x)
+
+            if i in self.out_indices:
+                xp = x.permute(1, 0, 2)[:,
+                                        1:, :].permute(0, 2,
+                                                       1).reshape(B, -1, H, W)
+                features.append(xp.contiguous())
+
+        ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
+        for i in range(len(features)):
+            features[i] = ops[i](features[i])
+
+        if self.get_embeddings:
+            x = x.permute(1, 0, 2)
+            x = self.ln_post(x)
+            x = x @ self.proj
+
+            global_embedding = x[:, 0]
+            visual_embedding = x[:, 1:].reshape(B, H, W,
+                                                -1).permute(0, 3, 1,
+                                                            2)  # B C H W
+
+            features.append([global_embedding, visual_embedding])
+
+        return tuple(features)
+
+
+class CLIPTextEncoder(nn.Module):
+
+    def __init__(self,
+                 context_length=77,
+                 vocab_size=49408,
+                 transformer_width=512,
+                 transformer_heads=8,
+                 transformer_layers=12,
+                 embed_dim=1024,
+                 out_dim=256,
+                 pretrained=None,
+                 **kwargs):
+        super().__init__()
+
+        self.pretrained = pretrained
+
+        self.context_length = context_length
+
+        self.transformer = Transformer(
+            width=transformer_width,
+            layers=transformer_layers,
+            heads=transformer_heads,
+            attn_mask=self.build_attention_mask())
+
+        self.vocab_size = vocab_size
+        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
+        self.positional_embedding = nn.Parameter(
+            torch.empty(self.context_length, transformer_width))
+        self.ln_final = LayerNorm(transformer_width)
+        self.text_projection = nn.Parameter(
+            torch.empty(transformer_width, embed_dim))
+
+    def init_weights(self, pretrained=None):
+        pretrained = pretrained or self.pretrained
+        if isinstance(pretrained, str):
+            checkpoint = torch.jit.load(
+                pretrained, map_location='cpu').float().state_dict()
+
+            state_dict = {}
+
+            for k in checkpoint.keys():
+                if k.startswith('transformer.'):
+                    state_dict[k] = checkpoint[k]
+
+                if k == 'positional_embedding' or k == 'text_projection' or k.startswith(
+                        'token_embedding') or k.startswith('ln_final'):
+                    if k == 'positional_embedding' and checkpoint[k].size(
+                            0) > self.context_length:
+                        checkpoint[k] = checkpoint[k][:self.context_length]
+                        print('positional_embedding is tuncated from 77 to',
+                              self.context_length)
+                    state_dict[k] = checkpoint[k]
+
+            u, w = self.load_state_dict(state_dict, False)
+            print(u, w, 'are misaligned params in text encoder')
+
+    def build_attention_mask(self):
+        # square (context_length x context_length) mask, built once in __init__ and passed to the Transformer
+        # pytorch uses additive attention mask; fill with -inf
+        mask = torch.empty(self.context_length, self.context_length)
+        mask.fill_(float('-inf'))
+        mask.triu_(1)  # keep -inf strictly above the diagonal, zero the diagonal and below
+        return mask
+
+    def forward(self, text):
+        x = self.token_embedding(text)
+        x = x + self.positional_embedding
+        x = x.permute(1, 0, 2)
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)
+        x = self.ln_final(x)
+        x = x[torch.arange(x.shape[0]),
+              text.argmax(dim=-1), ...] @ self.text_projection
+        return x
+
+
+class CLIPTextContextEncoder(nn.Module):
+
+    def __init__(self,
+                 context_length=22,
+                 vocab_size=49408,
+                 transformer_width=512,
+                 transformer_heads=8,
+                 transformer_layers=12,
+                 embed_dim=1024,
+                 out_dim=256,
+                 pretrained=None,
+                 **kwargs):
+        super().__init__()
+
+        self.pretrained = pretrained
+
+        self.context_length = context_length
+
+        self.transformer = Transformer(
+            width=transformer_width,
+            layers=transformer_layers,
+            heads=transformer_heads,
+            attn_mask=self.build_attention_mask())
+
+        self.embed_dim = embed_dim
+
+        self.vocab_size = vocab_size
+        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
+        self.positional_embedding = nn.Parameter(
+            torch.empty(self.context_length, transformer_width))
+        self.ln_final = LayerNorm(transformer_width)
+        self.text_projection = nn.Parameter(
+            torch.empty(transformer_width, embed_dim))
+
+    def init_weights(self, pretrained=None):
+        pretrained = pretrained or self.pretrained
+        if isinstance(pretrained, str):
+            checkpoint = torch.jit.load(
+                pretrained, map_location='cpu').float().state_dict()
+
+            state_dict = {}
+
+            for k in checkpoint.keys():
+                if k.startswith('transformer.'):
+                    state_dict[k] = checkpoint[k]
+
+                if k == 'positional_embedding' or k == 'text_projection' or k.startswith(
+                        'token_embedding') or k.startswith('ln_final'):
+                    if k == 'positional_embedding' and checkpoint[k].size(
+                            0) > self.context_length:
+                        checkpoint[k] = checkpoint[k][:self.context_length]
+                        print('positional_embedding is tuncated from 77 to',
+                              self.context_length)
+                    state_dict[k] = checkpoint[k]
+
+            u, w = self.load_state_dict(state_dict, False)
+            print(u, w, 'are misaligned params in text encoder')
+
+    def build_attention_mask(self):
+        # square (context_length x context_length) mask, built once in __init__ and passed to the Transformer
+        # pytorch uses additive attention mask; fill with -inf
+        mask = torch.empty(self.context_length, self.context_length)
+        mask.fill_(float('-inf'))
+        mask.triu_(1)  # keep -inf strictly above the diagonal, zero the diagonal and below
+        return mask
+
+    def forward(self, text, context=None):
+        x_text = self.token_embedding(text)  # (K prompts, N1 tokens, C channels)
+        K, N1, C = x_text.shape  # e.g. 150 classes * 5 tokens (count unconfirmed) * 512
+        B, N2, C = context.shape  # e.g. 1 * 8 * 512; fails if context is left as None
+        # per-prompt argmax position (presumably the end token - confirm), shifted by the N2 inserted slots
+        eos_indx = text.argmax(dim=-1) + N2
+        eos_indx = eos_indx.reshape(1, K).expand(B, K).reshape(-1)
+
+        x_text = x_text.reshape(1, K, N1, C).expand(B, K, N1, C)
+        context = context.reshape(B, 1, N2, C).expand(B, K, N2, C)
+        # splice the N2 context vectors in right after the first token of every prompt
+        x = torch.cat([x_text[:, :, 0:1], context, x_text[:, :, 1:]],
+                      dim=2).reshape(B * K, N1 + N2, C)
+        x = x + self.positional_embedding
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.ln_final(x)
+        x = x[torch.arange(x.shape[0]), eos_indx] @ self.text_projection
+        x = x.reshape(B, K, self.embed_dim)
+        return x
+
+
+class ContextDecoder(nn.Module):
+
+    def __init__(self,
+                 transformer_width=256,
+                 transformer_heads=4,
+                 transformer_layers=6,
+                 visual_dim=1024,
+                 dropout=0.1,
+                 **kwargs):
+        super().__init__()
+
+        self.memory_proj = nn.Sequential(
+            nn.LayerNorm(visual_dim),
+            nn.Linear(visual_dim, transformer_width),
+            nn.LayerNorm(transformer_width),
+        )
+
+        self.text_proj = nn.Sequential(
+            nn.LayerNorm(visual_dim),
+            nn.Linear(visual_dim, transformer_width),
+        )
+
+        self.decoder = nn.ModuleList([
+            TransformerDecoderLayer(transformer_width, transformer_heads,
+                                    dropout) for _ in range(transformer_layers)
+        ])
+
+        self.out_proj = nn.Sequential(
+            nn.LayerNorm(transformer_width),
+            nn.Linear(transformer_width, visual_dim))
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    def forward(self, text, visual):
+        B, N, C = visual.shape
+        visual = self.memory_proj(visual)
+        x = self.text_proj(text)
+
+        for layer in self.decoder:
+            x = layer(x, visual)
+
+        return self.out_proj(x)
diff --git a/modelscope/models/cv/shop_segmentation/neck_fpn.py b/modelscope/models/cv/shop_segmentation/neck_fpn.py
new file mode 100644
index 00000000..aa4d7159
--- /dev/null
+++ b/modelscope/models/cv/shop_segmentation/neck_fpn.py
@@ -0,0 +1,215 @@
+# Base modules are adapted from https://github.com/open-mmlab/mmcv/,
+# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+# https://github.com/open-mmlab/mmsegmentation/,
+# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
+# and adapted from https://github.com/raoyongming/DenseCLIP/,
+# originally MIT License, Copyright (c) 2022 Rao, Yongming.
+
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from timm.models.layers import drop, drop_path, trunc_normal_
+
+from .common import resize
+
+
+class FPN(nn.Module):
+    """Feature Pyramid Network.
+
+    This neck is the implementation of `Feature Pyramid Networks for Object
+    Detection <https://arxiv.org/abs/1612.03144>`_.
+
+    Args:
+        in_channels (list[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale).
+        num_outs (int): Number of output scales.
+        start_level (int): Index of the start input backbone level used to
+            build the feature pyramid. Default: 0.
+        end_level (int): Index of the end input backbone level (exclusive) to
+            build the feature pyramid. Default: -1, which means the last level.
+        add_extra_convs (bool | str): If bool, it decides whether to add conv
+            layers on top of the original feature maps. Default to False.
+            If True, its actual mode is specified by `extra_convs_on_inputs`.
+            If str, it specifies the source feature map of the extra convs.
+            Only the following options are allowed
+
+            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
+            - 'on_lateral': Last feature map after lateral convs.
+            - 'on_output': The last output feature map after fpn convs.
+        extra_convs_on_inputs (bool, deprecated): Whether to apply extra convs
+            on the original feature from the backbone. If True,
+            it is equivalent to `add_extra_convs='on_input'`. If False, it is
+            equivalent to set `add_extra_convs='on_output'`. Default to False.
+        relu_before_extra_convs (bool): Whether to apply relu before the extra
+            conv. Default: False.
+        no_norm_on_lateral (bool): Whether to apply norm on lateral.
+            Default: False.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        act_cfg (dict): Config dict for activation layer in ConvModule.
+            Default: None.
+        upsample_cfg (dict): Config dict for interpolate layer.
+            Default: dict(mode='nearest').
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs,
+                 start_level=0,
+                 end_level=-1,
+                 add_extra_convs=False,
+                 extra_convs_on_inputs=False,
+                 relu_before_extra_convs=False,
+                 no_norm_on_lateral=False,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 act_cfg=None,
+                 upsample_cfg=dict(mode='nearest')):
+        super(FPN, self).__init__()
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.num_outs = num_outs
+        self.relu_before_extra_convs = relu_before_extra_convs
+        self.no_norm_on_lateral = no_norm_on_lateral
+        self.fp16_enabled = False
+        self.upsample_cfg = upsample_cfg.copy()
+
+        if end_level == -1:
+            self.backbone_end_level = self.num_ins
+            assert num_outs >= self.num_ins - start_level
+        else:
+            # if end_level < inputs, no extra level is allowed
+            self.backbone_end_level = end_level
+            assert end_level <= len(in_channels)
+            assert num_outs == end_level - start_level
+        self.start_level = start_level
+        self.end_level = end_level
+        self.add_extra_convs = add_extra_convs
+        assert isinstance(add_extra_convs, (str, bool))
+        if isinstance(add_extra_convs, str):
+            # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
+            assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
+        elif add_extra_convs:  # True
+            if extra_convs_on_inputs:
+                # For compatibility with previous release
+                # TODO: deprecate `extra_convs_on_inputs`
+                self.add_extra_convs = 'on_input'
+            else:
+                self.add_extra_convs = 'on_output'
+
+        self.lateral_convs = nn.ModuleList()
+        self.fpn_convs = nn.ModuleList()
+
+        for i in range(self.start_level, self.backbone_end_level):
+            l_conv = ConvModule(
+                in_channels[i],
+                out_channels,
+                1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
+                act_cfg=act_cfg,
+                inplace=False)
+            fpn_conv = ConvModule(
+                out_channels,
+                out_channels,
+                3,
+                padding=1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg,
+                inplace=False)
+
+            self.lateral_convs.append(l_conv)
+            self.fpn_convs.append(fpn_conv)
+
+        # add extra conv layers (e.g., RetinaNet)
+        extra_levels = num_outs - self.backbone_end_level + self.start_level
+        if self.add_extra_convs and extra_levels >= 1:
+            for i in range(extra_levels):
+                if i == 0 and self.add_extra_convs == 'on_input':
+                    in_channels = self.in_channels[self.backbone_end_level - 1]
+                else:
+                    in_channels = out_channels
+                extra_fpn_conv = ConvModule(
+                    in_channels,
+                    out_channels,
+                    3,
+                    stride=2,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg,
+                    inplace=False)
+                self.fpn_convs.append(extra_fpn_conv)
+
+        self.apply(self._init_weights)
+
+    def forward(self, inputs):
+        assert len(inputs) == len(self.in_channels)
+
+        # build laterals
+        laterals = [
+            lateral_conv(inputs[i + self.start_level])
+            for i, lateral_conv in enumerate(self.lateral_convs)
+        ]
+
+        # build top-down path
+        used_backbone_levels = len(laterals)
+        for i in range(used_backbone_levels - 1, 0, -1):
+            # In some cases, fixing `scale factor` (e.g. 2) is preferred, but
+            #  it cannot co-exist with `size` in `F.interpolate`.
+            if 'scale_factor' in self.upsample_cfg:
+                laterals[i - 1] = laterals[i - 1] + resize(
+                    laterals[i], **self.upsample_cfg)
+            else:
+                prev_shape = laterals[i - 1].shape[2:]
+                laterals[i - 1] = laterals[i - 1] + resize(
+                    laterals[i], size=prev_shape, **self.upsample_cfg)
+
+        # build outputs
+        # part 1: from original levels
+        outs = [
+            self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
+        ]
+        # part 2: add extra levels
+        if self.num_outs > len(outs):
+            # use max pool to get more levels on top of outputs
+            # (e.g., Faster R-CNN, Mask R-CNN)
+            if not self.add_extra_convs:
+                for i in range(self.num_outs - used_backbone_levels):
+                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
+            # add conv layers on top of original feature maps (RetinaNet)
+            else:
+                if self.add_extra_convs == 'on_input':
+                    extra_source = inputs[self.backbone_end_level - 1]
+                elif self.add_extra_convs == 'on_lateral':
+                    extra_source = laterals[-1]
+                elif self.add_extra_convs == 'on_output':
+                    extra_source = outs[-1]
+                else:
+                    raise NotImplementedError
+                outs.append(self.fpn_convs[used_backbone_levels](extra_source))
+                for i in range(used_backbone_levels + 1, self.num_outs):
+                    if self.relu_before_extra_convs:
+                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
+                    else:
+                        outs.append(self.fpn_convs[i](outs[-1]))
+        return tuple(outs)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+        elif isinstance(m, nn.Conv2d):
+            nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu')
+            if m.bias is not None:
+                nn.init.constant_(m.bias.data, 0)
diff --git a/modelscope/models/cv/shop_segmentation/shop_seg_base.py b/modelscope/models/cv/shop_segmentation/shop_seg_base.py
new file mode 100644
index 00000000..34686370
--- /dev/null
+++ b/modelscope/models/cv/shop_segmentation/shop_seg_base.py
@@ -0,0 +1,155 @@
+# Base modules are adapted from https://github.com/open-mmlab/mmcv/,
+# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
+# https://github.com/open-mmlab/mmsegmentation/,
+# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
+# and adapted from https://github.com/raoyongming/DenseCLIP/,
+# originally MIT License, Copyright (c) 2022 Rao, Yongming.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .head_fpn import FPNHead
+from .models import (CLIPTextContextEncoder, CLIPVisionTransformer,
+                     ContextDecoder)
+from .neck_fpn import FPN
+from .utils import SimpleTokenizer, tokenize
+
+
+class SHOPSEG(nn.Module):
+    """Encoder Decoder segmentors.
+
+    EncoderDecoder typically consists of backbone, decode_head, auxiliary_head.
+    Note that auxiliary_head is only used for deep supervision during training,
+    which could be dumped during inference.
+    """
+
+    def __init__(self,
+                 model_dir,
+                 context_length=22,
+                 context_feature='attention',
+                 score_concat_index=2,
+                 tau=0.07,
+                 token_embed_dim=512,
+                 text_dim=512,
+                 **args):
+        super(SHOPSEG, self).__init__()
+
+        self.model_dir = model_dir
+        self.tokenizer = SimpleTokenizer(model_dir
+                                         + '/bpe_simple_vocab_16e6.txt.gz')
+
+        backbone = CLIPVisionTransformer(
+            input_resolution=1024,
+            patch_size=16,
+            width=768,
+            layers=12,
+            output_dim=512,
+            drop_path_rate=0.1,
+            pretrained=False,
+            get_embeddings=True)
+
+        text_encoder = CLIPTextContextEncoder(
+            context_length=30,
+            vocab_size=49408,
+            transformer_width=512,
+            transformer_heads=8,
+            transformer_layers=12,
+            embed_dim=512,
+            pretrained=False)
+
+        context_decoder = ContextDecoder(
+            transformer_width=256,
+            transformer_heads=4,
+            transformer_layers=3,
+            visual_dim=512,
+            dropout=0.1)
+        neck = FPN(
+            in_channels=[768, 768, 768 + 2, 768], out_channels=256, num_outs=4)
+        head_fpd = FPNHead(channels=256, num_classes=2)
+
+        self.backbone = backbone
+        self.text_encoder = text_encoder
+        self.context_decoder = context_decoder
+        self.context_length = context_length
+        self.score_concat_index = score_concat_index
+
+        self.context_feature = context_feature
+        self.tau = tau
+        context_length = self.text_encoder.context_length - self.context_length
+        self.contexts = nn.Parameter(
+            torch.randn(1, context_length, token_embed_dim))
+        nn.init.trunc_normal_(self.contexts)
+        self.gamma = nn.Parameter(torch.ones(text_dim) * 1e-4)
+
+        self.neck = neck
+        self.head_fpn = head_fpd
+
+        self.tau = 0.07
+
+    def encode_text(self, text, context_length):
+        output = tokenize(self.tokenizer, text, context_length, True)
+        return output
+
+    def extract_feat(self, img):
+        """Extract features from images."""
+        x = self.backbone(img)
+        return x
+
+    def after_extract_feat(self, x, name_list):
+        x_orig = list(x[0:4])  # first four entries of the backbone output
+        global_feat, visual_embeddings = x[4]  # fifth entry is unpacked as a (global, dense) pair
+        B, C, H, W = visual_embeddings.shape
+        if self.context_feature == 'attention':  # NOTE(review): visual_context is undefined for any other value
+            x1 = global_feat.reshape(B, C, 1)
+            x2 = visual_embeddings.reshape(B, C, H * W)
+            visual_context = torch.cat([x1, x2], dim=2).permute(0, 2, 1)  # (B, 1 + H*W, C)
+        texts = torch.cat([
+            self.encode_text(c, context_length=self.context_length)
+            for c in name_list
+        ])
+        x1 = texts.to(global_feat.device)
+        x1 = self.text_encoder(x1, self.contexts)
+        text_embeddings = x1.expand(B, -1, -1)
+        # update text_embeddings by visual_context!
+        # NOTE(review): text_diff is presumably (B, K, C) like text_embeddings - confirm
+        text_diff = self.context_decoder(text_embeddings, visual_context)
+        # (B, K, C)
+        text_embeddings = text_embeddings + self.gamma * text_diff
+
+        # compute score map and concat
+        B, K, C = text_embeddings.shape
+        visual_embeddings = F.normalize(visual_embeddings, dim=1, p=2)
+        text = F.normalize(text_embeddings, dim=2, p=2)
+        score_map_list = []
+        bsz = B
+        for i in range(bsz):
+            ind = 2 * i  # two prompts per image: forward() builds name_list as 'others', <name> pairs
+            sub_text = torch.cat(
+                [text[i:i + 1, ind:ind + 1], text[i:i + 1, ind + 1:ind + 2]],
+                dim=1)  # 1 * 2 * c
+
+            sub_score_map = torch.einsum('bchw,bkc->bkhw',
+                                         visual_embeddings[i:i + 1],
+                                         sub_text)  # 1 * 2 * h * w
+            score_map_list.append(sub_score_map)
+        score_map = torch.cat(score_map_list, dim=0)  # b * 2 * h * w
+        x_orig[self.score_concat_index] = torch.cat(
+            [x_orig[self.score_concat_index], score_map], dim=1)
+        return x_orig, score_map
+
+    def forward(self, img, text_list=None):
+        if text_list is None:
+            bsz = img.size()[0]
+            text_list = ['foregeound'] * bsz
+        x = self.extract_feat(img)
+        _x_orig = [x[i] for i in range(4)]
+        name_list = []
+        for name in text_list:
+            name_list.append('others')
+            name_list.append(name[0:20])
+        x_orig, score_map = self.after_extract_feat(x, name_list)
+        x_orig = list(self.neck(x_orig))
+        _x_orig = x_orig
+        pred = self.head_fpn(_x_orig)
+        return pred
diff --git a/modelscope/models/cv/shop_segmentation/shop_seg_model.py b/modelscope/models/cv/shop_segmentation/shop_seg_model.py
new file mode 100644
index 00000000..ac0d67fa
--- /dev/null
+++ b/modelscope/models/cv/shop_segmentation/shop_seg_model.py
@@ -0,0 +1,117 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os.path as osp
+from typing import Any, Dict
+
+import json
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from PIL import Image
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.shop_segmentation import SHOPSEG
+from modelscope.outputs import OutputKeys
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['ShopSegmentation']
+
+
+@MODELS.register_module(
+    Tasks.shop_segmentation, module_name=Models.shop_segmentation)
+class ShopSegmentation(TorchModel):
+    """ shop segmentation model.
+    """
+
+    def __init__(self, model_dir, device_id=0, *args, **kwargs):
+        super().__init__(
+            model_dir=model_dir, device_id=device_id, *args, **kwargs)
+
+        self.model = SHOPSEG(model_dir=model_dir)
+        pretrained_params = torch.load(
+            '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
+            map_location='cpu')
+        self.model.load_state_dict(pretrained_params)
+        self.model.eval()
+        if device_id >= 0 and torch.cuda.is_available():
+            self.model.to('cuda:{}'.format(device_id))
+            logger.info('Use GPU: {}'.format(device_id))
+        else:
+            device_id = -1
+            logger.info('Use CPU for inference')
+        self.device_id = device_id
+
+    def preprocess(self, img, size=1024):
+        mean = [0.48145466, 0.4578275, 0.40821073]
+        std = [0.26862954, 0.26130258, 0.27577711]
+        h, w, c = img.shape
+        max_hw = max(h, w)
+        ratio = 1.0 * size / max_hw
+        crop_h, crop_w = int(ratio * h), int(ratio * w)
+        pil_img = Image.fromarray(img)
+        pil_img = pil_img.resize((crop_w, crop_h), Image.BILINEAR)
+        np_img = np.array(pil_img, dtype=np.float32) / 255.
+
+        for j in range(3):
+            np_img[:, :, j] = (np_img[:, :, j] - mean[j]) / std[j]
+
+        img_pad = np.zeros((size, size, 3), dtype=np.float32)
+        img_pad[:crop_h, :crop_w] = np_img
+
+        img_pad = torch.from_numpy(img_pad).permute(2, 0,
+                                                    1).unsqueeze(0).float()
+        return img_pad, h, w, crop_h, crop_w
+
+    def postprocess(self, tensors, crop_h, crop_w, ori_h, ori_w):
+        output = np.clip(tensors * 255., a_min=0, a_max=255.)
+        crop_output = np.array(output[:crop_h, :crop_w], dtype=np.uint8)
+
+        pil_output = Image.fromarray(crop_output)
+        pil_output = pil_output.resize((ori_w, ori_h), Image.BILINEAR)
+        np_output = np.array(pil_output, dtype=np.uint8)
+
+        np_output[np_output < 128] = 0
+        np_output[np_output >= 128] = 255
+        np_output = np.uint8(np_output)
+        return np_output
+
+    def forward(self, image):
+        """Segment one image and return ``{OutputKeys.MASKS: mask}``.
+        image should be numpy array, dtype=np.uint8, shape: height*width*3
+        """
+        image_tensor, ori_h, ori_w, crop_h, crop_w = self.preprocess(
+            image, size=1024)
+        pred = self.inference(image_tensor)
+        # postprocess() takes no `size` argument; passing size=1024 raised TypeError
+        msk = self.postprocess(pred, crop_h, crop_w, ori_h, ori_w)
+        outputs = {OutputKeys.MASKS: msk}
+        return outputs
+
+    def inference(self, image):
+        """
+        image should be tensor, 1 * 3 * 1024 * 1024
+        """
+        with torch.no_grad():
+            if self.device_id == -1:
+                output = self.model(image)
+            else:
+                device = torch.device('cuda', self.device_id)
+                output = self.model(image.to(device))
+            output = F.interpolate(output, size=(1024, 1024), mode='bilinear')
+            output = F.softmax(output, dim=1)
+            output = torch.argmax(output, dim=1)
+            output = output[0]
+            if self.device_id == -1:
+                pred = output.data.numpy()
+            else:
+                pred = output.data.cpu().numpy()
+
+            del output
+        return pred
diff --git a/modelscope/models/cv/shop_segmentation/utils.py b/modelscope/models/cv/shop_segmentation/utils.py
new file mode 100644
index 00000000..4035b0ef
--- /dev/null
+++ b/modelscope/models/cv/shop_segmentation/utils.py
@@ -0,0 +1,198 @@
+# CLIP Tokenizer
+# Adapted from https://github.com/openai/CLIP.
+# Originally MIT License, Copyright (c) 2021 OpenAI.
+
+import gzip
+import html
+import os
+from functools import lru_cache
+from typing import Any, List, Union
+
+import ftfy
+import regex as re
+import torch
+
+
+@lru_cache()
+def default_bpe():
+    return os.path.join(
+        os.path.dirname(os.path.abspath(__file__)),
+        'bpe_simple_vocab_16e6.txt.gz')
+
+
+@lru_cache()
+def bytes_to_unicode():
+    """
+    Returns a dict mapping every byte value (0-255) to a single unicode character.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord('!'),
+                    ord('~') + 1)) + list(range(
+                        ord('¡'),
+                        ord('¬') + 1)) + list(range(ord('®'),
+                                                    ord('ÿ') + 1))
+    cs = bs[:]  # the bytes listed above map to the code point with the same value
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)  # every other byte gets the next code point from 256 upward
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def get_pairs(word):
+    """Return set of symbol pairs in a word.
+    Word is represented as tuple of symbols (symbols being variable-length strings).
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+
+def basic_clean(text):
+    text = ftfy.fix_text(text)
+    text = html.unescape(html.unescape(text))
+    return text.strip()
+
+
+def whitespace_clean(text):
+    text = re.sub(r'\s+', ' ', text)
+    text = text.strip()
+    return text
+
+
+class SimpleTokenizer(object):
+
+    def __init__(self, bpe_path: str = default_bpe()):
+        # Read the gzipped BPE merge list at `bpe_path` and build the vocab tables.
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        with gzip.open(bpe_path) as bpe_file:  # close the handle once read
+            merges = bpe_file.read().decode('utf-8').split('\n')
+        merges = [tuple(m.split()) for m in merges[1:49152 - 256 - 2 + 1]]
+        vocab = list(bytes_to_unicode().values())
+        vocab = vocab + [v + '</w>' for v in vocab]
+        vocab.extend(''.join(merge) for merge in merges)
+        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
+        self.encoder = dict(zip(vocab, range(len(vocab))))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {
+            '<|startoftext|>': '<|startoftext|>',
+            '<|endoftext|>': '<|endoftext|>'
+        }
+        self.pat = re.compile(
+            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
+            re.IGNORECASE)
+
+    def bpe(self, token):
+        """Apply the learned BPE merges to `token`; returns space-joined symbols."""
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token[:-1]) + (token[-1] + '</w>', )
+        pairs = get_pairs(word)
+        if not pairs:
+            return token + '</w>'
+
+        # repeatedly merge the best-ranked adjacent pair until no known pair is left
+        while True:
+            bigram = min(
+                pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except ValueError:
+                    # `first` does not occur at or after i: copy the tail unchanged
+                    new_word.extend(word[i:])
+                    break
+
+                if word[i] == first and i < len(word) - 1 and word[
+                        i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        self.cache[token] = word
+        return word
+
+    def encode(self, text):
+        bpe_tokens = []
+        text = whitespace_clean(basic_clean(text)).lower()
+        for token in re.findall(self.pat, text):
+            token = ''.join(self.byte_encoder[b]
+                            for b in token.encode('utf-8'))
+            bpe_tokens.extend(self.encoder[bpe_token]
+                              for bpe_token in self.bpe(token).split(' '))
+        return bpe_tokens
+
+    def decode(self, tokens):
+        text = ''.join([self.decoder[token] for token in tokens])
+        text = bytearray([self.byte_decoder[c] for c in text]).decode(
+            'utf-8', errors='replace').replace('</w>', ' ')
+        return text
+
+
+def tokenize(tokenizer,
+             texts,
+             context_length: int = 77,
+             truncate: bool = False) -> torch.LongTensor:
+    """
+    Returns the tokenized representation of given input string(s)
+    Parameters
+    ----------
+    texts : Union[str, List[str]]
+        An input string or a list of input strings to tokenize
+    context_length : int
+        The context length to use; all CLIP models use 77 as the context length
+    truncate: bool
+        Whether to truncate the text in case its encoding is longer than the context length
+    Returns
+    -------
+    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
+    """
+    if isinstance(texts, str):
+        texts = [texts]
+
+    sot_token = tokenizer.encoder['<|startoftext|>']
+    eot_token = tokenizer.encoder['<|endoftext|>']
+    all_tokens = [[sot_token] + tokenizer.encode(text) + [eot_token]
+                  for text in texts]
+    result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
+
+    for i, tokens in enumerate(all_tokens):
+        if len(tokens) > context_length:
+            if truncate:
+                tokens = tokens[:context_length]
+                tokens[-1] = eot_token
+            else:
+                raise RuntimeError(
+                    f'Input {texts[i]} is too long for context length {context_length}'
+                )
+        result[i, :len(tokens)] = torch.tensor(tokens)
+
+    return result
diff --git a/modelscope/models/cv/skin_retouching/detection_model/detection_module.py b/modelscope/models/cv/skin_retouching/detection_model/detection_module.py
index f89ce37b..5db9c44c 100644
--- a/modelscope/models/cv/skin_retouching/detection_model/detection_module.py
+++ b/modelscope/models/cv/skin_retouching/detection_model/detection_module.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/models/cv/skin_retouching/detection_model/detection_unet_in.py b/modelscope/models/cv/skin_retouching/detection_model/detection_unet_in.py
index b48f6e5f..c0be1a52 100644
--- a/modelscope/models/cv/skin_retouching/detection_model/detection_unet_in.py
+++ b/modelscope/models/cv/skin_retouching/detection_model/detection_unet_in.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/modelscope/models/cv/skin_retouching/inpainting_model/gconv.py b/modelscope/models/cv/skin_retouching/inpainting_model/gconv.py
index e0910d2c..8b3eb2fc 100644
--- a/modelscope/models/cv/skin_retouching/inpainting_model/gconv.py
+++ b/modelscope/models/cv/skin_retouching/inpainting_model/gconv.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/models/cv/skin_retouching/inpainting_model/inpainting_unet.py b/modelscope/models/cv/skin_retouching/inpainting_model/inpainting_unet.py
index 09cea1fc..dd220dd6 100644
--- a/modelscope/models/cv/skin_retouching/inpainting_model/inpainting_unet.py
+++ b/modelscope/models/cv/skin_retouching/inpainting_model/inpainting_unet.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/modelscope/models/cv/skin_retouching/retinaface/box_utils.py b/modelscope/models/cv/skin_retouching/retinaface/box_utils.py
index 89cf8bf6..a4aeffd1 100644
--- a/modelscope/models/cv/skin_retouching/retinaface/box_utils.py
+++ b/modelscope/models/cv/skin_retouching/retinaface/box_utils.py
@@ -6,7 +6,8 @@ import torch
 
 
 def point_form(boxes: torch.Tensor) -> torch.Tensor:
-    """Convert prior_boxes to (x_min, y_min, x_max, y_max) representation for comparison to point form ground truth data.
+    """Convert prior_boxes to (x_min, y_min, x_max, y_max) representation for comparison to point form \
+       ground truth data.
 
     Args:
         boxes: center-size default boxes from priorbox layers.
diff --git a/modelscope/models/cv/skin_retouching/unet_deploy.py b/modelscope/models/cv/skin_retouching/unet_deploy.py
index cb37b04c..0ff75b85 100755
--- a/modelscope/models/cv/skin_retouching/unet_deploy.py
+++ b/modelscope/models/cv/skin_retouching/unet_deploy.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import warnings
 
 import torch
diff --git a/modelscope/models/cv/skin_retouching/utils.py b/modelscope/models/cv/skin_retouching/utils.py
index 12653f41..eb0da6b9 100644
--- a/modelscope/models/cv/skin_retouching/utils.py
+++ b/modelscope/models/cv/skin_retouching/utils.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import time
 from typing import Dict, List, Optional, Tuple, Union
 
diff --git a/modelscope/models/cv/skin_retouching/weights_init.py b/modelscope/models/cv/skin_retouching/weights_init.py
index efd24843..ae62d4a4 100644
--- a/modelscope/models/cv/skin_retouching/weights_init.py
+++ b/modelscope/models/cv/skin_retouching/weights_init.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/models/cv/text_driven_segmentation/__init__.py b/modelscope/models/cv/text_driven_segmentation/__init__.py
new file mode 100644
index 00000000..aefaa698
--- /dev/null
+++ b/modelscope/models/cv/text_driven_segmentation/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .lseg_base import TextDrivenSegmentation
diff --git a/modelscope/models/cv/text_driven_segmentation/clip.py b/modelscope/models/cv/text_driven_segmentation/clip.py
new file mode 100644
index 00000000..1cec5f39
--- /dev/null
+++ b/modelscope/models/cv/text_driven_segmentation/clip.py
@@ -0,0 +1,169 @@
+#  CLIP
+# Adapted from https://github.com/openai/CLIP.
+# Originally MIT License, Copyright (c) 2021 OpenAI.
+
+import hashlib
+import os
+import urllib
+import warnings
+from typing import Any, List, Union
+
+import torch
+from PIL import Image
+from pkg_resources import packaging
+from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize,
+                                    ToTensor)
+from tqdm import tqdm
+
+from .model import build_model
+from .simple_tokenizer import SimpleTokenizer as _Tokenizer
+
+try:
+    from torchvision.transforms import InterpolationMode
+    BICUBIC = InterpolationMode.BICUBIC
+except ImportError:
+    BICUBIC = Image.BICUBIC  # InterpolationMode unavailable: use PIL's value
+
+if packaging.version.parse(
+        torch.__version__) < packaging.version.parse('1.7.1'):
+    warnings.warn('PyTorch version 1.7.1 or higher is recommended')
+__all__ = ['load', 'tokenize']  # names exported by `from ... import *`
+
+
+def _convert_image_to_rgb(image):
+    return image.convert('RGB')  # delegates to the image's own convert()
+
+
+def _transform(n_px):
+    return Compose([  # the steps run in the order listed
+        Resize(n_px, interpolation=BICUBIC),
+        CenterCrop(n_px),
+        _convert_image_to_rgb,
+        ToTensor(),
+        Normalize((0.48145466, 0.4578275, 0.40821073),  # per-channel mean
+                  (0.26862954, 0.26130258, 0.27577711)),  # per-channel std
+    ])
+
+
+def load(name: str,
+         device: Union[str, torch.device] = 'cuda'
+         if torch.cuda.is_available() else 'cpu',
+         jit: bool = False,
+         root: str = None):
+    """Create the CLIP model and its matching image transform.
+
+    With ``jit=False`` (default) the model comes from ``build_model()``.
+    With ``jit=True`` ``name`` must be the path of a TorchScript archive,
+    which is loaded and patched for ``device``. ``root`` is unused.
+    Returns a ``(model, transform)`` tuple.
+    """
+    if not jit:
+        model = build_model().to(device)
+        if str(device) == 'cpu':
+            model.float()
+        return model, _transform(model.visual.input_resolution)
+
+    model = torch.jit.load(name, map_location=device).eval()
+    # patch the device names
+    device_holder = torch.jit.trace(
+        lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
+    device_node = [
+        n for n in device_holder.graph.findAllNodes('prim::Constant')
+        if 'Device' in repr(n)
+    ][-1]
+
+    def patch_device(module):
+        try:
+            graphs = [module.graph] if hasattr(module, 'graph') else []
+        except RuntimeError:
+            graphs = []
+        if hasattr(module, 'forward1'):
+            graphs.append(module.forward1.graph)
+        for graph in graphs:
+            for node in graph.findAllNodes('prim::Constant'):
+                if 'value' in node.attributeNames() and str(
+                        node['value']).startswith('cuda'):
+                    node.copyAttributes(device_node)
+
+    model.apply(patch_device)
+    patch_device(model.encode_image)
+    patch_device(model.encode_text)
+
+    # patch dtype to float32 on CPU
+    if str(device) == 'cpu':
+        float_holder = torch.jit.trace(
+            lambda: torch.ones([]).float(), example_inputs=[])
+        float_input = list(float_holder.graph.findNode('aten::to').inputs())[1]
+        float_node = float_input.node()
+
+        def patch_float(module):
+            try:
+                graphs = [module.graph] if hasattr(module, 'graph') else []
+            except RuntimeError:
+                graphs = []
+            if hasattr(module, 'forward1'):
+                graphs.append(module.forward1.graph)
+            for graph in graphs:
+                for node in graph.findAllNodes('aten::to'):
+                    inputs = list(node.inputs())
+                    # dtype can be the second or third argument to aten::to()
+                    for i in [1, 2]:
+                        if inputs[i].node()['value'] == 5:
+                            inputs[i].node().copyAttributes(float_node)
+        model.apply(patch_float)
+        patch_float(model.encode_image)
+        patch_float(model.encode_text)
+        model.float()
+
+    return model, _transform(model.input_resolution.item())
+
+
+def tokenize(
+        _tokenizer,
+        texts: Union[str, List[str]],
+        context_length: int = 77,
+        truncate: bool = False) -> Union[torch.IntTensor, torch.LongTensor]:
+    """
+    Returns the tokenized representation of given input string(s)
+
+    Parameters
+    ----------
+    _tokenizer : object providing ``encoder`` (a mapping) and ``encode(text)``
+        Supplies the start/end marker ids and the token ids of each string
+    texts : Union[str, List[str]]
+        An input string or a list of input strings to tokenize
+    context_length : int
+        The context length to use; all CLIP models use 77 as the context length
+    truncate: bool
+        If True, over-long inputs are cut to context_length and end with the
+        end-of-text id; if False, they raise RuntimeError
+    Returns
+    -------
+    A [number of input strings, context_length] tensor, zero-padded on the right:
+    torch.long for torch < 1.8.0 (older index_select needs long), else torch.int.
+    """
+    if isinstance(texts, str):
+        texts = [texts]
+
+    sot_token = _tokenizer.encoder['<|startoftext|>']
+    eot_token = _tokenizer.encoder['<|endoftext|>']
+    all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token]
+                  for text in texts]
+    if packaging.version.parse(
+            torch.__version__) < packaging.version.parse('1.8.0'):
+        result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
+    else:
+        result = torch.zeros(len(all_tokens), context_length, dtype=torch.int)
+
+    for i, tokens in enumerate(all_tokens):
+        if len(tokens) > context_length:
+            if truncate:
+                tokens = tokens[:context_length]
+                tokens[-1] = eot_token  # keep the end marker after cutting
+            else:
+                raise RuntimeError(
+                    f'Input {texts[i]} is too long for context length {context_length}'
+                )
+        result[i, :len(tokens)] = torch.tensor(tokens)
+
+    return result
diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_base.py b/modelscope/models/cv/text_driven_segmentation/lseg_base.py
new file mode 100644
index 00000000..c79861a7
--- /dev/null
+++ b/modelscope/models/cv/text_driven_segmentation/lseg_base.py
@@ -0,0 +1,26 @@
+# Adapted from https://github.com/isl-org/lang-seg.
+# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
+
+import torch
+import torch.nn as nn
+
+from .lseg_net import LSeg
+
+
+class TextDrivenSegmentation(nn.Module):
+    """Runs LSeg per sample with the label set ['others', <sample text>]."""
+
+    def __init__(self, model_dir):
+        super().__init__()
+        self.model_dir = model_dir
+        self.net = LSeg(model_dir=model_dir)
+
+    def forward(self, img, txt_list):
+        """Return the per-sample LSeg outputs concatenated along dim 0."""
+        per_sample = []
+        for idx in range(img.size()[0]):
+            # idx:idx + 1 slicing keeps the leading batch dimension.
+            sample_out = self.net(
+                img[idx:idx + 1], labelset=['others', txt_list[idx]])
+            per_sample.append(sample_out)
+        return torch.cat(per_sample, dim=0)
diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py b/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py
new file mode 100644
index 00000000..56d4a65d
--- /dev/null
+++ b/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py
@@ -0,0 +1,332 @@
+# Adapted from https://github.com/isl-org/lang-seg.
+# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
+
+import torch
+import torch.nn as nn
+
+from .lseg_vit import _make_pretrained_clip_vitl16_384, forward_vit
+
+
+def _make_encoder(
+    backbone,
+    features,
+    use_pretrained=True,
+    groups=1,
+    expand=False,
+    exportable=True,
+    hooks=None,
+    use_vit_only=False,
+    use_readout='ignore',
+    enable_attention_hooks=False,
+):
+    """Return ``(clip_pretrained, pretrained, scratch)`` for ``backbone``.
+
+    Only 'clip_vitl16_384' is implemented; any other value raises
+    NotImplementedError. ``exportable`` and ``use_vit_only`` are unused.
+    """
+    if backbone != 'clip_vitl16_384':
+        raise NotImplementedError(f"Backbone '{backbone}' not implemented")
+    clip_pretrained, pretrained = _make_pretrained_clip_vitl16_384(
+        use_pretrained,
+        hooks=hooks,
+        use_readout=use_readout,
+        enable_attention_hooks=enable_attention_hooks)
+    scratch = _make_scratch(
+        [256, 512, 1024, 1024], features, groups=groups, expand=expand)
+    return clip_pretrained, pretrained, scratch
+
+
+def _make_scratch(in_shape, out_shape, groups=1, expand=False):
+    scratch = nn.Module()  # empty container; the convs become its attributes
+    # Output channels per level: out_shape everywhere unless expand is True.
+    out_shape1 = out_shape
+    out_shape2 = out_shape
+    out_shape3 = out_shape
+    out_shape4 = out_shape
+    if expand is True:
+        out_shape1 = out_shape
+        out_shape2 = out_shape * 2
+        out_shape3 = out_shape * 4
+        out_shape4 = out_shape * 8
+    # One bias-free 3x3 conv (stride 1, padding 1) per entry of in_shape.
+    scratch.layer1_rn = nn.Conv2d(
+        in_shape[0],
+        out_shape1,
+        kernel_size=3,
+        stride=1,
+        padding=1,
+        bias=False,
+        groups=groups,
+    )
+    scratch.layer2_rn = nn.Conv2d(
+        in_shape[1],
+        out_shape2,
+        kernel_size=3,
+        stride=1,
+        padding=1,
+        bias=False,
+        groups=groups,
+    )
+    scratch.layer3_rn = nn.Conv2d(
+        in_shape[2],
+        out_shape3,
+        kernel_size=3,
+        stride=1,
+        padding=1,
+        bias=False,
+        groups=groups,
+    )
+    scratch.layer4_rn = nn.Conv2d(
+        in_shape[3],
+        out_shape4,
+        kernel_size=3,
+        stride=1,
+        padding=1,
+        bias=False,
+        groups=groups,
+    )
+
+    return scratch
+
+
+class Interpolate(nn.Module):
+    """Module wrapper around nn.functional.interpolate with fixed options."""
+
+    def __init__(self, scale_factor, mode, align_corners=False):
+        """Store the options forwarded to nn.functional.interpolate.
+        Args:
+            scale_factor (float): scaling
+            mode (str): interpolation mode
+            align_corners (bool): passed through as ``align_corners``
+        """
+        super(Interpolate, self).__init__()
+
+        self.interp = nn.functional.interpolate
+        self.scale_factor = scale_factor
+        self.mode = mode
+        self.align_corners = align_corners
+
+    def forward(self, x):
+        """Forward pass.
+
+        Args:
+            x (tensor): input
+
+        Returns:
+            tensor: interpolated data
+        """
+
+        x = self.interp(
+            x,
+            scale_factor=self.scale_factor,
+            mode=self.mode,
+            align_corners=self.align_corners,
+        )
+
+        return x
+
+
+class ResidualConvUnit(nn.Module):
+    """Residual convolution module: relu, conv, relu, conv, then a skip add."""
+
+    def __init__(self, features):
+        """Init.
+
+        Args:
+            features (int): number of features
+        """
+        super().__init__()
+
+        self.conv1 = nn.Conv2d(
+            features, features, kernel_size=3, stride=1, padding=1, bias=True)
+
+        self.conv2 = nn.Conv2d(
+            features, features, kernel_size=3, stride=1, padding=1, bias=True)
+
+        self.relu = nn.ReLU(inplace=True)  # shared by both activations below
+
+    def forward(self, x):
+        """Forward pass.
+
+        Args:
+            x (tensor): input; overwritten in place by the first ReLU
+
+        Returns:
+            tensor: output
+        """
+        out = self.relu(x)
+        out = self.conv1(out)
+        out = self.relu(out)
+        out = self.conv2(out)
+        # NOTE: self.relu is in-place, so x already holds relu(input) here.
+        return out + x
+
+
+class FeatureFusionBlock(nn.Module):
+    """Feature fusion block."""
+
+    def __init__(self, features):
+        """Init.
+
+        Args:
+            features (int): number of features
+        """
+        super(FeatureFusionBlock, self).__init__()
+
+        self.resConfUnit1 = ResidualConvUnit(features)
+        self.resConfUnit2 = ResidualConvUnit(features)
+
+    def forward(self, *xs):
+        """Fuse one or two inputs, then upsample by 2 (bilinear).
+        With two inputs, resConfUnit1(xs[1]) is added to xs[0] in place.
+        Returns:
+            tensor: output
+        """
+        output = xs[0]
+
+        if len(xs) == 2:
+            output += self.resConfUnit1(xs[1])  # in-place add: mutates xs[0]
+
+        output = self.resConfUnit2(output)
+
+        output = nn.functional.interpolate(
+            output, scale_factor=2, mode='bilinear', align_corners=True)
+
+        return output
+
+
+class ResidualConvUnit_custom(nn.Module):
+    """Residual convolution module with optional batch normalization."""
+
+    def __init__(self, features, activation, bn):
+        """Init: two 3x3 convs, each optionally followed by BatchNorm2d.
+        Args:
+            features (int): number of features
+            activation: callable applied before each conv; bn (bool): use BN
+        """
+        super().__init__()
+
+        self.bn = bn
+
+        self.groups = 1  # fixed; not configurable through the constructor
+
+        self.conv1 = nn.Conv2d(
+            features,
+            features,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=not self.bn,  # no conv bias when bn is truthy
+            groups=self.groups,
+        )
+
+        self.conv2 = nn.Conv2d(
+            features,
+            features,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=not self.bn,  # no conv bias when bn is truthy
+            groups=self.groups,
+        )
+
+        if self.bn is True:
+            self.bn1 = nn.BatchNorm2d(features)
+            self.bn2 = nn.BatchNorm2d(features)
+
+        self.activation = activation
+
+        self.skip_add = nn.quantized.FloatFunctional()
+
+    def forward(self, x):
+        """Forward pass.
+
+        Args:
+            x (tensor): input
+
+        Returns:
+            tensor: output
+        """
+
+        out = self.activation(x)
+        out = self.conv1(out)
+        if self.bn is True:
+            out = self.bn1(out)
+
+        out = self.activation(out)
+        out = self.conv2(out)
+        if self.bn is True:
+            out = self.bn2(out)
+        # NOTE(review): groups is always 1 and conv_merge is never defined here.
+        if self.groups > 1:
+            out = self.conv_merge(out)
+
+        return self.skip_add.add(out, x)
+
+
+class FeatureFusionBlock_custom(nn.Module):
+    """Feature fusion block: residual units, 2x upsampling and a 1x1 conv."""
+
+    def __init__(
+        self,
+        features,
+        activation,
+        deconv=False,
+        bn=False,
+        expand=False,
+        align_corners=True,
+    ):
+        """Init.
+        Args:
+            features (int): number of features
+            activation, bn: forwarded to both ResidualConvUnit_custom units
+        """
+        super(FeatureFusionBlock_custom, self).__init__()
+
+        self.deconv = deconv  # stored only; not read anywhere in this class
+        self.align_corners = align_corners
+
+        self.groups = 1
+
+        self.expand = expand
+        out_features = features
+        if self.expand is True:
+            out_features = features // 2  # halve the output channels
+
+        self.out_conv = nn.Conv2d(
+            features,
+            out_features,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True,
+            groups=1,
+        )
+
+        self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
+        self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
+
+        self.skip_add = nn.quantized.FloatFunctional()
+
+    def forward(self, *xs):
+        """Fuse one or two inputs, upsample by 2, then apply out_conv.
+
+        Returns:
+            tensor: output
+        """
+        output = xs[0]
+
+        if len(xs) == 2:
+            res = self.resConfUnit1(xs[1])
+            output = self.skip_add.add(output, res)
+
+        output = self.resConfUnit2(output)
+
+        output = nn.functional.interpolate(
+            output,
+            scale_factor=2,
+            mode='bilinear',
+            align_corners=self.align_corners)
+
+        output = self.out_conv(output)
+        return output
diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_model.py b/modelscope/models/cv/text_driven_segmentation/lseg_model.py
new file mode 100644
index 00000000..9a5754c6
--- /dev/null
+++ b/modelscope/models/cv/text_driven_segmentation/lseg_model.py
@@ -0,0 +1,109 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os.path as osp
+from typing import Any, Dict
+
+import json
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from PIL import Image
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.text_driven_segmentation import \
+    TextDrivenSegmentation
+from modelscope.outputs import OutputKeys
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+__all__ = ['TextDrivenSeg']
+
+
+@MODELS.register_module(
+    Tasks.text_driven_segmentation,
+    module_name=Models.text_driven_segmentation)
+class TextDrivenSeg(TorchModel):
+    """Text driven segmentation model: wraps TextDrivenSegmentation and turns
+    an H x W x 3 image plus a text prompt into a 0/255 mask."""
+
+    def __init__(self, model_dir, device_id=0, *args, **kwargs):
+        super().__init__(
+            model_dir=model_dir, device_id=device_id, *args, **kwargs)
+        self.model = TextDrivenSegmentation(model_dir=model_dir)
+        # Load onto the CPU first; the model is moved to the GPU below.
+        pretrained_params = torch.load(
+            '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
+            map_location='cpu')
+        self.model.load_state_dict(pretrained_params)
+        self.model.eval()
+        if device_id >= 0 and torch.cuda.is_available():
+            self.model.to('cuda:{}'.format(device_id))
+            logger.info('Use GPU: {}'.format(device_id))
+        else:
+            device_id = -1
+            logger.info('Use CPU for inference')
+        self.device_id = device_id
+
+    def preprocess(self, img, size=640):
+        """Scale ``img`` by size / max(h, w), normalize it and zero-pad.
+
+        Returns (1 x 3 x size x size float tensor, h, w, crop_h, crop_w).
+        """
+        mean = [0.48145466, 0.4578275, 0.40821073]
+        std = [0.26862954, 0.26130258, 0.27577711]
+        h, w, c = img.shape
+        ratio = 1.0 * size / max(h, w)
+        crop_h, crop_w = int(ratio * h), int(ratio * w)
+        pil_img = Image.fromarray(img)
+        pil_img = pil_img.resize((crop_w, crop_h), Image.BILINEAR)
+        np_img = np.array(pil_img, dtype=np.float32) / 255.
+        for j in range(3):
+            np_img[:, :, j] = (np_img[:, :, j] - mean[j]) / std[j]
+        img_pad = np.zeros((size, size, 3), dtype=np.float32)
+        img_pad[:crop_h, :crop_w] = np_img
+        img_pad = torch.from_numpy(img_pad).permute(2, 0,
+                                                    1).unsqueeze(0).float()
+        return img_pad, h, w, crop_h, crop_w
+
+    def postprocess(self, tensors, crop_h, crop_w, ori_h, ori_w):
+        """Crop the padding, resize to (ori_h, ori_w), threshold to 0/255."""
+        output = np.clip(tensors * 255., a_min=0, a_max=255.)
+        crop_output = np.array(output[:crop_h, :crop_w], dtype=np.uint8)
+        pil_output = Image.fromarray(crop_output)
+        pil_output = pil_output.resize((ori_w, ori_h), Image.BILINEAR)
+        np_output = np.array(pil_output, dtype=np.uint8)
+        np_output[np_output < 128] = 0
+        np_output[np_output >= 128] = 255
+        return np_output
+
+    def forward(self, image, text):
+        """
+        image should be numpy array, dtype=np.uint8, shape: height*width*3
+        text is the prompt string; returns {OutputKeys.MASKS: uint8 mask}
+        """
+        image_tensor, ori_h, ori_w, crop_h, crop_w = self.preprocess(
+            image, size=640)
+        pred = self.inference(image_tensor, text)
+        msk = self.postprocess(pred, crop_h, crop_w, ori_h, ori_w)
+        return {OutputKeys.MASKS: msk}
+
+    def inference(self, image, text):
+        """
+        image should be tensor, 1 * 3 * 640 * 640
+        returns the per-pixel argmax label index of the first sample (numpy)
+        """
+        with torch.no_grad():
+            if self.device_id != -1:
+                image = image.to(torch.device('cuda', self.device_id))
+            output = self.model(image, [text])
+            output = F.interpolate(output, size=(640, 640), mode='bilinear')
+            output = F.softmax(output, dim=1)
+            output = torch.argmax(output, dim=1)
+            output = output[0]
+            pred = output.data.cpu().numpy()
+        return pred
diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_net.py b/modelscope/models/cv/text_driven_segmentation/lseg_net.py
new file mode 100644
index 00000000..541a4a38
--- /dev/null
+++ b/modelscope/models/cv/text_driven_segmentation/lseg_net.py
@@ -0,0 +1,195 @@
+# Adapted from https://github.com/isl-org/lang-seg.
+# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from . import clip
+from .lseg_blocks import (FeatureFusionBlock, FeatureFusionBlock_custom,
+                          Interpolate, _make_encoder, forward_vit)
+from .simple_tokenizer import SimpleTokenizer
+
+
+class depthwise_clipseg_conv(nn.Module):
+    """Applies one shared 1->1 channel 3x3 conv to every input channel."""
+
+    def __init__(self):
+        super().__init__()
+        self.depthwise = nn.Conv2d(1, 1, kernel_size=3, padding=1)
+
+    def depthwise_clipseg(self, x, channels):
+        # x[:, idx] drops the channel axis; unsqueeze(1) restores it as size 1.
+        per_channel = []
+        for idx in range(channels):
+            per_channel.append(self.depthwise(x[:, idx].unsqueeze(1)))
+        return torch.cat(per_channel, dim=1)
+
+    def forward(self, x):
+        return self.depthwise_clipseg(x, x.shape[1])
+
+
+class depthwise_conv(nn.Module):
+    """Shared single-channel conv applied independently to each channel."""
+
+    def __init__(self, kernel_size=3, stride=1, padding=1):
+        super().__init__()
+        self.depthwise = nn.Conv2d(
+            1, 1, kernel_size=kernel_size, stride=stride, padding=padding)
+
+    def forward(self, x):
+        # Expects a 4D NCHW tensor: fold N and C together so the 1->1 conv
+        # sees every channel as its own sample, then unfold again.
+        _, chans, height, width = x.shape
+        flat = self.depthwise(x.reshape(-1, 1, height, width))
+        return flat.view(-1, chans, height, width)
+
+
+class depthwise_block(nn.Module):
+    """depthwise_conv built from the given conv args, then an activation."""
+    def __init__(self, kernel_size=3, stride=1, padding=1, activation='relu'):
+        super(depthwise_block, self).__init__()
+        self.depthwise = depthwise_conv(kernel_size, stride, padding)
+        if activation == 'relu':
+            self.activation = nn.ReLU()
+        elif activation == 'lrelu':
+            self.activation = nn.LeakyReLU()
+        elif activation == 'tanh':
+            self.activation = nn.Tanh()
+
+    def forward(self, x, act=True):
+        x = self.depthwise(x)
+        if act:  # self.activation only exists for 'relu'/'lrelu'/'tanh'
+            x = self.activation(x)
+        return x
+
+
+class bottleneck_block(nn.Module):
+    """depthwise_conv plus the channel-wise max of the input, then activation."""
+    def __init__(self, kernel_size=3, stride=1, padding=1, activation='relu'):
+        super(bottleneck_block, self).__init__()
+        self.depthwise = depthwise_conv(kernel_size, stride, padding)
+        if activation == 'relu':
+            self.activation = nn.ReLU()
+        elif activation == 'lrelu':
+            self.activation = nn.LeakyReLU()
+        elif activation == 'tanh':
+            self.activation = nn.Tanh()
+
+    def forward(self, x, act=True):
+        sum_layer = x.max(dim=1, keepdim=True)[0]  # max over channels
+        x = self.depthwise(x)
+        x = x + sum_layer  # broadcast over the channel dimension
+        if act:  # self.activation only exists for 'relu'/'lrelu'/'tanh'
+            x = self.activation(x)
+        return x
+
+
+class BaseModel(torch.nn.Module):
+    """nn.Module base class that adds a checkpoint-loading helper."""
+
+    def load(self, path):
+        """Read ``path`` with torch.load on CPU and load it as a state dict.
+
+        A checkpoint containing an 'optimizer' key is treated as a wrapper
+        whose 'model' entry holds the state dict.
+        """
+        checkpoint = torch.load(path, map_location=torch.device('cpu'))
+        has_optimizer = 'optimizer' in checkpoint
+        state_dict = checkpoint['model'] if has_optimizer else checkpoint
+        self.load_state_dict(state_dict)
+
+
+def _make_fusion_block(features, use_bn):
+    """Build a FeatureFusionBlock_custom using a non-inplace ReLU."""
+    fixed_options = dict(deconv=False, expand=False, align_corners=True)
+    return FeatureFusionBlock_custom(
+        features,
+        activation=nn.ReLU(False),
+        bn=use_bn,
+        **fixed_options,
+    )
+
+
+class LSeg(BaseModel):
+    """Scores every pixel feature against CLIP text features of the labels."""
+    def __init__(
+        self,
+        features=256,
+        backbone='clip_vitl16_384',
+        readout='project',
+        use_bn=True,
+        model_dir=None,
+    ):
+        super(LSeg, self).__init__()
+        hooks = {
+            'clip_vitl16_384': [5, 11, 17, 23],
+        }
+
+        # Instantiate backbone and reassemble blocks
+        self.clip_pretrained, self.pretrained, self.scratch = _make_encoder(
+            backbone,
+            features,
+            groups=1,
+            expand=False,
+            exportable=False,
+            hooks=hooks[backbone],
+            use_readout=readout,
+        )
+
+        self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
+        self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
+        self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
+        self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
+        # NOTE(review): .exp() yields a plain tensor, not an nn.Parameter;
+        # forward() does not read logit_scale (it divides by self.tau).
+        self.logit_scale = nn.Parameter(torch.ones([])
+                                        * np.log(1 / 0.07)).exp()
+        self.out_c = 512
+        self.scratch.head1 = nn.Conv2d(features, self.out_c, kernel_size=1)
+
+        self.scratch.output_conv = nn.Sequential(
+            Interpolate(scale_factor=2, mode='bilinear', align_corners=True), )
+        # model_dir must be a str: the vocabulary path is built with `+`.
+        self.tau = 0.07
+        self.model_dir = model_dir
+        self.tokenizer = SimpleTokenizer(model_dir
+                                         + '/bpe_simple_vocab_16e6.txt.gz')
+
+    def forward(self, x, labelset=''):
+        text = clip.tokenize(self.tokenizer, labelset)
+        # forward_vit returns four tensors, one per scratch.layerN_rn conv.
+        layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)
+
+        layer_1_rn = self.scratch.layer1_rn(layer_1)
+        layer_2_rn = self.scratch.layer2_rn(layer_2)
+        layer_3_rn = self.scratch.layer3_rn(layer_3)
+        layer_4_rn = self.scratch.layer4_rn(layer_4)
+        # Fuse from level 4 down to level 1, feeding each result onward.
+        path_4 = self.scratch.refinenet4(layer_4_rn)
+        path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
+        path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
+        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
+
+        text = text.to(x.device)
+        text_features = self.clip_pretrained.encode_text(text)
+
+        image_features = self.scratch.head1(path_1)
+        # Flatten to one row of out_c values per spatial position.
+        imshape = image_features.shape
+        image_features = image_features.permute(0, 2, 3,
+                                                1).reshape(-1, self.out_c)
+
+        # normalized features
+        image_features = image_features / image_features.norm(
+            dim=-1, keepdim=True)
+        text_features = text_features / text_features.norm(
+            dim=-1, keepdim=True)
+        # Dot products of unit vectors, divided by the temperature tau.
+        logits_per_image = image_features @ text_features.t() / self.tau
+        # Back to (batch, number of labels, height, width).
+        out = logits_per_image.float().view(imshape[0], imshape[2], imshape[3],
+                                            -1).permute(0, 3, 1, 2)
+
+        out = self.scratch.output_conv(out)
+
+        return out
diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_vit.py b/modelscope/models/cv/text_driven_segmentation/lseg_vit.py
new file mode 100644
index 00000000..5298832f
--- /dev/null
+++ b/modelscope/models/cv/text_driven_segmentation/lseg_vit.py
@@ -0,0 +1,541 @@
+# Adapted from https://github.com/isl-org/lang-seg.
+# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
+
+import math
+import types
+
+import timm
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+
+from . import clip
+
+activations = {}
+
+
+def get_activation(name):
+    """Return a forward hook that saves the output in ``activations[name]``."""
+    def hook(model, input, output):
+        activations[name] = output
+
+    return hook
+
+
+attention = {}
+
+
+def get_attention(name):
+    """Return a forward hook that stores softmaxed q @ k^T in ``attention``."""
+    def hook(module, input, output):
+        x = input[0]
+        B, N, C = x.shape
+        qkv = (
+            module.qkv(x).reshape(B, N, 3, module.num_heads,
+                                  C // module.num_heads).permute(
+                                      2, 0, 3, 1, 4))
+        q, k, _ = (
+            qkv[0],
+            qkv[1],
+            qkv[2],
+        )  # make torchscript happy (cannot use tensor as tuple)
+        # scores are recomputed from the hooked module's input, not its output
+        attn = (q @ k.transpose(-2, -1)) * module.scale
+        # normalise over the last (key) dimension
+        attn = attn.softmax(dim=-1)  # [:,:,1,1:]
+        attention[name] = attn
+
+    return hook
+
+
+def get_mean_attention_map(attn, token, shape):
+    """Upsample row ``token`` of ``attn`` to ``shape[2:]`` and average dim 0."""
+    # NOTE(review): the divisor 16 is presumably the patch size; confirm
+    grid = torch.Size([shape[2] // 16, shape[3] // 16])
+    # drop column 0, then fold the remaining columns into the 2-D grid
+    token_attn = attn[:, :, token, 1:].unflatten(2, grid).float()
+    resized = torch.nn.functional.interpolate(
+        token_attn, size=shape[2:], mode='bicubic', align_corners=False)
+    # squeeze(0) only removes dim 0 when that dim has size 1
+    return torch.mean(resized.squeeze(0), 0)
+
+
+class Slice(nn.Module):
+    """Drop the first ``start_index`` entries along dim 1 of the input."""
+    def __init__(self, start_index=1):
+        super(Slice, self).__init__()
+        self.start_index = start_index
+
+    def forward(self, x):
+        return x[:, self.start_index:]
+
+
+class AddReadout(nn.Module):
+    """Add the readout token(s) onto every remaining token along dim 1."""
+    def __init__(self, start_index=1):
+        super().__init__()
+        self.start_index = start_index
+
+    def forward(self, x):
+        # with start_index == 2 the readout is the mean of tokens 0 and 1
+        two_tokens = self.start_index == 2
+        readout = (x[:, 0] + x[:, 1]) / 2 if two_tokens else x[:, 0]
+        tokens = x[:, self.start_index:]
+        return tokens + readout.unsqueeze(1)
+
+
+class ProjectReadout(nn.Module):
+    """Concatenate token 0 onto each remaining token, then project back."""
+    def __init__(self, in_features, start_index=1):
+        super().__init__()
+        self.start_index = start_index
+        # the Linear maps the doubled feature width back to in_features
+        self.project = nn.Sequential(
+            nn.Linear(2 * in_features, in_features), nn.GELU())
+
+    def forward(self, x):
+        tokens = x[:, self.start_index:]
+        readout = x[:, 0].unsqueeze(1).expand_as(tokens)
+        # join along the last dimension before projecting
+        return self.project(torch.cat((tokens, readout), -1))
+
+
+class Transpose(nn.Module):
+    """Module wrapper around ``Tensor.transpose(dim0, dim1)``."""
+
+    def __init__(self, dim0, dim1):
+        super().__init__()
+        self.dim0 = dim0
+        self.dim1 = dim1
+
+    def forward(self, x):
+        return x.transpose(self.dim0, self.dim1)
+
+
+def forward_vit(pretrained, x):
+    b, c, h, w = x.shape
+    # the return value is discarded; outputs are read from pretrained.activations
+    # encoder
+    _ = pretrained.model.forward_flex(x)
+
+    layer_1 = pretrained.activations['1']
+    layer_2 = pretrained.activations['2']
+    layer_3 = pretrained.activations['3']
+    layer_4 = pretrained.activations['4']
+    # apply stages 0-1 of each post-process Sequential
+    layer_1 = pretrained.act_postprocess1[0:2](layer_1)
+    layer_2 = pretrained.act_postprocess2[0:2](layer_2)
+    layer_3 = pretrained.act_postprocess3[0:2](layer_3)
+    layer_4 = pretrained.act_postprocess4[0:2](layer_4)
+    # stage 2 is skipped; this Unflatten uses the actual input size instead
+    unflatten = nn.Sequential(
+        nn.Unflatten(
+            2,
+            torch.Size([
+                h // pretrained.model.patch_size[1],
+                w // pretrained.model.patch_size[0],
+            ]),
+        ))
+    # only 3-D tensors are unflattened
+    if layer_1.ndim == 3:
+        layer_1 = unflatten(layer_1)
+    if layer_2.ndim == 3:
+        layer_2 = unflatten(layer_2)
+    if layer_3.ndim == 3:
+        layer_3 = unflatten(layer_3)
+    if layer_4.ndim == 3:
+        layer_4 = unflatten(layer_4)
+    # apply the remaining stages, from index 3 to the end
+    layer_1 = pretrained.act_postprocess1[3:len(pretrained.act_postprocess1)](
+        layer_1)
+    layer_2 = pretrained.act_postprocess2[3:len(pretrained.act_postprocess2)](
+        layer_2)
+    layer_3 = pretrained.act_postprocess3[3:len(pretrained.act_postprocess3)](
+        layer_3)
+    layer_4 = pretrained.act_postprocess4[3:len(pretrained.act_postprocess4)](
+        layer_4)
+
+    return layer_1, layer_2, layer_3, layer_4
+
+
+def _resize_pos_embed(self, posemb, gs_h, gs_w):
+    posemb_tok, posemb_grid = (
+        posemb[:, :self.start_index],
+        posemb[0, self.start_index:],
+    )
+    # NOTE: int(sqrt(...)) assumes the stored grid is square
+    gs_old = int(math.sqrt(len(posemb_grid)))
+    # reshape to (1, C, gs_old, gs_old) for F.interpolate, then flatten back
+    posemb_grid = posemb_grid.reshape(1, gs_old, gs_old,
+                                      -1).permute(0, 3, 1, 2)
+    posemb_grid = F.interpolate(
+        posemb_grid, size=(gs_h, gs_w), mode='bilinear')
+    posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
+    # re-attach the leading start_index embeddings, which are not resized
+    posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
+
+    return posemb
+
+
+def forward_flex(self, x):
+    b, c, h, w = x.shape
+    # resize the position embedding to this input's patch grid
+    pos_embed = self._resize_pos_embed(self.pos_embed, h // self.patch_size[1],
+                                       w // self.patch_size[0])
+
+    B = x.shape[0]
+    # patch_embed.backbone is optional; it runs first when present
+    if hasattr(self.patch_embed, 'backbone'):
+        x = self.patch_embed.backbone(x)
+        if isinstance(x, (list, tuple)):
+            x = x[
+                -1]  # last feature if backbone outputs list/tuple of features
+    x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)
+    # prepend the cls token (and the dist token when the model has one)
+    if getattr(self, 'dist_token', None) is not None:
+        cls_tokens = self.cls_token.expand(
+            B, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
+        dist_token = self.dist_token.expand(B, -1, -1)
+        x = torch.cat((cls_tokens, dist_token, x), dim=1)
+    else:
+        cls_tokens = self.cls_token.expand(
+            B, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
+        x = torch.cat((cls_tokens, x), dim=1)
+
+    x = x + pos_embed
+    x = self.pos_drop(x)
+    # gradient_checkpoint is hard-coded off, so the checkpoint branch is unused
+    gradient_checkpoint = False
+    for blk in self.blocks:
+        if gradient_checkpoint:
+            x = checkpoint.checkpoint(blk, x)
+        else:
+            x = blk(x)
+
+    x = self.norm(x)
+
+    return x
+
+
+def get_readout_oper(vit_features, features, use_readout, start_index=1):
+    """Build the list of readout modules, one entry per item in ``features``.
+
+    'ignore' and 'add' repeat one shared module instance; 'project' builds a
+    separate ProjectReadout per entry. Any other value raises ValueError.
+    """
+    if use_readout == 'ignore':
+        return [Slice(start_index)] * len(features)
+    if use_readout == 'add':
+        return [AddReadout(start_index)] * len(features)
+    if use_readout == 'project':
+        return [ProjectReadout(vit_features, start_index) for _ in features]
+    # raise instead of `assert False`: asserts are stripped under `python -O`
+    raise ValueError(
+        "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'")
+
+
+def adapt_input_conv(in_chans, conv_weight):
+    """Adapt a conv kernel of shape (O, I, J, K) to ``in_chans`` inputs.
+
+    The result is cast back to the dtype ``conv_weight`` arrived with.
+    """
+    orig_dtype = conv_weight.dtype
+    # Some weights are in torch.half, ensure it's float for sum on CPU
+    weight = conv_weight.float()
+    out_ch, in_ch, k_h, k_w = weight.shape
+    if in_chans == 1:
+        if in_ch > 3:
+            # For models with space2depth stems
+            assert in_ch % 3 == 0
+            weight = weight.reshape(out_ch, in_ch // 3, 3, k_h, k_w).sum(dim=2)
+        else:
+            weight = weight.sum(dim=1, keepdim=True)
+    elif in_chans != 3:
+        if in_ch != 3:
+            raise NotImplementedError(
+                'Weight format not supported by conversion.')
+        # Tile the RGB kernels to cover in_chans, then rescale by 3 / in_chans.
+        # Other combinations of the RGB weights may work better in some cases.
+        tiles = int(math.ceil(in_chans / 3))
+        weight = weight.repeat(1, tiles, 1, 1)[:, :in_chans, :, :]
+        weight = weight * (3 / float(in_chans))
+    return weight.to(orig_dtype)
+
+
+@torch.no_grad()
+def _load_weights(model, checkpoint_path, prefix=''):
+    """ Load weights from .npz checkpoints for official Google Brain Flax implementation
+    """
+    import numpy as np
+    # _n2p: numpy array -> torch tensor, permuting axes first when t is True
+    def _n2p(w, t=True):
+        if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
+            w = w.flatten()
+        if t:
+            if w.ndim == 4:
+                w = w.transpose([3, 2, 0, 1])
+            elif w.ndim == 3:
+                w = w.transpose([2, 0, 1])
+            elif w.ndim == 2:
+                w = w.transpose([1, 0])
+        return torch.from_numpy(w)
+    # the loaded archive is indexed by string keys below
+    w = np.load(checkpoint_path)
+    if not prefix and 'opt/target/embedding/kernel' in w:
+        prefix = 'opt/target/'
+
+    if hasattr(model.patch_embed, 'backbone'):
+        # hybrid
+        backbone = model.patch_embed.backbone
+        stem_only = not hasattr(backbone, 'stem')
+        stem = backbone if stem_only else backbone.stem
+        stem.conv.weight.copy_(
+            adapt_input_conv(stem.conv.weight.shape[1],
+                             _n2p(w[f'{prefix}conv_root/kernel'])))
+        stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale']))
+        stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias']))
+        if not stem_only:
+            for i, stage in enumerate(backbone.stages):
+                for j, block in enumerate(stage.blocks):
+                    bp = f'{prefix}block{i + 1}/unit{j + 1}/'
+                    for r in range(3):
+                        getattr(block, f'conv{r + 1}').weight.copy_(
+                            _n2p(w[f'{bp}conv{r + 1}/kernel']))
+                        getattr(block, f'norm{r + 1}').weight.copy_(
+                            _n2p(w[f'{bp}gn{r + 1}/scale']))
+                        getattr(block, f'norm{r + 1}').bias.copy_(
+                            _n2p(w[f'{bp}gn{r + 1}/bias']))
+                    if block.downsample is not None:
+                        block.downsample.conv.weight.copy_(
+                            _n2p(w[f'{bp}conv_proj/kernel']))
+                        block.downsample.norm.weight.copy_(
+                            _n2p(w[f'{bp}gn_proj/scale']))
+                        block.downsample.norm.bias.copy_(
+                            _n2p(w[f'{bp}gn_proj/bias']))
+        embed_conv_w = _n2p(w[f'{prefix}embedding/kernel'])
+    else:
+        embed_conv_w = adapt_input_conv(model.patch_embed.proj.weight.shape[1],
+                                        _n2p(w[f'{prefix}embedding/kernel']))
+    model.patch_embed.proj.weight.copy_(embed_conv_w)
+    model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias']))
+    model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False))
+    pos_embed_w = _n2p(
+        w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False)
+    if pos_embed_w.shape != model.pos_embed.shape:
+        pos_embed_w = resize_pos_embed(  # resize pos embedding when different size from pretrained weights
+            pos_embed_w, model.pos_embed, getattr(model, 'num_prefix_tokens',
+                                                  1),
+            model.patch_embed.grid_size)
+    model.pos_embed.copy_(pos_embed_w)
+    model.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale']))
+    model.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias']))
+    if isinstance(
+            model.head, nn.Linear
+    ) and model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]:
+        model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
+        model.head.bias.copy_(_n2p(w[f'{prefix}head/bias']))
+    # NOTE representation layer has been removed, not used in latest 21k/1k pretrained weights
+    # if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w:
+    #     model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel']))
+    #     model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias']))
+    for i, block in enumerate(model.blocks.children()):
+        block_prefix = f'{prefix}Transformer/encoderblock_{i}/'
+        mha_prefix = block_prefix + 'MultiHeadDotProductAttention_1/'
+        block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
+        block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
+        block.attn.qkv.weight.copy_(
+            torch.cat([
+                _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T
+                for n in ('query', 'key', 'value')
+            ]))
+        block.attn.qkv.bias.copy_(
+            torch.cat([
+                _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1)
+                for n in ('query', 'key', 'value')
+            ]))
+        block.attn.proj.weight.copy_(
+            _n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
+        block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
+        for r in range(2):
+            getattr(block.mlp, f'fc{r + 1}').weight.copy_(
+                _n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/kernel']))
+            getattr(block.mlp, f'fc{r + 1}').bias.copy_(
+                _n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/bias']))
+        block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/scale']))
+        block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/bias']))
+
+
+def resize_pos_embed(posemb, posemb_new, num_prefix_tokens=1, gs_new=()):
+    # Rescale the grid of position embeddings when loading from state_dict. Adapted from
+    # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224
+    grid_tokens = posemb_new.shape[1]
+    if num_prefix_tokens:
+        prefix = posemb[:, :num_prefix_tokens]
+        grid = posemb[0, num_prefix_tokens:]
+        grid_tokens -= num_prefix_tokens
+    else:
+        prefix, grid = posemb[:, :0], posemb[0]
+    # int(sqrt(...)) treats the stored grid as square
+    old_side = int(math.sqrt(len(grid)))
+    if not len(gs_new):  # backwards compatibility
+        gs_new = [int(math.sqrt(grid_tokens))] * 2
+    assert len(gs_new) >= 2
+    # (N, C) -> (1, C, old_side, old_side) so F.interpolate sees a 2-D map
+    grid = grid.reshape(1, old_side, old_side, -1).permute(0, 3, 1, 2)
+    grid = F.interpolate(
+        grid, size=gs_new, mode='bicubic', align_corners=False)
+    # back to a flat sequence of gs_new[0] * gs_new[1] embeddings
+    grid = grid.permute(0, 2, 3, 1).reshape(1, gs_new[0] * gs_new[1], -1)
+    return torch.cat([prefix, grid], dim=1)
+
+
+def _make_pretrained_clip_vitl16_384(pretrained,
+                                     use_readout='ignore',
+                                     hooks=None,
+                                     enable_attention_hooks=False):
+    clip_pretrained, _ = clip.load('ViT-B/32', device='cpu', jit=False)
+    # NOTE: the ``pretrained`` argument is never read; timm gets pretrained=False
+    # model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)
+    model = timm.create_model('vit_large_patch16_384', pretrained=False)
+    hooks = [5, 11, 17, 23] if hooks is None else hooks
+    pretrained = _make_vit_b16_backbone(
+        model,
+        features=[256, 512, 1024, 1024],
+        hooks=hooks,
+        vit_features=1024,
+        use_readout=use_readout,
+        enable_attention_hooks=enable_attention_hooks,
+    )
+    return clip_pretrained, pretrained
+
+
+def _make_vit_b16_backbone(
+    model,
+    features=[96, 192, 384, 768],
+    size=[384, 384],
+    hooks=[2, 5, 8, 11],
+    vit_features=768,
+    use_readout='ignore',
+    start_index=1,
+    enable_attention_hooks=False,
+):
+    pretrained = nn.Module()
+    # hook the four chosen transformer blocks to capture their outputs
+    pretrained.model = model
+    pretrained.model.blocks[hooks[0]].register_forward_hook(
+        get_activation('1'))
+    pretrained.model.blocks[hooks[1]].register_forward_hook(
+        get_activation('2'))
+    pretrained.model.blocks[hooks[2]].register_forward_hook(
+        get_activation('3'))
+    pretrained.model.blocks[hooks[3]].register_forward_hook(
+        get_activation('4'))
+    # NOTE(review): this is the module-level dict, shared by every backbone
+    pretrained.activations = activations
+
+    if enable_attention_hooks:
+        pretrained.model.blocks[hooks[0]].attn.register_forward_hook(
+            get_attention('attn_1'))
+        pretrained.model.blocks[hooks[1]].attn.register_forward_hook(
+            get_attention('attn_2'))
+        pretrained.model.blocks[hooks[2]].attn.register_forward_hook(
+            get_attention('attn_3'))
+        pretrained.model.blocks[hooks[3]].attn.register_forward_hook(
+            get_attention('attn_4'))
+        pretrained.attention = attention
+
+    readout_oper = get_readout_oper(vit_features, features, use_readout,
+                                    start_index)
+
+    # 32, 48, 136, 384
+    pretrained.act_postprocess1 = nn.Sequential(
+        readout_oper[0],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[0],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+        nn.ConvTranspose2d(
+            in_channels=features[0],
+            out_channels=features[0],
+            kernel_size=4,
+            stride=4,
+            padding=0,
+            bias=True,
+            dilation=1,
+            groups=1,
+        ),
+    )
+    # same layout as act_postprocess1, but the transposed conv uses stride 2
+    pretrained.act_postprocess2 = nn.Sequential(
+        readout_oper[1],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[1],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+        nn.ConvTranspose2d(
+            in_channels=features[1],
+            out_channels=features[1],
+            kernel_size=2,
+            stride=2,
+            padding=0,
+            bias=True,
+            dilation=1,
+            groups=1,
+        ),
+    )
+    # no further stage after the 1x1 conv
+    pretrained.act_postprocess3 = nn.Sequential(
+        readout_oper[2],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[2],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+    )
+    # the 1x1 conv is followed by a stride-2 3x3 conv
+    pretrained.act_postprocess4 = nn.Sequential(
+        readout_oper[3],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[3],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+        nn.Conv2d(
+            in_channels=features[3],
+            out_channels=features[3],
+            kernel_size=3,
+            stride=2,
+            padding=1,
+        ),
+    )
+
+    pretrained.model.start_index = start_index
+    pretrained.model.patch_size = [16, 16]
+
+    # We inject this function into the VisionTransformer instances so that
+    # we can use it with interpolated position embeddings without modifying the library source.
+    pretrained.model.forward_flex = types.MethodType(forward_flex,
+                                                     pretrained.model)
+    pretrained.model._resize_pos_embed = types.MethodType(
+        _resize_pos_embed, pretrained.model)
+
+    return pretrained
diff --git a/modelscope/models/cv/text_driven_segmentation/model.py b/modelscope/models/cv/text_driven_segmentation/model.py
new file mode 100644
index 00000000..f98d480d
--- /dev/null
+++ b/modelscope/models/cv/text_driven_segmentation/model.py
@@ -0,0 +1,456 @@
+# Adapted from https://github.com/isl-org/lang-seg.
+# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
+
+from collections import OrderedDict
+from typing import Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+class Bottleneck(nn.Module):
+    expansion = 4
+    # the block outputs planes * expansion channels
+    def __init__(self, inplanes, planes, stride=1):
+        super().__init__()
+
+        # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
+        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.relu1 = nn.ReLU(inplace=True)
+
+        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.relu2 = nn.ReLU(inplace=True)
+
+        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
+
+        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
+        self.relu3 = nn.ReLU(inplace=True)
+
+        self.downsample = None
+        self.stride = stride
+
+        if stride > 1 or inplanes != planes * Bottleneck.expansion:
+            # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
+            self.downsample = nn.Sequential(
+                OrderedDict([('-1', nn.AvgPool2d(stride)),
+                             ('0',
+                              nn.Conv2d(
+                                  inplanes,
+                                  planes * self.expansion,
+                                  1,
+                                  stride=1,
+                                  bias=False)),
+                             ('1', nn.BatchNorm2d(planes * self.expansion))]))
+
+    def forward(self, x: torch.Tensor):
+        identity = x
+        # main branch: 1x1 conv -> 3x3 conv -> avgpool (or identity) -> 1x1 conv
+        out = self.relu1(self.bn1(self.conv1(x)))
+        out = self.relu2(self.bn2(self.conv2(out)))
+        out = self.avgpool(out)
+        out = self.bn3(self.conv3(out))
+        # the shortcut is projected when stride > 1 or the channel counts differ
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu3(out)
+        return out
+
+
+class AttentionPool2d(nn.Module):
+    """Attention pooling: the mean token is the query over all positions."""
+    def __init__(self,
+                 spacial_dim: int,
+                 embed_dim: int,
+                 num_heads: int,
+                 output_dim: int = None):
+        super().__init__()
+        self.positional_embedding = nn.Parameter(
+            torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5)
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
+        self.num_heads = num_heads
+
+    def forward(self, x):
+        x = x.flatten(start_dim=2).permute(2, 0, 1)  # NCHW -> (HW)NC
+        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (HW+1)NC
+        x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (HW+1)NC
+        x, _ = F.multi_head_attention_forward(
+            query=x[:1],
+            key=x,
+            value=x,
+            embed_dim_to_check=x.shape[-1],
+            num_heads=self.num_heads,
+            q_proj_weight=self.q_proj.weight,
+            k_proj_weight=self.k_proj.weight,
+            v_proj_weight=self.v_proj.weight,
+            in_proj_weight=None,
+            in_proj_bias=torch.cat(
+                [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
+            bias_k=None,
+            bias_v=None,
+            add_zero_attn=False,
+            dropout_p=0,
+            out_proj_weight=self.c_proj.weight,
+            out_proj_bias=self.c_proj.bias,
+            use_separate_proj_weight=True,
+            training=self.training,
+            need_weights=False)
+        return x.squeeze(0)
+
+
+class ModifiedResNet(nn.Module):
+    """
+    A ResNet class that is similar to torchvision's but contains the following changes:
+    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
+    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
+    - The final pooling layer is a QKV attention instead of an average pool
+    """
+
+    def __init__(self,
+                 layers,
+                 output_dim,
+                 heads,
+                 input_resolution=224,
+                 width=64):
+        super().__init__()
+        self.output_dim = output_dim
+        self.input_resolution = input_resolution
+
+        # the 3-layer stem
+        self.conv1 = nn.Conv2d(
+            3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(width // 2)
+        self.relu1 = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(
+            width // 2, width // 2, kernel_size=3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(width // 2)
+        self.relu2 = nn.ReLU(inplace=True)
+        self.conv3 = nn.Conv2d(
+            width // 2, width, kernel_size=3, padding=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(width)
+        self.relu3 = nn.ReLU(inplace=True)
+        self.avgpool = nn.AvgPool2d(2)
+
+        # residual layers
+        self._inplanes = width  # this is a *mutable* variable used during construction
+        self.layer1 = self._make_layer(width, layers[0])
+        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
+        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
+        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
+
+        embed_dim = width * 32  # the ResNet feature dimension
+        self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim,
+                                        heads, output_dim)
+
+    def _make_layer(self, planes, blocks, stride=1):
+        layers = [Bottleneck(self._inplanes, planes, stride)]
+        # only the first block receives ``stride``; later blocks use the default
+        self._inplanes = planes * Bottleneck.expansion
+        for _ in range(1, blocks):
+            layers.append(Bottleneck(self._inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+
+        def stem(x):
+            x = self.relu1(self.bn1(self.conv1(x)))
+            x = self.relu2(self.bn2(self.conv2(x)))
+            x = self.relu3(self.bn3(self.conv3(x)))
+            x = self.avgpool(x)
+            return x
+        # cast the input to the dtype of the first stem conv's weights
+        x = x.type(self.conv1.weight.dtype)
+        x = stem(x)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        x = self.attnpool(x)
+
+        return x
+
+
+class LayerNorm(nn.LayerNorm):
+    """LayerNorm computed in float32, returned in the input's dtype."""
+
+    def forward(self, x: torch.Tensor):
+        # remember the caller's dtype so fp16 inputs come back as fp16
+        input_dtype = x.dtype
+        return super().forward(x.type(torch.float32)).type(input_dtype)
+
+
+class QuickGELU(nn.Module):
+    """Elementwise ``x * sigmoid(1.702 * x)`` activation."""
+    def forward(self, x: torch.Tensor):
+        return torch.sigmoid(1.702 * x) * x
+
+
+class ResidualAttentionBlock(nn.Module):
+    """Residual block: x + attn(ln_1(x)), followed by x + mlp(ln_2(x))."""
+    def __init__(self,
+                 d_model: int,
+                 n_head: int,
+                 attn_mask: torch.Tensor = None):
+        super().__init__()
+
+        self.attn = nn.MultiheadAttention(d_model, n_head)
+        self.ln_1 = LayerNorm(d_model)
+        self.mlp = nn.Sequential(
+            OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
+                         ('gelu', QuickGELU()),
+                         ('c_proj', nn.Linear(d_model * 4, d_model))]))
+        self.ln_2 = LayerNorm(d_model)
+        self.attn_mask = attn_mask
+    # the stored mask is re-cast to the input's dtype and device on every call
+    def attention(self, x: torch.Tensor):
+        self.attn_mask = self.attn_mask.to(
+            dtype=x.dtype,
+            device=x.device) if self.attn_mask is not None else None
+        return self.attn(
+            x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
+
+    def forward(self, x: torch.Tensor):
+        x = x + self.attention(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+
+
+class Transformer(nn.Module):
+    """Stack of ``layers`` ResidualAttentionBlocks applied in sequence."""
+    def __init__(self, width, layers, heads, attn_mask=None):
+        super().__init__()
+        self.width = width
+        self.layers = layers
+        self.resblocks = nn.Sequential(*[
+            ResidualAttentionBlock(width, heads, attn_mask)
+            for _ in range(layers)
+        ])
+
+    def forward(self, x: torch.Tensor):
+        return self.resblocks(x)
+
+
+class VisionTransformer(nn.Module):
+    """Patch-embedding transformer returning the projected token-0 feature."""
+    def __init__(self, input_resolution: int, patch_size: int, width: int,
+                 layers: int, heads: int, output_dim: int):
+        super().__init__()
+        self.input_resolution = input_resolution
+        self.output_dim = output_dim
+        self.conv1 = nn.Conv2d(
+            in_channels=3,
+            out_channels=width,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias=False)
+
+        scale = width**-0.5
+        self.class_embedding = nn.Parameter(scale * torch.randn(width))
+        self.positional_embedding = nn.Parameter(scale * torch.randn(
+            (input_resolution // patch_size)**2 + 1, width))
+        self.ln_pre = LayerNorm(width)
+
+        self.transformer = Transformer(width, layers, heads)
+
+        self.ln_post = LayerNorm(width)
+        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
+
+    def forward(self, x: torch.Tensor):
+        x = self.conv1(x)  # shape = [*, width, grid, grid]
+        x = x.reshape(x.shape[0], x.shape[1],
+                      -1)  # shape = [*, width, grid ** 2]
+        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+        x1 = self.class_embedding.to(x.dtype)
+        x2 = torch.zeros(
+            x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
+        x = torch.cat([x1 + x2, x], dim=1)  # shape = [*, grid ** 2 + 1, width]
+        x = x + self.positional_embedding.to(x.dtype)
+        x = self.ln_pre(x)
+
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        # keep only token 0, the slot where the class embedding was prepended
+        x = self.ln_post(x[:, 0, :])
+
+        if self.proj is not None:
+            x = x @ self.proj
+
+        return x
+
+
+class CLIP(nn.Module):
+    """Two-tower CLIP model: an image encoder plus a text transformer.
+
+    Both towers end in a projection to an ``embed_dim``-sized space, and
+    ``forward`` returns scaled cosine-similarity logits between every image
+    and every text in the batch.
+
+    Args:
+        embed_dim: Size of the joint image/text embedding.
+        image_resolution: Input resolution handed to the vision tower.
+        vision_layers: A tuple/list selects ``ModifiedResNet``; any other
+            value selects ``VisionTransformer`` and is used as its depth.
+        vision_width: Width of the vision tower.
+        vision_patch_size: Patch size (only used by ``VisionTransformer``).
+        context_length: Number of token positions of the text tower.
+        vocab_size: Number of rows of the token embedding table.
+        transformer_width: Hidden size of the text transformer.
+        transformer_heads: Attention heads of the text transformer.
+        transformer_layers: Depth of the text transformer.
+    """
+
+    def __init__(
+            self,
+            embed_dim: int,
+            # vision
+            image_resolution: int,
+            vision_layers: Union[Tuple[int, int, int, int], int],
+            vision_width: int,
+            vision_patch_size: int,
+            # text
+            context_length: int,
+            vocab_size: int,
+            transformer_width: int,
+            transformer_heads: int,
+            transformer_layers: int):
+        super().__init__()
+
+        # Needed by build_attention_mask(), which runs further down.
+        self.context_length = context_length
+
+        # The type of ``vision_layers`` picks the vision architecture.
+        if isinstance(vision_layers, (tuple, list)):
+            vision_heads = vision_width * 32 // 64
+            self.visual = ModifiedResNet(
+                layers=vision_layers,
+                output_dim=embed_dim,
+                heads=vision_heads,
+                input_resolution=image_resolution,
+                width=vision_width)
+        else:
+            vision_heads = vision_width // 64
+            self.visual = VisionTransformer(
+                input_resolution=image_resolution,
+                patch_size=vision_patch_size,
+                width=vision_width,
+                layers=vision_layers,
+                heads=vision_heads,
+                output_dim=embed_dim)
+
+        # Text tower; the mask is built once here and handed to Transformer.
+        self.transformer = Transformer(
+            width=transformer_width,
+            layers=transformer_layers,
+            heads=transformer_heads,
+            attn_mask=self.build_attention_mask())
+
+        self.vocab_size = vocab_size
+        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
+        # Allocated uninitialised; filled in by initialize_parameters().
+        self.positional_embedding = nn.Parameter(
+            torch.empty(self.context_length, transformer_width))
+        self.ln_final = LayerNorm(transformer_width)
+
+        # Allocated uninitialised; filled in by initialize_parameters().
+        self.text_projection = nn.Parameter(
+            torch.empty(transformer_width, embed_dim))
+        # Learnable log of the similarity scale, starting at log(1 / 0.07).
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+
+        self.initialize_parameters()
+
+    def initialize_parameters(self):
+        """Initialise embeddings, attention/MLP weights and projections."""
+        nn.init.normal_(self.token_embedding.weight, std=0.02)
+        nn.init.normal_(self.positional_embedding, std=0.01)
+
+        if isinstance(self.visual, ModifiedResNet):
+            if self.visual.attnpool is not None:
+                std = self.visual.attnpool.c_proj.in_features**-0.5
+                nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
+
+            # Zero every parameter whose name ends in 'bn3.weight'.
+            for resnet_block in [
+                    self.visual.layer1, self.visual.layer2, self.visual.layer3,
+                    self.visual.layer4
+            ]:
+                for name, param in resnet_block.named_parameters():
+                    if name.endswith('bn3.weight'):
+                        nn.init.zeros_(param)
+
+        # Standard deviations derived from the text transformer width/depth.
+        proj_std = (self.transformer.width**-0.5) * (
+            (2 * self.transformer.layers)**-0.5)
+        attn_std = self.transformer.width**-0.5
+        fc_std = (2 * self.transformer.width)**-0.5
+        for block in self.transformer.resblocks:
+            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
+            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
+            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
+            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
+
+        if self.text_projection is not None:
+            nn.init.normal_(
+                self.text_projection, std=self.transformer.width**-0.5)
+
+    def build_attention_mask(self):
+        """Return a (context_length, context_length) additive causal mask.
+
+        Entries strictly above the main diagonal are ``-inf``; the diagonal
+        and everything below it are 0.
+        """
+        # pytorch uses additive attention mask; fill with -inf
+        mask = torch.empty(self.context_length, self.context_length)
+        mask.fill_(float('-inf'))
+        mask.triu_(1)  # keep -inf above the diagonal, zero out the rest
+        return mask
+
+    @property
+    def dtype(self):
+        """Dtype of the vision tower's ``conv1`` weight."""
+        return self.visual.conv1.weight.dtype
+
+    def encode_image(self, image):
+        """Cast ``image`` to the model dtype and run the vision tower."""
+        return self.visual(image.type(self.dtype))
+
+    def encode_text(self, text):
+        """Encode token ids into one projected feature per sequence."""
+        x = self.token_embedding(text).type(self.dtype)
+        x = x + self.positional_embedding.type(self.dtype)
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.ln_final(x).type(self.dtype)
+        # Per sequence, take the feature at the position holding the largest
+        # token id, then project it.
+        # NOTE(review): presumably that position is the end-of-text token
+        # (highest id in the vocabulary) - confirm against the tokenizer.
+        x = x[torch.arange(x.shape[0]),
+              text.argmax(dim=-1)] @ self.text_projection
+        return x
+
+    def forward(self, image, text):
+        """Return ``(logits_per_image, logits_per_text)`` for the batch."""
+        image_features = self.encode_image(image)
+        text_features = self.encode_text(text)
+
+        # normalized features
+        image_features = image_features / image_features.norm(
+            dim=1, keepdim=True)
+        text_features = text_features / text_features.norm(dim=1, keepdim=True)
+
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ text_features.t()
+        logits_per_text = logits_per_image.t()
+
+        # shape = [global_batch_size, global_batch_size]
+        return logits_per_image, logits_per_text
+
+
+def convert_weights(model: nn.Module):
+    """Convert applicable model parameters to fp16"""
+
+    def _convert_weights_to_fp16(ll):
+        if isinstance(ll, (nn.Conv1d, nn.Conv2d, nn.Linear)):
+            ll.weight.data = ll.weight.data.half()
+            if ll.bias is not None:
+                ll.bias.data = ll.bias.data.half()
+
+        if isinstance(ll, nn.MultiheadAttention):
+            for attr in [
+                    *[f'{s}_proj_weight' for s in ['in', 'q', 'k', 'v']],
+                    'in_proj_bias', 'bias_k', 'bias_v'
+            ]:
+                tensor = getattr(ll, attr)
+                if tensor is not None:
+                    tensor.data = tensor.data.half()
+
+        for name in ['text_projection', 'proj']:
+            if hasattr(ll, name):
+                attr = getattr(ll, name)
+                if attr is not None:
+                    attr.data = attr.data.half()
+
+    model.apply(_convert_weights_to_fp16)
+
+
+def build_model():
+    model = CLIP(512, 224, 12, 768, 32, 77, 49408, 512, 8, 12)
+    convert_weights(model)
+    return model.eval()
diff --git a/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py b/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py
new file mode 100644
index 00000000..361d67c6
--- /dev/null
+++ b/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py
@@ -0,0 +1,155 @@
+# CLIP
+# Adapted from https://github.com/openai/CLIP.
+# Originally MIT License, Copyright (c) 2021 OpenAI.
+
+import gzip
+import html
+import os
+from functools import lru_cache
+
+import ftfy
+import regex as re
+
+
+@lru_cache()
+def default_bpe():
+    return os.path.join(
+        os.path.dirname(os.path.abspath(__file__)),
+        'bpe_simple_vocab_16e6.txt.gz')
+
+
+@lru_cache()
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a signficant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord('!'),
+                    ord('~') + 1)) + list(range(
+                        ord('¡'),
+                        ord('¬') + 1)) + list(range(ord('®'),
+                                                    ord('ÿ') + 1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def get_pairs(word):
+    """Return set of symbol pairs in a word.
+    Word is represented as tuple of symbols (symbols being variable-length strings).
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+
+def basic_clean(text):
+    text = ftfy.fix_text(text)
+    text = html.unescape(html.unescape(text))
+    return text.strip()
+
+
+def whitespace_clean(text):
+    text = re.sub(r'\s+', ' ', text)
+    text = text.strip()
+    return text
+
+
+class SimpleTokenizer(object):
+
+    def __init__(self, bpe_path: str = default_bpe()):
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        merges = gzip.open(bpe_path).read().decode('utf-8').split('\n')
+        merges = merges[1:49152 - 256 - 2 + 1]
+        merges = [tuple(merge.split()) for merge in merges]
+        vocab = list(bytes_to_unicode().values())
+        vocab = vocab + [v + '</w>' for v in vocab]
+        for merge in merges:
+            vocab.append(''.join(merge))
+        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
+        self.encoder = dict(zip(vocab, range(len(vocab))))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {
+            '<|startoftext|>': '<|startoftext|>',
+            '<|endoftext|>': '<|endoftext|>'
+        }
+        self.pat = re.compile(
+            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
+            re.IGNORECASE)
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token[:-1]) + (token[-1] + '</w>', )
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token + '</w>'
+
+        while True:
+            bigram = min(
+                pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            error_list = []
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except Exception as err:
+                    new_word.extend(word[i:])
+                    error_list.append(err)
+                    break
+
+                if word[i] == first and i < len(word) - 1 and word[
+                        i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        self.cache[token] = word
+        return word
+
+    def encode(self, text):
+        bpe_tokens = []
+        text = whitespace_clean(basic_clean(text)).lower()
+        for token in re.findall(self.pat, text):
+            token = ''.join(self.byte_encoder[b]
+                            for b in token.encode('utf-8'))
+            bpe_tokens.extend(self.encoder[bpe_token]
+                              for bpe_token in self.bpe(token).split(' '))
+        return bpe_tokens
+
+    def decode(self, tokens):
+        text = ''.join([self.decoder[token] for token in tokens])
+        text = bytearray([self.byte_decoder[c] for c in text]).decode(
+            'utf-8', errors='replace').replace('</w>', ' ')
+        return text
diff --git a/modelscope/models/cv/tinynas_detection/__init__.py b/modelscope/models/cv/tinynas_detection/__init__.py
new file mode 100644
index 00000000..13532d10
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/__init__.py
@@ -0,0 +1,24 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .tinynas_detector import Tinynas_detector
+
+else:
+    _import_structure = {
+        'tinynas_detector': ['TinynasDetector'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/tinynas_detection/backbone/__init__.py b/modelscope/models/cv/tinynas_detection/backbone/__init__.py
new file mode 100644
index 00000000..186d06a3
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/backbone/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import copy
+
+from .darknet import CSPDarknet
+from .tinynas import load_tinynas_net
+
+
+def build_backbone(cfg):
+    backbone_cfg = copy.deepcopy(cfg)
+    name = backbone_cfg.pop('name')
+    if name == 'CSPDarknet':
+        return CSPDarknet(**backbone_cfg)
+    elif name == 'TinyNAS':
+        return load_tinynas_net(backbone_cfg)
diff --git a/modelscope/models/cv/tinynas_detection/backbone/darknet.py b/modelscope/models/cv/tinynas_detection/backbone/darknet.py
new file mode 100644
index 00000000..d3294f0d
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/backbone/darknet.py
@@ -0,0 +1,126 @@
+# Copyright (c) Megvii Inc. All rights reserved.
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import torch
+from torch import nn
+
+from ..core.base_ops import (BaseConv, CSPLayer, DWConv, Focus, ResLayer,
+                             SPPBottleneck)
+
+
+class CSPDarknet(nn.Module):
+
+    def __init__(
+        self,
+        dep_mul,
+        wid_mul,
+        out_features=('dark3', 'dark4', 'dark5'),
+        depthwise=False,
+        act='silu',
+        reparam=False,
+    ):
+        super(CSPDarknet, self).__init__()
+        assert out_features, 'please provide output features of Darknet'
+        self.out_features = out_features
+        Conv = DWConv if depthwise else BaseConv
+
+        base_channels = int(wid_mul * 64)  # 64
+        base_depth = max(round(dep_mul * 3), 1)  # 3
+
+        # stem
+        # self.stem = Focus(3, base_channels, ksize=3, act=act)
+        self.stem = Focus(3, base_channels, 3, act=act)
+
+        # dark2
+        self.dark2 = nn.Sequential(
+            Conv(base_channels, base_channels * 2, 3, 2, act=act),
+            CSPLayer(
+                base_channels * 2,
+                base_channels * 2,
+                n=base_depth,
+                depthwise=depthwise,
+                act=act,
+                reparam=reparam,
+            ),
+        )
+
+        # dark3
+        self.dark3 = nn.Sequential(
+            Conv(base_channels * 2, base_channels * 4, 3, 2, act=act),
+            CSPLayer(
+                base_channels * 4,
+                base_channels * 4,
+                n=base_depth * 3,
+                depthwise=depthwise,
+                act=act,
+                reparam=reparam,
+            ),
+        )
+
+        # dark4
+        self.dark4 = nn.Sequential(
+            Conv(base_channels * 4, base_channels * 8, 3, 2, act=act),
+            CSPLayer(
+                base_channels * 8,
+                base_channels * 8,
+                n=base_depth * 3,
+                depthwise=depthwise,
+                act=act,
+                reparam=reparam,
+            ),
+        )
+
+        # dark5
+        self.dark5 = nn.Sequential(
+            Conv(base_channels * 8, base_channels * 16, 3, 2, act=act),
+            SPPBottleneck(
+                base_channels * 16, base_channels * 16, activation=act),
+            CSPLayer(
+                base_channels * 16,
+                base_channels * 16,
+                n=base_depth,
+                shortcut=False,
+                depthwise=depthwise,
+                act=act,
+                reparam=reparam,
+            ),
+        )
+
+    def init_weights(self, pretrain=None):
+
+        if pretrain is None:
+            return
+        else:
+            pretrained_dict = torch.load(
+                pretrain, map_location='cpu')['state_dict']
+            new_params = self.state_dict().copy()
+            for k, v in pretrained_dict.items():
+                ks = k.split('.')
+                if ks[0] == 'fc' or ks[-1] == 'total_ops' or ks[
+                        -1] == 'total_params':
+                    continue
+                else:
+                    new_params[k] = v
+
+            self.load_state_dict(new_params)
+            print(f' load pretrain backbone from {pretrain}')
+
+    def forward(self, x):
+        outputs = {}
+        x = self.stem(x)
+        outputs['stem'] = x
+        x = self.dark2(x)
+        outputs['dark2'] = x
+        x = self.dark3(x)
+        outputs['dark3'] = x
+        x = self.dark4(x)
+        outputs['dark4'] = x
+        x = self.dark5(x)
+        outputs['dark5'] = x
+        features_out = [
+            outputs['stem'], outputs['dark2'], outputs['dark3'],
+            outputs['dark4'], outputs['dark5']
+        ]
+
+        return features_out
diff --git a/modelscope/models/cv/tinynas_detection/backbone/tinynas.py b/modelscope/models/cv/tinynas_detection/backbone/tinynas.py
new file mode 100755
index 00000000..814ee550
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/backbone/tinynas.py
@@ -0,0 +1,347 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import torch
+import torch.nn as nn
+
+from ..core.base_ops import Focus, SPPBottleneck, get_activation
+from ..core.repvgg_block import RepVggBlock
+
+
+class ConvKXBN(nn.Module):
+
+    def __init__(self, in_c, out_c, kernel_size, stride):
+        super(ConvKXBN, self).__init__()
+        self.conv1 = nn.Conv2d(
+            in_c,
+            out_c,
+            kernel_size,
+            stride, (kernel_size - 1) // 2,
+            groups=1,
+            bias=False)
+        self.bn1 = nn.BatchNorm2d(out_c)
+
+    def forward(self, x):
+        return self.bn1(self.conv1(x))
+
+
+class ConvKXBNRELU(nn.Module):
+
+    def __init__(self, in_c, out_c, kernel_size, stride, act='silu'):
+        super(ConvKXBNRELU, self).__init__()
+        self.conv = ConvKXBN(in_c, out_c, kernel_size, stride)
+        if act is None:
+            self.activation_function = torch.relu
+        else:
+            self.activation_function = get_activation(act)
+
+    def forward(self, x):
+        output = self.conv(x)
+        return self.activation_function(output)
+
+
+class ResConvK1KX(nn.Module):
+
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 btn_c,
+                 kernel_size,
+                 stride,
+                 force_resproj=False,
+                 act='silu'):
+        super(ResConvK1KX, self).__init__()
+        self.stride = stride
+        self.conv1 = ConvKXBN(in_c, btn_c, 1, 1)
+        self.conv2 = RepVggBlock(
+            btn_c, out_c, kernel_size, stride, act='identity')
+
+        if act is None:
+            self.activation_function = torch.relu
+        else:
+            self.activation_function = get_activation(act)
+
+        if stride == 2:
+            self.residual_downsample = nn.AvgPool2d(kernel_size=2, stride=2)
+        else:
+            self.residual_downsample = nn.Identity()
+
+        if in_c != out_c or force_resproj:
+            self.residual_proj = ConvKXBN(in_c, out_c, 1, 1)
+        else:
+            self.residual_proj = nn.Identity()
+
+    def forward(self, x):
+        if self.stride != 2:
+            reslink = self.residual_downsample(x)
+            reslink = self.residual_proj(reslink)
+
+        output = x
+        output = self.conv1(output)
+        output = self.activation_function(output)
+        output = self.conv2(output)
+        if self.stride != 2:
+            output = output + reslink
+        output = self.activation_function(output)
+
+        return output
+
+
+class SuperResConvK1KX(nn.Module):
+
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 btn_c,
+                 kernel_size,
+                 stride,
+                 num_blocks,
+                 with_spp=False,
+                 act='silu'):
+        super(SuperResConvK1KX, self).__init__()
+        if act is None:
+            self.act = torch.relu
+        else:
+            self.act = get_activation(act)
+        self.block_list = nn.ModuleList()
+        for block_id in range(num_blocks):
+            if block_id == 0:
+                in_channels = in_c
+                out_channels = out_c
+                this_stride = stride
+                force_resproj = False  # as a part of CSPLayer, DO NOT need this flag
+                this_kernel_size = kernel_size
+            else:
+                in_channels = out_c
+                out_channels = out_c
+                this_stride = 1
+                force_resproj = False
+                this_kernel_size = kernel_size
+            the_block = ResConvK1KX(
+                in_channels,
+                out_channels,
+                btn_c,
+                this_kernel_size,
+                this_stride,
+                force_resproj,
+                act=act)
+            self.block_list.append(the_block)
+            if block_id == 0 and with_spp:
+                self.block_list.append(
+                    SPPBottleneck(out_channels, out_channels))
+
+    def forward(self, x):
+        output = x
+        for block in self.block_list:
+            output = block(output)
+        return output
+
+
+class ResConvKXKX(nn.Module):
+
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 btn_c,
+                 kernel_size,
+                 stride,
+                 force_resproj=False,
+                 act='silu'):
+        super(ResConvKXKX, self).__init__()
+        self.stride = stride
+        if self.stride == 2:
+            self.downsampler = ConvKXBNRELU(in_c, out_c, 3, 2, act=act)
+        else:
+            self.conv1 = ConvKXBN(in_c, btn_c, kernel_size, 1)
+            self.conv2 = RepVggBlock(
+                btn_c, out_c, kernel_size, stride, act='identity')
+
+            if act is None:
+                self.activation_function = torch.relu
+            else:
+                self.activation_function = get_activation(act)
+
+            if stride == 2:
+                self.residual_downsample = nn.AvgPool2d(
+                    kernel_size=2, stride=2)
+            else:
+                self.residual_downsample = nn.Identity()
+
+            if in_c != out_c or force_resproj:
+                self.residual_proj = ConvKXBN(in_c, out_c, 1, 1)
+            else:
+                self.residual_proj = nn.Identity()
+
+    def forward(self, x):
+        if self.stride == 2:
+            return self.downsampler(x)
+        reslink = self.residual_downsample(x)
+        reslink = self.residual_proj(reslink)
+
+        output = x
+        output = self.conv1(output)
+        output = self.activation_function(output)
+        output = self.conv2(output)
+
+        output = output + reslink
+        output = self.activation_function(output)
+
+        return output
+
+
+class SuperResConvKXKX(nn.Module):
+
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 btn_c,
+                 kernel_size,
+                 stride,
+                 num_blocks,
+                 with_spp=False,
+                 act='silu'):
+        super(SuperResConvKXKX, self).__init__()
+        if act is None:
+            self.act = torch.relu
+        else:
+            self.act = get_activation(act)
+        self.block_list = nn.ModuleList()
+        for block_id in range(num_blocks):
+            if block_id == 0:
+                in_channels = in_c
+                out_channels = out_c
+                this_stride = stride
+                force_resproj = False  # as a part of CSPLayer, DO NOT need this flag
+                this_kernel_size = kernel_size
+            else:
+                in_channels = out_c
+                out_channels = out_c
+                this_stride = 1
+                force_resproj = False
+                this_kernel_size = kernel_size
+            the_block = ResConvKXKX(
+                in_channels,
+                out_channels,
+                btn_c,
+                this_kernel_size,
+                this_stride,
+                force_resproj,
+                act=act)
+            self.block_list.append(the_block)
+            if block_id == 0 and with_spp:
+                self.block_list.append(
+                    SPPBottleneck(out_channels, out_channels))
+
+    def forward(self, x):
+        output = x
+        for block in self.block_list:
+            output = block(output)
+        return output
+
+
+class TinyNAS(nn.Module):
+
+    def __init__(self,
+                 structure_info=None,
+                 out_indices=[0, 1, 2, 4, 5],
+                 out_channels=[None, None, 128, 256, 512],
+                 with_spp=False,
+                 use_focus=False,
+                 need_conv1=True,
+                 act='silu'):
+        super(TinyNAS, self).__init__()
+        assert len(out_indices) == len(out_channels)
+        self.out_indices = out_indices
+        self.need_conv1 = need_conv1
+
+        self.block_list = nn.ModuleList()
+        if need_conv1:
+            self.conv1_list = nn.ModuleList()
+        for idx, block_info in enumerate(structure_info):
+            the_block_class = block_info['class']
+            if the_block_class == 'ConvKXBNRELU':
+                if use_focus:
+                    the_block = Focus(block_info['in'], block_info['out'],
+                                      block_info['k'])
+                else:
+                    the_block = ConvKXBNRELU(
+                        block_info['in'],
+                        block_info['out'],
+                        block_info['k'],
+                        block_info['s'],
+                        act=act)
+                self.block_list.append(the_block)
+            elif the_block_class == 'SuperResConvK1KX':
+                spp = with_spp if idx == len(structure_info) - 1 else False
+                the_block = SuperResConvK1KX(
+                    block_info['in'],
+                    block_info['out'],
+                    block_info['btn'],
+                    block_info['k'],
+                    block_info['s'],
+                    block_info['L'],
+                    spp,
+                    act=act)
+                self.block_list.append(the_block)
+            elif the_block_class == 'SuperResConvKXKX':
+                spp = with_spp if idx == len(structure_info) - 1 else False
+                the_block = SuperResConvKXKX(
+                    block_info['in'],
+                    block_info['out'],
+                    block_info['btn'],
+                    block_info['k'],
+                    block_info['s'],
+                    block_info['L'],
+                    spp,
+                    act=act)
+                self.block_list.append(the_block)
+            if need_conv1:
+                if idx in self.out_indices and out_channels[
+                        self.out_indices.index(idx)] is not None:
+                    self.conv1_list.append(
+                        nn.Conv2d(block_info['out'],
+                                  out_channels[self.out_indices.index(idx)],
+                                  1))
+                else:
+                    self.conv1_list.append(None)
+
+    def init_weights(self, pretrain=None):
+        pass
+
+    def forward(self, x):
+        output = x
+        stage_feature_list = []
+        for idx, block in enumerate(self.block_list):
+            output = block(output)
+            if idx in self.out_indices:
+                if self.need_conv1 and self.conv1_list[idx] is not None:
+                    true_out = self.conv1_list[idx](output)
+                    stage_feature_list.append(true_out)
+                else:
+                    stage_feature_list.append(output)
+        return stage_feature_list
+
+
+def load_tinynas_net(backbone_cfg):
+    # load masternet model to path
+    import ast
+
+    struct_str = ''.join([x.strip() for x in backbone_cfg.net_structure_str])
+    struct_info = ast.literal_eval(struct_str)
+    for layer in struct_info:
+        if 'nbitsA' in layer:
+            del layer['nbitsA']
+        if 'nbitsW' in layer:
+            del layer['nbitsW']
+
+    model = TinyNAS(
+        structure_info=struct_info,
+        out_indices=backbone_cfg.out_indices,
+        out_channels=backbone_cfg.out_channels,
+        with_spp=backbone_cfg.with_spp,
+        use_focus=backbone_cfg.use_focus,
+        act=backbone_cfg.act,
+        need_conv1=backbone_cfg.need_conv1,
+    )
+
+    return model
diff --git a/modelscope/models/cv/tinynas_detection/core/__init__.py b/modelscope/models/cv/tinynas_detection/core/__init__.py
new file mode 100644
index 00000000..3dad5e72
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/core/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
diff --git a/modelscope/models/cv/tinynas_detection/core/base_ops.py b/modelscope/models/cv/tinynas_detection/core/base_ops.py
new file mode 100644
index 00000000..62729ca2
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/core/base_ops.py
@@ -0,0 +1,474 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .repvgg_block import RepVggBlock
+
+
+class SiLU(nn.Module):
+    """SiLU spelled out with primitive ops: x * sigmoid(x)."""
+
+    @staticmethod
+    def forward(x):
+        return torch.mul(x, torch.sigmoid(x))
+
+
+def get_activation(name='silu', inplace=True):
+    # Map an activation name ('silu', 'relu' or 'lrelu') to a new module;
+    # any other name raises AttributeError.
+    if name == 'silu':
+        return nn.SiLU(inplace=inplace)
+    if name == 'relu':
+        return nn.ReLU(inplace=inplace)
+    if name == 'lrelu':
+        return nn.LeakyReLU(0.1, inplace=inplace)
+    raise AttributeError('Unsupported act type: {}'.format(name))
+
+
+def get_norm(name, out_channels, inplace=True):  # `inplace` is unused
+    if name == 'bn':
+        return nn.BatchNorm2d(out_channels)
+    if name == 'gn':  # GroupNorm with a fixed 32 groups
+        return nn.GroupNorm(num_channels=out_channels, num_groups=32)
+    raise AttributeError('Unsupported norm type: {}'.format(name))
+
+
+class BaseConv(nn.Module):
+    """Conv2d followed by an optional norm layer and optional activation."""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize,
+                 stride=1,
+                 groups=1,
+                 bias=False,
+                 act='silu',
+                 norm='bn'):
+        super().__init__()
+        # (ksize - 1) // 2 keeps the spatial size for odd ksize at stride 1
+        pad = (ksize - 1) // 2
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=ksize,
+            stride=stride,
+            padding=pad,
+            groups=groups,
+            bias=bias,
+        )
+        if norm is not None:
+            self.bn = get_norm(norm, out_channels, inplace=True)
+        if act is not None:
+            self.act = get_activation(act, inplace=True)
+        self.with_norm = norm is not None
+        self.with_act = act is not None
+
+    def forward(self, x):
+        x = self.conv(x)
+        if self.with_norm:
+            # `bn` holds whichever norm layer get_norm() returned
+            x = self.bn(x)
+        if self.with_act:
+            x = self.act(x)
+        return x
+
+    def fuseforward(self, x):  # conv (+ act if enabled); norm is skipped
+        return self.act(self.conv(x)) if self.with_act else self.conv(x)
+
+
+class DepthWiseConv(nn.Module):
+    """Depthwise conv, optional norm, 1x1 pointwise conv, optional act."""
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize,
+                 stride=1,
+                 groups=None,  # unused
+                 bias=False,
+                 act='silu',
+                 norm='bn'):
+        super().__init__()
+        padding = (ksize - 1) // 2
+        self.depthwise = nn.Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size=ksize,
+            stride=stride,
+            padding=padding,
+            groups=in_channels,
+            bias=bias,
+        )
+
+        self.pointwise = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=bias)
+        if norm is not None:
+            self.dwnorm = get_norm(norm, in_channels, inplace=True)
+            self.pwnorm = get_norm(norm, out_channels, inplace=True)
+        if act is not None:
+            self.act = get_activation(act, inplace=True)
+        # NOTE(review): 'pwnorm' is created but is not part of self.order
+        self.with_norm = norm is not None
+        self.with_act = act is not None
+        self.order = ['depthwise', 'dwnorm', 'pointwise', 'act']
+
+    def forward(self, x):
+        # 'dwnorm'/'act' are absent when norm/act is None: skip them
+        for layer_name in self.order:
+            layer = getattr(self, layer_name, None)
+            if layer is not None:
+                x = layer(x)
+        return x
+
+
+class DWConv(nn.Module):
+    """Depthwise-separable pair of BaseConv blocks.
+
+    `dconv` uses groups=in_channels; `pconv` is a 1x1 conv to out_channels.
+    """
+
+    def __init__(self, in_channels, out_channels, ksize, stride=1, act='silu'):
+        super().__init__()
+        # groups == in_channels: one filter per input channel
+        depthwise_kwargs = dict(
+            ksize=ksize, stride=stride, groups=in_channels, act=act)
+        self.dconv = BaseConv(in_channels, in_channels, **depthwise_kwargs)
+        # 1x1 conv that maps in_channels to out_channels
+        pointwise_kwargs = dict(ksize=1, stride=1, groups=1, act=act)
+        self.pconv = BaseConv(in_channels, out_channels, **pointwise_kwargs)
+
+    def forward(self, x):
+        depthwise_out = self.dconv(x)
+        return self.pconv(depthwise_out)
+
+
+class Bottleneck(nn.Module):
+    # Two-conv bottleneck (conv1 -> conv2) with an optional residual add.
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        shortcut=True,
+        expansion=0.5,
+        depthwise=False,
+        act='silu',
+        reparam=False,
+    ):
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)
+        Conv = DWConv if depthwise else BaseConv  # unused when reparam
+        k_conv1 = 3 if reparam else 1  # 3x3 first conv in reparam mode
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, k_conv1, stride=1, act=act)
+        if reparam:
+            self.conv2 = RepVggBlock(
+                hidden_channels, out_channels, 3, stride=1, act=act)
+        else:
+            self.conv2 = Conv(
+                hidden_channels, out_channels, 3, stride=1, act=act)
+        self.use_add = shortcut and in_channels == out_channels
+
+    def forward(self, x):
+        y = self.conv2(self.conv1(x))
+        if self.use_add:
+            y = y + x  # residual add (only when channel counts match)
+        return y
+
+
+class ResLayer(nn.Module):
+    """1x1 squeeze to half the channels, 3x3 restore, plus identity add."""
+
+    def __init__(self, in_channels: int):
+        super().__init__()
+        half = in_channels // 2
+        shared = dict(stride=1, act='lrelu')
+        self.layer1 = BaseConv(in_channels, half, ksize=1, **shared)
+        self.layer2 = BaseConv(half, in_channels, ksize=3, **shared)
+        # layer2 maps back to `in_channels`, matching x for the add below
+
+    def forward(self, x):
+        squeezed = self.layer1(x)
+        return x + self.layer2(squeezed)
+
+
+class SPPBottleneck(nn.Module):
+    """Spatial pyramid pooling layer used in YOLOv3-SPP"""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_sizes=(5, 9, 13),
+                 activation='silu'):
+        super().__init__()
+        half = in_channels // 2
+        n_branches = len(kernel_sizes) + 1  # pooled maps + the input itself
+        self.conv1 = BaseConv(in_channels, half, 1, stride=1, act=activation)
+        # stride-1 max pools padded by ks // 2
+        pools = [
+            nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
+            for ks in kernel_sizes
+        ]
+        self.m = nn.ModuleList(pools)
+        self.conv2 = BaseConv(
+            half * n_branches, out_channels, 1, stride=1, act=activation)
+
+    def forward(self, x):
+        reduced = self.conv1(x)
+        branches = [reduced] + [pool(reduced) for pool in self.m]
+        return self.conv2(torch.cat(branches, dim=1))
+
+
+class CSPLayer(nn.Module):
+    """C3 in yolov5, CSP Bottleneck with 3 convolutions"""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        n=1,
+        shortcut=True,
+        expansion=0.5,
+        depthwise=False,
+        act='silu',
+        reparam=False,
+    ):
+        """
+        Args:
+            in_channels (int): input channels.
+            out_channels (int): output channels.
+            n (int): number of Bottlenecks. Default value: 1.
+        """
+        # conv1 feeds the bottleneck stack, conv2 is the bypass branch
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)  # hidden channels
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, 1, stride=1, act=act)
+        self.conv2 = BaseConv(
+            in_channels, hidden_channels, 1, stride=1, act=act)
+        self.conv3 = BaseConv(
+            2 * hidden_channels, out_channels, 1, stride=1, act=act)
+        module_list = [
+            Bottleneck(
+                hidden_channels,
+                hidden_channels,
+                shortcut,
+                1.0,
+                depthwise,
+                act=act,
+                reparam=reparam) for _ in range(n)
+        ]
+        self.m = nn.Sequential(*module_list)
+
+    def forward(self, x):
+        x_1 = self.conv1(x)
+        x_2 = self.conv2(x)
+        x_1 = self.m(x_1)  # n stacked Bottlenecks
+        x = torch.cat((x_1, x_2), dim=1)  # 2 * hidden_channels channels
+        return self.conv3(x)
+
+
+class Focus(nn.Module):
+    """Focus width and height information into channel space."""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize=1,
+                 stride=1,
+                 act='silu'):
+        super().__init__()
+        self.conv = BaseConv(
+            in_channels * 4, out_channels, ksize, stride, act=act)
+
+    def forward(self, x):
+        # assuming NCHW input: (b, c, h, w) -> (b, 4c, h/2, w/2)
+        patch_top_left = x[..., ::2, ::2]
+        patch_top_right = x[..., ::2, 1::2]
+        patch_bot_left = x[..., 1::2, ::2]
+        patch_bot_right = x[..., 1::2, 1::2]
+        x = torch.cat(
+            (
+                patch_top_left,
+                patch_bot_left,
+                patch_top_right,
+                patch_bot_right,
+            ),
+            dim=1,
+        )
+        return self.conv(x)
+
+
+class fast_Focus(nn.Module):
+    """Focus variant that slices with fixed-weight stride-2 convolutions."""
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize=1,
+                 stride=1,
+                 act='silu'):
+        super().__init__()
+        self.conv1 = self.focus_conv(w1=1.0)  # kernel row 0, col 0
+        self.conv2 = self.focus_conv(w3=1.0)  # kernel row 1, col 0
+        self.conv3 = self.focus_conv(w2=1.0)  # kernel row 0, col 1
+        self.conv4 = self.focus_conv(w4=1.0)  # kernel row 1, col 1
+
+        self.conv = BaseConv(
+            in_channels * 4, out_channels, ksize, stride, act=act)
+
+    def forward(self, x):
+        return self.conv(
+            torch.cat(
+                [self.conv1(x),
+                 self.conv2(x),
+                 self.conv3(x),
+                 self.conv4(x)], 1))
+
+    def focus_conv(self, w1=0.0, w2=0.0, w3=0.0, w4=0.0):
+        conv = nn.Conv2d(3, 3, 2, 2, groups=3, bias=False)  # 3 channels only
+        conv.weight = self.init_weights_constant(w1, w2, w3, w4)
+        conv.weight.requires_grad = False
+        return conv
+
+    def init_weights_constant(self, w1=0.0, w2=0.0, w3=0.0, w4=0.0):
+        return nn.Parameter(
+            torch.tensor([[[[w1, w2], [w3, w4]]], [[[w1, w2], [w3, w4]]],
+                          [[[w1, w2], [w3, w4]]]]))
+
+
+# shufflenet block
+def channel_shuffle(x, groups=2):
+    # Interleave channels across `groups`: split the channel axis into
+    # (groups, channels // groups), swap the two, then flatten back.
+    n, c, d2, d3 = x.shape
+    grouped = x.view(n, groups, c // groups, d2, d3)
+    swapped = torch.transpose(grouped, 1, 2).contiguous()
+    return swapped.view(n, -1, d2, d3)
+
+
+def conv_1x1_bn(in_c, out_c, stride=1):
+    # 1x1 conv (padding 0, no bias) -> BatchNorm -> in-place ReLU
+    conv = nn.Conv2d(in_c, out_c, 1, stride, 0, bias=False)
+    return nn.Sequential(conv, nn.BatchNorm2d(out_c), nn.ReLU(True))
+
+
+def conv_bn(in_c, out_c, stride=2):
+    # 3x3 conv (padding 1, no bias) -> BatchNorm -> in-place ReLU
+    conv = nn.Conv2d(in_c, out_c, 3, stride, 1, bias=False)
+    return nn.Sequential(conv, nn.BatchNorm2d(out_c), nn.ReLU(True))
+
+
+class ShuffleBlock(nn.Module):
+
+    def __init__(self, in_c, out_c, downsample=False):
+        super(ShuffleBlock, self).__init__()
+        self.downsample = downsample
+        half_c = out_c // 2
+        if downsample:
+            self.branch1 = nn.Sequential(
+                # 3*3 dw conv, stride = 1 (stride-2 version disabled below)
+                # nn.Conv2d(in_c, in_c, 3, 2, 1, groups=in_c, bias=False),
+                nn.Conv2d(in_c, in_c, 3, 1, 1, groups=in_c, bias=False),
+                nn.BatchNorm2d(in_c),
+                # 1*1 pw conv
+                nn.Conv2d(in_c, half_c, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(half_c),
+                nn.ReLU(True))
+
+            self.branch2 = nn.Sequential(
+                # 1*1 pw conv
+                nn.Conv2d(in_c, half_c, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(half_c),
+                nn.ReLU(True),
+                # 3*3 dw conv, stride = 1 (stride-2 version disabled below)
+                # nn.Conv2d(half_c, half_c, 3, 2, 1, groups=half_c, bias=False),
+                nn.Conv2d(half_c, half_c, 3, 1, 1, groups=half_c, bias=False),
+                nn.BatchNorm2d(half_c),
+                # 1*1 pw conv
+                nn.Conv2d(half_c, half_c, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(half_c),
+                nn.ReLU(True))
+        else:
+            # half of the input is passed through, so in_c must equal out_c
+            assert in_c == out_c
+
+            self.branch2 = nn.Sequential(
+                # 1*1 pw conv
+                nn.Conv2d(half_c, half_c, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(half_c),
+                nn.ReLU(True),
+                # 3*3 dw conv, stride = 1
+                nn.Conv2d(half_c, half_c, 3, 1, 1, groups=half_c, bias=False),
+                nn.BatchNorm2d(half_c),
+                # 1*1 pw conv
+                nn.Conv2d(half_c, half_c, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(half_c),
+                nn.ReLU(True))
+
+    def forward(self, x):
+        out = None
+        if self.downsample:
+            # both branches see the full input; no channel split is done
+            out = torch.cat((self.branch1(x), self.branch2(x)), 1)
+        else:
+            # channel split
+            channels = x.shape[1]
+            c = channels // 2
+            x1 = x[:, :c, :, :]
+            x2 = x[:, c:, :, :]
+            out = torch.cat((x1, self.branch2(x2)), 1)
+        return channel_shuffle(out, 2)
+
+
+class ShuffleCSPLayer(nn.Module):
+    """CSP-style layer: two 1x1 branches, concat, then channel shuffle."""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        n=1,
+        shortcut=True,
+        expansion=0.5,
+        depthwise=False,
+        act='silu',
+    ):
+        """
+        Args:
+            in_channels (int): input channels.
+            out_channels (int): output channels.
+            n (int): number of Bottlenecks. Default value: 1.
+        """
+        # conv1 feeds the bottleneck stack, conv2 is the bypass branch
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)  # hidden channels
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, 1, stride=1, act=act)
+        self.conv2 = BaseConv(
+            in_channels, hidden_channels, 1, stride=1, act=act)
+        module_list = [
+            Bottleneck(
+                hidden_channels,
+                hidden_channels,
+                shortcut,
+                1.0,
+                depthwise,
+                act=act) for _ in range(n)
+        ]
+        self.m = nn.Sequential(*module_list)
+
+    def forward(self, x):
+        x_1 = self.conv1(x)
+        x_2 = self.conv2(x)
+        x_1 = self.m(x_1)
+        x = torch.cat((x_1, x_2), dim=1)  # 2 * hidden_channels channels
+        # no fusing conv here: the concat is shuffled and returned as is
+        return channel_shuffle(x, 2)
diff --git a/modelscope/models/cv/tinynas_detection/core/neck_ops.py b/modelscope/models/cv/tinynas_detection/core/neck_ops.py
new file mode 100644
index 00000000..7f481665
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/core/neck_ops.py
@@ -0,0 +1,324 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Swish(nn.Module):
+    """x * sigmoid(x); with `inplace=True` the input tensor is overwritten."""
+    def __init__(self, inplace=True):
+        super(Swish, self).__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        if self.inplace:
+            x.mul_(torch.sigmoid(x))  # F.sigmoid is the deprecated alias
+            return x
+        else:
+            return x * torch.sigmoid(x)
+
+
+def get_activation(name='silu', inplace=True):
+    # Resolve `name` to an activation module:
+    #   None -> nn.Identity(), nn.Module -> returned unchanged,
+    #   str  -> a new module for one of the names handled below.
+    if name is None:
+        return nn.Identity()
+    if isinstance(name, nn.Module):
+        return name
+    if not isinstance(name, str):
+        raise AttributeError('Unsupported act type: {}'.format(name))
+
+    if name == 'silu':
+        return nn.SiLU(inplace=inplace)
+    if name == 'relu':
+        return nn.ReLU(inplace=inplace)
+    if name == 'lrelu':
+        return nn.LeakyReLU(0.1, inplace=inplace)
+    if name == 'swish':
+        return Swish(inplace=inplace)
+    if name == 'hardsigmoid':
+        return nn.Hardsigmoid(inplace=inplace)
+    raise AttributeError('Unsupported act type: {}'.format(name))
+
+
+class ConvBNLayer(nn.Module):
+    """Bias-free Conv2d, then BatchNorm2d, then the `act` activation."""
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size=3,
+                 stride=1,
+                 groups=1,
+                 padding=0,
+                 act=None):
+        super().__init__()
+        conv_args = dict(
+            in_channels=ch_in,
+            out_channels=ch_out,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            bias=False)
+        self.conv = nn.Conv2d(**conv_args)
+        self.bn = nn.BatchNorm2d(ch_out)
+        self.act = get_activation(act, inplace=True)
+
+    def forward(self, x):
+        # conv -> bn -> act
+        normed = self.bn(self.conv(x))
+        return self.act(normed)
+
+
+class RepVGGBlock(nn.Module):
+    """3x3 + 1x1 ConvBNLayer branches that can be fused into one 3x3 conv."""
+    def __init__(self, ch_in, ch_out, act='relu', deploy=False):
+        super(RepVGGBlock, self).__init__()
+        self.ch_in = ch_in
+        self.ch_out = ch_out
+        self.deploy = deploy
+        self.in_channels = ch_in
+        self.groups = 1
+        if self.deploy is False:
+            self.rbr_dense = ConvBNLayer(
+                ch_in, ch_out, 3, stride=1, padding=1, act=None)
+            self.rbr_1x1 = ConvBNLayer(
+                ch_in, ch_out, 1, stride=1, padding=0, act=None)
+            # self.rbr_identity = nn.BatchNorm2d(num_features=ch_in) if ch_out == ch_in else None
+            self.rbr_identity = None
+        else:
+            self.rbr_reparam = nn.Conv2d(
+                in_channels=self.ch_in,
+                out_channels=self.ch_out,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                groups=1)
+        self.act = get_activation(act) if act is None or isinstance(
+            act, (str, dict)) else act
+
+    def forward(self, x):
+        if self.deploy:
+            # deploy mode: the single fused 3x3 conv replaces both branches
+            y = self.rbr_reparam(x)
+        else:
+            if self.rbr_identity is None:
+                y = self.rbr_dense(x) + self.rbr_1x1(x)
+            else:
+                y = self.rbr_dense(x) + self.rbr_1x1(x) + self.rbr_identity(x)
+
+        y = self.act(y)
+        return y
+
+    def switch_to_deploy(self):
+        if self.deploy:  # already fused; training branches are gone
+            return
+        if not hasattr(self, 'rbr_reparam'):
+            self.rbr_reparam = nn.Conv2d(
+                in_channels=self.ch_in,
+                out_channels=self.ch_out,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                groups=1)
+        # fold conv + BN of every branch into one 3x3 kernel and bias
+        kernel, bias = self.get_equivalent_kernel_bias()
+        self.rbr_reparam.weight.data = kernel
+        self.rbr_reparam.bias.data = bias
+        for para in self.parameters():
+            para.detach_()
+        # drop the training-time branches; forward() only uses
+        # rbr_reparam once self.deploy is True
+        self.__delattr__('rbr_dense')
+        self.__delattr__('rbr_1x1')
+        if hasattr(self, 'rbr_identity'):
+            self.__delattr__('rbr_identity')
+        if hasattr(self, 'id_tensor'):
+            self.__delattr__('id_tensor')
+        self.deploy = True
+
+    def get_equivalent_kernel_bias(self):
+        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense)
+        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
+        kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity)
+        return kernel3x3 + self._pad_1x1_to_3x3_tensor(
+            kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid
+
+    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+        if kernel1x1 is None:
+            return 0
+        else:
+            return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])
+
+    def _fuse_bn_tensor(self, branch):
+        if branch is None:
+            return 0, 0
+        # if isinstance(branch, nn.Sequential):
+        if isinstance(branch, ConvBNLayer):
+            kernel = branch.conv.weight
+            running_mean = branch.bn.running_mean
+            running_var = branch.bn.running_var
+            gamma = branch.bn.weight
+            beta = branch.bn.bias
+            eps = branch.bn.eps
+        else:
+            assert isinstance(branch, nn.BatchNorm2d)
+            if not hasattr(self, 'id_tensor'):
+                input_dim = self.in_channels // self.groups
+                kernel_value = np.zeros((self.in_channels, input_dim, 3, 3),
+                                        dtype=np.float32)
+                for i in range(self.in_channels):
+                    kernel_value[i, i % input_dim, 1, 1] = 1
+                self.id_tensor = torch.from_numpy(kernel_value).to(
+                    branch.weight.device)
+            kernel = self.id_tensor
+            running_mean = branch.running_mean
+            running_var = branch.running_var
+            gamma = branch.weight
+            beta = branch.bias
+            eps = branch.eps
+        std = (running_var + eps).sqrt()
+        t = (gamma / std).reshape(-1, 1, 1, 1)
+        return kernel * t, beta - running_mean * gamma / std
+
+
+class BasicBlock(nn.Module):
+    """A single RepVGGBlock with an optional identity shortcut."""
+
+    def __init__(self, ch_in, ch_out, act='relu', shortcut=True):
+        super().__init__()
+        # input and output widths must match (checked even without shortcut)
+        assert ch_in == ch_out
+        # only `conv2` exists in this variant (no leading ConvBNLayer)
+        self.conv2 = RepVGGBlock(ch_in, ch_out, act=act)
+        self.shortcut = shortcut
+
+    def forward(self, x):
+        out = self.conv2(x)
+        if not self.shortcut:
+            return out
+        # residual connection
+        return x + out
+
+
+class BasicBlock_3x3(nn.Module):
+    """3x3 ConvBNLayer followed by a RepVGGBlock, optional shortcut."""
+
+    def __init__(self, ch_in, ch_out, act='relu', shortcut=True):
+        super().__init__()
+        assert ch_in == ch_out
+        # conv1 is registered before conv2, matching the forward order
+        self.conv1 = ConvBNLayer(
+            ch_in, ch_out, 3, stride=1, padding=1, act=act)
+        self.conv2 = RepVGGBlock(ch_in, ch_out, act=act)
+        self.shortcut = shortcut
+
+    def forward(self, x):
+        out = self.conv2(self.conv1(x))
+        if not self.shortcut:
+            return out
+        # residual connection
+        return x + out
+
+
+class BasicBlock_3x3_Reverse(nn.Module):
+    """RepVGGBlock followed by a 3x3 ConvBNLayer, optional shortcut."""
+
+    def __init__(self, ch_in, ch_out, act='relu', shortcut=True):
+        super().__init__()
+        assert ch_in == ch_out
+        # conv1 is still registered first; forward() applies conv2 first
+        self.conv1 = ConvBNLayer(
+            ch_in, ch_out, 3, stride=1, padding=1, act=act)
+        self.conv2 = RepVGGBlock(ch_in, ch_out, act=act)
+        self.shortcut = shortcut
+
+    def forward(self, x):
+        out = self.conv1(self.conv2(x))
+        if not self.shortcut:
+            return out
+        # residual connection
+        return x + out
+
+
+class SPP(nn.Module):
+    """Concat input with stride-1 max-pooled copies, then a ConvBNLayer."""
+    def __init__(
+        self,
+        ch_in,
+        ch_out,
+        k,
+        pool_size,
+        act='swish',
+    ):
+        super(SPP, self).__init__()
+        self.pool = []  # plain list; each pool is also registered below
+        for i, size in enumerate(pool_size):
+            pool = nn.MaxPool2d(
+                kernel_size=size, stride=1, padding=size // 2, ceil_mode=False)
+            self.add_module('pool{}'.format(i), pool)
+            self.pool.append(pool)
+        self.conv = ConvBNLayer(ch_in, ch_out, k, padding=k // 2, act=act)
+
+    def forward(self, x):
+        outs = [x]
+
+        for pool in self.pool:
+            outs.append(pool(x))
+        y = torch.cat(outs, axis=1)  # input + one map per pool size
+
+        y = self.conv(y)
+        return y
+
+
+class CSPStage(nn.Module):
+    """Two 1x1 branches; one runs n blocks whose outputs are all concated."""
+    def __init__(self, block_fn, ch_in, ch_out, n, act='swish', spp=False):
+        super(CSPStage, self).__init__()
+
+        ch_mid = int(ch_out // 2)
+        self.conv1 = ConvBNLayer(ch_in, ch_mid, 1, act=act)
+        self.conv2 = ConvBNLayer(ch_in, ch_mid, 1, act=act)
+        # self.conv2 = ConvBNLayer(ch_in, ch_mid, 3, stride=1, padding=1, act=act)
+        self.convs = nn.Sequential()
+
+        next_ch_in = ch_mid
+        for i in range(n):
+            if block_fn == 'BasicBlock':
+                self.convs.add_module(
+                    str(i),
+                    BasicBlock(next_ch_in, ch_mid, act=act, shortcut=False))
+            elif block_fn == 'BasicBlock_3x3':
+                self.convs.add_module(
+                    str(i),
+                    BasicBlock_3x3(next_ch_in, ch_mid, act=act, shortcut=True))
+            elif block_fn == 'BasicBlock_3x3_Reverse':
+                self.convs.add_module(
+                    str(i),
+                    BasicBlock_3x3_Reverse(
+                        next_ch_in, ch_mid, act=act, shortcut=True))
+            else:
+                raise NotImplementedError
+            if i == (n - 1) // 2 and spp:
+                self.convs.add_module(
+                    'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act))
+            next_ch_in = ch_mid
+        # NOTE(review): with spp=True forward() collects n + 2 maps, not n + 1
+        self.conv3 = ConvBNLayer(ch_mid * (n + 1), ch_out, 1, act=act)
+
+    def forward(self, x):
+        y1 = self.conv1(x)
+        y2 = self.conv2(x)
+
+        mid_out = [y1]
+        for conv in self.convs:
+            y2 = conv(y2)
+            mid_out.append(y2)  # every intermediate output is kept
+        y = torch.cat(mid_out, axis=1)
+        y = self.conv3(y)
+        return y
diff --git a/modelscope/models/cv/tinynas_detection/core/repvgg_block.py b/modelscope/models/cv/tinynas_detection/core/repvgg_block.py
new file mode 100644
index 00000000..06966a4e
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/core/repvgg_block.py
@@ -0,0 +1,205 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.nn.init as init
+from torch.nn.parameter import Parameter
+
+
+def get_activation(name='silu', inplace=True):
+    # Build the activation module called `name`; 'identity' maps to
+    # nn.Identity() and unknown names raise AttributeError.
+    if name == 'silu':
+        return nn.SiLU(inplace=inplace)
+    if name == 'relu':
+        return nn.ReLU(inplace=inplace)
+    if name == 'lrelu':
+        return nn.LeakyReLU(0.1, inplace=inplace)
+    if name == 'identity':
+        return nn.Identity()
+    raise AttributeError('Unsupported act type: {}'.format(name))
+
+
+def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1):
+    '''Sequential of a bias-free Conv2d ('conv') and a BatchNorm2d ('bn')'''
+    conv = nn.Conv2d(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=padding,
+        groups=groups,
+        bias=False)
+    bn = nn.BatchNorm2d(num_features=out_channels)
+    cell = nn.Sequential()
+    cell.add_module('conv', conv)
+    cell.add_module('bn', bn)
+    return cell
+
+
+class RepVggBlock(nn.Module):
+    '''Rep-style block: 3x3 and 1x1 conv+BN branches for training, or a
+    single fused conv (`rbr_reparam`) once in deploy status.
+    This code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
+    '''
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=3,
+                 stride=1,
+                 padding=1,
+                 dilation=1,
+                 groups=1,
+                 padding_mode='zeros',
+                 deploy=False,
+                 use_se=False,
+                 act='relu',
+                 norm=None):
+        super(RepVggBlock, self).__init__()
+        """ Initialization of the class.
+        Args:
+            in_channels (int): Number of channels in the input image
+            out_channels (int): Number of channels produced by the convolution
+            kernel_size (int or tuple): Size of the convolving kernel
+            stride (int or tuple, optional): Stride of the convolution. Default: 1
+            padding (int or tuple, optional): Zero-padding added to both sides of
+                the input. Default: 1
+            dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
+            groups (int, optional): Number of blocked connections from input
+                channels to output channels. Default: 1
+            padding_mode (string, optional): Default: 'zeros'
+            deploy: Whether to be deploy status or training status. Default: False
+            use_se: Whether to use se. Default: False
+        """
+        self.deploy = deploy
+        self.groups = groups
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        assert kernel_size == 3
+        assert padding == 1
+
+        padding_11 = padding - kernel_size // 2  # padding of the 1x1 branch
+
+        if isinstance(act, str):
+            self.nonlinearity = get_activation(act)
+        else:
+            self.nonlinearity = act  # non-str `act` is stored unchanged
+
+        if use_se:
+            raise NotImplementedError('se block not supported yet')
+        else:
+            self.se = nn.Identity()
+
+        if deploy:
+            self.rbr_reparam = nn.Conv2d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                dilation=dilation,
+                groups=groups,
+                bias=True,
+                padding_mode=padding_mode)
+
+        else:
+            self.rbr_identity = None  # no identity branch is built
+            self.rbr_dense = conv_bn(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                groups=groups)
+            self.rbr_1x1 = conv_bn(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1,
+                stride=stride,
+                padding=padding_11,
+                groups=groups)
+
+    def forward(self, inputs):
+        '''Use the fused conv if present, else sum the training branches.'''
+        if hasattr(self, 'rbr_reparam'):
+            return self.nonlinearity(self.se(self.rbr_reparam(inputs)))
+
+        if self.rbr_identity is None:
+            id_out = 0
+        else:
+            id_out = self.rbr_identity(inputs)
+
+        return self.nonlinearity(
+            self.se(self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out))
+
+    def get_equivalent_kernel_bias(self):
+        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense)
+        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
+        kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity)
+        return kernel3x3 + self._pad_1x1_to_3x3_tensor(
+            kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid
+
+    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+        if kernel1x1 is None:
+            return 0
+        else:
+            return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])
+
+    def _fuse_bn_tensor(self, branch):
+        if branch is None:
+            return 0, 0
+        if isinstance(branch, nn.Sequential):
+            kernel = branch.conv.weight
+            running_mean = branch.bn.running_mean
+            running_var = branch.bn.running_var
+            gamma = branch.bn.weight
+            beta = branch.bn.bias
+            eps = branch.bn.eps
+        else:
+            assert isinstance(branch, nn.BatchNorm2d)
+            if not hasattr(self, 'id_tensor'):
+                input_dim = self.in_channels // self.groups
+                kernel_value = np.zeros((self.in_channels, input_dim, 3, 3),
+                                        dtype=np.float32)
+                for i in range(self.in_channels):
+                    kernel_value[i, i % input_dim, 1, 1] = 1
+                self.id_tensor = torch.from_numpy(kernel_value).to(
+                    branch.weight.device)
+            kernel = self.id_tensor
+            running_mean = branch.running_mean
+            running_var = branch.running_var
+            gamma = branch.weight
+            beta = branch.bias
+            eps = branch.eps
+        std = (running_var + eps).sqrt()
+        t = (gamma / std).reshape(-1, 1, 1, 1)
+        return kernel * t, beta - running_mean * gamma / std
+
+    def switch_to_deploy(self):
+        if hasattr(self, 'rbr_reparam'):  # already fused
+            return
+        kernel, bias = self.get_equivalent_kernel_bias()
+        self.rbr_reparam = nn.Conv2d(
+            in_channels=self.rbr_dense.conv.in_channels,
+            out_channels=self.rbr_dense.conv.out_channels,
+            kernel_size=self.rbr_dense.conv.kernel_size,
+            stride=self.rbr_dense.conv.stride,
+            padding=self.rbr_dense.conv.padding,
+            dilation=self.rbr_dense.conv.dilation,
+            groups=self.rbr_dense.conv.groups,
+            bias=True)
+        self.rbr_reparam.weight.data = kernel
+        self.rbr_reparam.bias.data = bias
+        for para in self.parameters():
+            para.detach_()
+        self.__delattr__('rbr_dense')
+        self.__delattr__('rbr_1x1')
+        if hasattr(self, 'rbr_identity'):
+            self.__delattr__('rbr_identity')
+        if hasattr(self, 'id_tensor'):
+            self.__delattr__('id_tensor')
+        self.deploy = True
diff --git a/modelscope/models/cv/tinynas_detection/core/utils.py b/modelscope/models/cv/tinynas_detection/core/utils.py
new file mode 100644
index 00000000..482f12fb
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/core/utils.py
@@ -0,0 +1,196 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import numpy as np
+import torch
+import torchvision
+
+__all__ = [
+    'filter_box',
+    'postprocess_airdet',
+    'bboxes_iou',
+    'matrix_iou',
+    'adjust_box_anns',
+    'xyxy2xywh',
+    'xyxy2cxcywh',
+]
+
+
+def multiclass_nms(multi_bboxes,
+                   multi_scores,
+                   score_thr,
+                   iou_thr,
+                   max_num=100,
+                   score_factors=None):
+    """NMS for multi-class bboxes.
+
+    Args:
+        multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
+        multi_scores (Tensor): shape (n, #class); every column is treated
+            as a foreground class (no background column is stripped).
+        score_thr (float): bbox threshold, bboxes with scores lower than it
+            will not be considered.
+        iou_thr (float): NMS IoU threshold
+        max_num (int): if there are more than max_num bboxes after NMS,
+            only top max_num will be kept.
+        score_factors (Tensor): The factors multiplied to scores before
+            applying NMS (score_thr is checked on the unscaled scores).
+
+    Returns:
+        tuple: (bboxes, scores, labels), tensors of shape (k, 4), (k, )
+            and (k, ). Labels are 0-based.
+    """
+    num_classes = multi_scores.size(1)
+    # class-agnostic (n, 4) boxes are shared by every class via expand
+    if multi_bboxes.shape[1] > 4:
+        bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
+    else:
+        bboxes = multi_bboxes[:, None].expand(
+            multi_scores.size(0), num_classes, 4)
+    scores = multi_scores
+    # filter out boxes with low scores
+    valid_mask = scores > score_thr  # (n, #class) bool
+
+    # We use masked_select for ONNX exporting purpose,
+    # which is equivalent to bboxes = bboxes[valid_mask]
+    # (TODO): as ONNX does not support repeat now,
+    # we have to use this ugly code
+    # bboxes -> (k, 4): one row per (box, class) pair that passed score_thr
+    bboxes = torch.masked_select(
+        bboxes,
+        torch.stack((valid_mask, valid_mask, valid_mask, valid_mask),
+                    -1)).view(-1, 4)  # mask: (n, #class, 4)
+    if score_factors is not None:
+        scores = scores * score_factors[:, None]
+    scores = torch.masked_select(scores, valid_mask)
+    labels = valid_mask.nonzero(as_tuple=False)[:, 1]
+
+    if bboxes.numel() == 0:
+        # Keep the same (k, 4) box layout as the non-empty return below.
+        bboxes = multi_bboxes.new_zeros((0, 4))
+        labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
+        scores = multi_bboxes.new_zeros((0, ))
+        return bboxes, scores, labels
+
+    keep = torchvision.ops.batched_nms(bboxes, scores, labels, iou_thr)
+
+    if max_num > 0:
+        keep = keep[:max_num]
+
+    return bboxes[keep], scores[keep], labels[keep]
+
+
+def filter_box(output, scale_range):
+    """
+    Keep the rows of `output` (N, 5+class; columns 0-3 read as x1, y1, x2,
+    y2) whose box area lies strictly inside (min_scale**2, max_scale**2).
+    """
+    lower, upper = scale_range
+    area = (output[:, 2] - output[:, 0]) * (output[:, 3] - output[:, 1])
+    in_range = (area > lower * lower) & (area < upper * upper)
+    return output[in_range]
+
+
+def filter_results(boxlist, num_classes, nms_thre):
+    """Apply class-aware NMS to `boxlist` and return the surviving entries.
+
+    `boxlist` must expose `.bbox`, `get_field('scores' | 'labels')` and
+    index selection; `num_classes` is accepted but not used here.
+    """
+    kept = torchvision.ops.batched_nms(
+        boxlist.bbox,
+        boxlist.get_field('scores'),
+        boxlist.get_field('labels'),
+        nms_thre,
+    )
+    return boxlist[kept]
+
+
+def postprocess_airdet(prediction,
+                       num_classes,
+                       conf_thre=0.7,
+                       nms_thre=0.45,
+                       imgs=None):
+    """Convert raw predictions into per-image NMS-filtered detections.
+
+    The first four channels of `prediction` are rewritten in place from
+    (cx, cy, w, h) to (x1, y1, x2, y2); channels 5+ are the class scores.
+    Returns one entry per image: a tensor with columns
+    [box, score, score, label], or None when the image has no rows.
+    `num_classes` and `imgs` are accepted but unused.
+    """
+    centers = prediction[:, :, 0:2]
+    half_sizes = prediction[:, :, 2:4] / 2
+    corners = torch.cat((centers - half_sizes, centers + half_sizes), dim=2)
+    prediction[:, :, :4] = corners
+    output = []
+    for image_pred in prediction:
+        if not image_pred.size(0):
+            # nothing to process for this image
+            output.append(None)
+            continue
+        boxes, scores, labels = multiclass_nms(image_pred[:, :4],
+                                               image_pred[:, 5:], conf_thre,
+                                               nms_thre, 500)
+        columns = (boxes, scores[:, None], scores[:, None], labels[:, None])
+        output.append(torch.cat(columns, dim=1))
+    return output
+
+
+def bboxes_iou(bboxes_a, bboxes_b, xyxy=True):
+    """Pairwise IoU of box tensors (n, 4) x (m, 4) -> (n, m).
+
+    Boxes are (x1, y1, x2, y2) if `xyxy`, else (cx, cy, w, h). Raises
+    IndexError when either input does not have exactly 4 columns.
+    """
+    if not (bboxes_a.shape[1] == 4 and bboxes_b.shape[1] == 4):
+        raise IndexError
+    if xyxy:
+        a_min, a_max = bboxes_a[:, :2], bboxes_a[:, 2:]
+        b_min, b_max = bboxes_b[:, :2], bboxes_b[:, 2:]
+        area_a = torch.prod(a_max - a_min, 1)
+        area_b = torch.prod(b_max - b_min, 1)
+    else:
+        a_half, b_half = bboxes_a[:, 2:] / 2, bboxes_b[:, 2:] / 2
+        a_min, a_max = bboxes_a[:, :2] - a_half, bboxes_a[:, :2] + a_half
+        b_min, b_max = bboxes_b[:, :2] - b_half, bboxes_b[:, :2] + b_half
+        area_a = torch.prod(bboxes_a[:, 2:], 1)
+        area_b = torch.prod(bboxes_b[:, 2:], 1)
+    top_left = torch.max(a_min[:, None], b_min)
+    bottom_right = torch.min(a_max[:, None], b_max)
+    overlap = (top_left < bottom_right).type(top_left.type()).prod(dim=2)
+    inter = torch.prod(bottom_right - top_left, 2) * overlap
+    return inter / (area_a[:, None] + area_b - inter)
+
+
+def matrix_iou(a, b):
+    """Pairwise IoU of xyxy boxes a (n, 4) and b (m, 4) as an (n, m) array.
+    Numpy version for data augmentation.
+    """
+    top_left = np.maximum(a[:, np.newaxis, :2], b[:, :2])
+    bottom_right = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
+    overlaps = (top_left < bottom_right).all(axis=2)
+    inter = np.prod(bottom_right - top_left, axis=2) * overlaps
+    area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)[:, np.newaxis]
+    area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
+    return inter / (area_a + area_b - inter + 1e-12)
+
+
+def adjust_box_anns(bbox, scale_ratio, padw, padh, w_max, h_max):
+    for k, pad, hi in ((0, padw, w_max), (1, padh, h_max)):  # x cols, y cols
+        bbox[:, k::2] = np.clip(bbox[:, k::2] * scale_ratio + pad, 0, hi)
+    return bbox
+
+
+def xyxy2xywh(bboxes):
+    """In place: (x1, y1, x2, y2) -> (x1, y1, w, h); returns `bboxes`."""
+    bboxes[:, 2:4] = bboxes[:, 2:4] - bboxes[:, 0:2]
+    return bboxes
+
+
+def xyxy2cxcywh(bboxes):
+    """In place: (x1, y1, x2, y2) -> (cx, cy, w, h); returns `bboxes`."""
+    # Width / height first, then move the top-left corner to the centre.
+    bboxes[:, 2:4] = bboxes[:, 2:4] - bboxes[:, 0:2]
+    bboxes[:, 0:2] = bboxes[:, 0:2] + bboxes[:, 2:4] * 0.5
+    return bboxes
diff --git a/modelscope/models/cv/tinynas_detection/detector.py b/modelscope/models/cv/tinynas_detection/detector.py
new file mode 100644
index 00000000..615b13a8
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/detector.py
@@ -0,0 +1,181 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import os.path as osp
+import pickle
+
+import cv2
+import torch
+import torchvision
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from .backbone import build_backbone
+from .head import build_head
+from .neck import build_neck
+from .utils import parse_config
+
+
+class SingleStageDetector(TorchModel):
+    """
+    The base class of single stage detector (backbone -> neck -> head).
+    """
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """Build the detector from the config, label map and checkpoint
+        found in `model_dir` (the label map is unpickled: trusted files only).
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        config_path = osp.join(model_dir, 'airdet_s.py')
+        config = parse_config(config_path)
+        self.cfg = config
+        model_path = osp.join(model_dir, config.model.name)
+        label_map = osp.join(model_dir, config.model.class_map)
+        with open(label_map, 'rb') as f:  # closed even if unpickling fails
+            self.label_map = pickle.load(f)
+        self.size_divisible = config.dataset.size_divisibility
+        self.num_classes = config.model.head.num_classes
+        self.conf_thre = config.model.head.nms_conf_thre
+        self.nms_thre = config.model.head.nms_iou_thre
+
+        self.backbone = build_backbone(self.cfg.model.backbone)
+        self.neck = build_neck(self.cfg.model.neck)
+        self.head = build_head(self.cfg.model.head)
+
+        self.load_pretrain_model(model_path)
+
+    def load_pretrain_model(self, pretrain_model):
+        """Load checkpoint['model'], stripping 'module.' from its keys."""
+        state_dict = torch.load(pretrain_model, map_location='cpu')['model']
+        new_state_dict = {}
+        for k, v in state_dict.items():
+            k = k.replace('module.', '')
+            new_state_dict[k] = v
+        self.load_state_dict(new_state_dict, strict=True)
+
+    def inference(self, x):
+        """Dispatch to forward_train / forward_eval based on self.training."""
+        if self.training:
+            return self.forward_train(x)
+        else:
+            return self.forward_eval(x)
+
+    def forward_train(self, x):
+        """Training is not implemented here; returns None."""
+        pass
+
+    def forward_eval(self, x):
+        """Run backbone, neck and head and return the head's prediction."""
+        x = self.backbone(x)
+        x = self.neck(x)
+        prediction = self.head(x)
+
+        return prediction
+
+    def preprocess(self, image):
+        """HWC numpy image -> zero-padded (1, C, H, W) float32 tensor."""
+        image = torch.from_numpy(image).type(torch.float32)
+        image = image.permute(2, 0, 1)
+        shape = image.shape  # c, h, w
+        if self.size_divisible > 0:
+            import math
+            stride = self.size_divisible
+            shape = list(shape)
+            shape[1] = int(math.ceil(shape[1] / stride) * stride)
+            shape[2] = int(math.ceil(shape[2] / stride) * stride)
+            shape = tuple(shape)
+        pad_img = image.new(*shape).zero_()
+        pad_img[:, :image.shape[1], :image.shape[2]].copy_(image)
+        pad_img = pad_img.unsqueeze(0)
+        return pad_img
+
+    def postprocess(self, preds):
+        """NMS the head output; return numpy boxes, scores and label names."""
+        bboxes, scores, labels_idx = postprocess_gfocal(
+            preds, self.num_classes, self.conf_thre, self.nms_thre)
+        bboxes = bboxes.cpu().numpy()
+        scores = scores.cpu().numpy()
+        labels_idx = labels_idx.cpu().numpy()
+        labels = [self.label_map[idx + 1][0]['name'] for idx in labels_idx]
+        return (bboxes, scores, labels)
+
+
+def multiclass_nms(multi_bboxes,
+                   multi_scores,
+                   score_thr,
+                   iou_thr,
+                   max_num=100,
+                   score_factors=None):
+    """NMS for multi-class bboxes.
+
+    Args:
+        multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
+        multi_scores (Tensor): shape (n, #class); every column is treated
+            as a foreground class (no background column is stripped).
+        score_thr (float): bbox threshold, bboxes with scores lower than it
+            will not be considered.
+        iou_thr (float): NMS IoU threshold
+        max_num (int): if there are more than max_num bboxes after NMS,
+            only top max_num will be kept.
+        score_factors (Tensor): The factors multiplied to scores before
+            applying NMS (score_thr is checked on the unscaled scores).
+
+    Returns:
+        tuple: (bboxes, scores, labels), tensors of shape (k, 4), (k, )
+            and (k, ). Labels are 0-based.
+    """
+    num_classes = multi_scores.size(1)
+    # class-agnostic (n, 4) boxes are shared by every class via expand
+    if multi_bboxes.shape[1] > 4:
+        bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
+    else:
+        bboxes = multi_bboxes[:, None].expand(
+            multi_scores.size(0), num_classes, 4)
+    scores = multi_scores
+    # filter out boxes with low scores
+    valid_mask = scores > score_thr  # (n, #class) bool
+
+    # We use masked_select for ONNX exporting purpose,
+    # which is equivalent to bboxes = bboxes[valid_mask]
+    # (TODO): as ONNX does not support repeat now,
+    # we have to use this ugly code
+    # bboxes -> (k, 4): one row per (box, class) pair that passed score_thr
+    bboxes = torch.masked_select(
+        bboxes,
+        torch.stack((valid_mask, valid_mask, valid_mask, valid_mask),
+                    -1)).view(-1, 4)  # mask: (n, #class, 4)
+    if score_factors is not None:
+        scores = scores * score_factors[:, None]
+    scores = torch.masked_select(scores, valid_mask)
+    labels = valid_mask.nonzero(as_tuple=False)[:, 1]
+
+    if bboxes.numel() == 0:
+        # Keep the same (k, 4) box layout as the non-empty return below.
+        bboxes = multi_bboxes.new_zeros((0, 4))
+        labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
+        scores = multi_bboxes.new_zeros((0, ))
+        return bboxes, scores, labels
+
+    keep = torchvision.ops.batched_nms(bboxes, scores, labels, iou_thr)
+
+    if max_num > 0:
+        keep = keep[:max_num]
+
+    return bboxes[keep], scores[keep], labels[keep]
+
+
+def postprocess_gfocal(prediction, num_classes, conf_thre=0.05, nms_thre=0.7):
+    """Run multiclass NMS on the decoded predictions of one image.
+
+    `prediction` must have batch size 1; per row, columns 0-3 are the box
+    and columns 4+ the class scores. `num_classes` is accepted but unused.
+    """
+    assert prediction.shape[0] == 1
+    image_pred = prediction[0]
+    # No early exit for an empty image: that path used to leave the results
+    # unbound; multiclass_nms already returns empty tensors in that case.
+    return multiclass_nms(image_pred[:, :4], image_pred[:, 4:], conf_thre,
+                          nms_thre, 500)
diff --git a/modelscope/models/cv/tinynas_detection/head/__init__.py b/modelscope/models/cv/tinynas_detection/head/__init__.py
new file mode 100644
index 00000000..f870fae1
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/head/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import copy
+
+from .gfocal_v2_tiny import GFocalHead_Tiny
+
+
+def build_head(cfg):
+    """Build the head selected by cfg['name'] from a copy of `cfg`; the
+    remaining entries are forwarded as keyword arguments.
+    """
+    head_cfg = copy.deepcopy(cfg)
+    if head_cfg.pop('name') != 'GFocalV2':
+        raise NotImplementedError
+    return GFocalHead_Tiny(**head_cfg)
diff --git a/modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py b/modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py
new file mode 100644
index 00000000..41f35968
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py
@@ -0,0 +1,361 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import functools
+from functools import partial
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..core.base_ops import BaseConv, DWConv
+
+
+class Scale(nn.Module):  # multiplies its input by one learnable scalar
+
+    def __init__(self, scale=1.0):
+        super().__init__()
+        self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float))
+
+    def forward(self, x):
+        return x * self.scale
+
+
+def multi_apply(func, *args, **kwargs):
+    """Map `func` over `args` in lockstep; regroup results per output slot."""
+    bound = func if not kwargs else partial(func, **kwargs)
+    per_call = map(bound, *args)
+    return tuple(list(slot) for slot in zip(*per_call))
+
+
+def xyxy2CxCywh(xyxy, size=None):
+    """Convert (..., 4) corner boxes (x1, y1, x2, y2) to (cx, cy, w, h).
+
+    When `size` is given, w is clamped to [0, size[1]] and h to
+    [0, size[0]] (presumably size = (height, width)); cx / cy are computed
+    from the unclamped corners.
+    """
+    x1, y1, x2, y2 = (xyxy[..., k] for k in range(4))
+    width, height = x2 - x1, y2 - y1
+    if size is not None:
+        width = width.clamp(min=0, max=size[1])
+        height = height.clamp(min=0, max=size[0])
+    center_x = (x1 + x2) / 2
+    center_y = (y1 + y2) / 2
+    return torch.stack([center_x, center_y, width, height], dim=-1)
+
+
+def distance2bbox(points, distance, max_shape=None):
+    """Decode (left, top, right, bottom) distances from `points` into xyxy
+    boxes, clamped to x in [0, max_shape[1]], y in [0, max_shape[0]] if set.
+    """
+    px, py = points[..., 0], points[..., 1]
+    corners = [
+        px - distance[..., 0], py - distance[..., 1],
+        px + distance[..., 2], py + distance[..., 3]
+    ]
+    if max_shape is not None:
+        limits = (max_shape[1], max_shape[0]) * 2
+        corners = [c.clamp(min=0, max=m) for c, m in zip(corners, limits)]
+    return torch.stack(corners, -1)
+
+
+def bbox2distance(points, bbox, max_dis=None, eps=0.1):
+    """Encode xyxy `bbox` as (left, top, right, bottom) distances from
+    `points`, clamped to [0, max_dis - eps] when `max_dis` is given.
+    """
+    px, py = points[:, 0], points[:, 1]
+    sides = [
+        px - bbox[:, 0], py - bbox[:, 1],
+        bbox[:, 2] - px, bbox[:, 3] - py
+    ]
+    if max_dis is not None:
+        upper = max_dis - eps
+        sides = [side.clamp(min=0, max=upper) for side in sides]
+    return torch.stack(sides, -1)
+
+
+class Integral(nn.Module):
+    """Fixed (non-learned) layer turning per-side bin logits into their
+    softmax-weighted mean over the bin positions 0..reg_max.
+    """
+
+    def __init__(self, reg_max=16):
+        super().__init__()
+        self.reg_max = reg_max
+        bin_positions = torch.linspace(0, self.reg_max, self.reg_max + 1)
+        self.register_buffer('project', bin_positions)
+
+    def forward(self, x):
+        """Map x (batch, points, 4 * (reg_max + 1)) logits to the
+        (batch, points, 4) expected bin positions.
+        """
+        bins = self.reg_max + 1
+        probs = F.softmax(x.reshape(*x.size()[:-1], 4, bins), dim=-1)
+        batch, num_points, _, _ = probs.size()
+        weights = self.project.type_as(probs).unsqueeze(1)
+        expected = torch.matmul(probs.reshape(-1, bins), weights)
+        return expected.reshape(batch, num_points, 4)
+
+
+class GFocalHead_Tiny(nn.Module):
+    """Per-level head following Generalized Focal Loss V2: Learning Reliable
+    Localization Quality Estimation for Dense Object Detection.
+    """
+
+    def __init__(
+            self,
+            num_classes,
+            in_channels,  # indexable: one entry per stride level
+            stacked_convs=4,  # convs per branch and level
+            feat_channels=256,  # int, or a list with one entry per stride
+            reg_max=12,
+            reg_topk=4,
+            reg_channels=64,
+            strides=[8, 16, 32],
+            add_mean=True,
+            norm='gn',
+            act='relu',
+            start_kernel_size=3,
+            conv_groups=1,
+            conv_type='BaseConv',
+            simOTA_cls_weight=1.0,  # accepted but not used in this class
+            simOTA_iou_weight=3.0,  # accepted but not used in this class
+            octbase=8,  # accepted but not used in this class
+            simlqe=False,
+            **kwargs):  # extra keys are ignored
+        self.simlqe = simlqe
+        self.num_classes = num_classes
+        self.in_channels = in_channels
+        self.strides = strides
+        self.feat_channels = feat_channels if isinstance(feat_channels, list) \
+            else [feat_channels] * len(self.strides)
+
+        self.cls_out_channels = num_classes + 1  # +1 kept for consistency with former models
+        # and will be deprecated in future.
+        self.stacked_convs = stacked_convs
+        self.conv_groups = conv_groups
+        self.reg_max = reg_max
+        self.reg_topk = reg_topk
+        self.reg_channels = reg_channels
+        self.add_mean = add_mean
+        self.total_dim = reg_topk
+        self.start_kernel_size = start_kernel_size
+
+        self.norm = norm
+        self.act = act
+        self.conv_module = DWConv if conv_type == 'DWConv' else BaseConv
+
+        if add_mean:
+            self.total_dim += 1
+
+        super(GFocalHead_Tiny, self).__init__()
+        self.integral = Integral(self.reg_max)
+
+        self._init_layers()
+
+    def _build_not_shared_convs(self, in_channel, feat_channels):
+        """Build one level's cls convs, reg convs and quality branch."""
+        self.relu = nn.ReLU(inplace=True)
+        cls_convs = nn.ModuleList()
+        reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = feat_channels if i > 0 else in_channel
+            kernel_size = 3 if i > 0 else self.start_kernel_size
+            cls_convs.append(
+                self.conv_module(
+                    chn,
+                    feat_channels,
+                    kernel_size,
+                    stride=1,
+                    groups=self.conv_groups,
+                    norm=self.norm,
+                    act=self.act))
+            reg_convs.append(
+                self.conv_module(
+                    chn,
+                    feat_channels,
+                    kernel_size,
+                    stride=1,
+                    groups=self.conv_groups,
+                    norm=self.norm,
+                    act=self.act))
+        if not self.simlqe:  # input: top-k (+ mean) statistics per side
+            conf_vector = [nn.Conv2d(4 * self.total_dim, self.reg_channels, 1)]
+        else:  # input: all 4 * (reg_max + 1) regression channels
+            conf_vector = [
+                nn.Conv2d(4 * (self.reg_max + 1), self.reg_channels, 1)
+            ]
+        conf_vector += [self.relu]
+        conf_vector += [nn.Conv2d(self.reg_channels, 1, 1), nn.Sigmoid()]
+        reg_conf = nn.Sequential(*conf_vector)
+
+        return cls_convs, reg_convs, reg_conf
+
+    def _init_layers(self):
+        """Initialize layers of the head (one set of modules per stride)."""
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        self.reg_confs = nn.ModuleList()
+
+        for i in range(len(self.strides)):
+            cls_convs, reg_convs, reg_conf = self._build_not_shared_convs(
+                self.in_channels[i], self.feat_channels[i])
+            self.cls_convs.append(cls_convs)
+            self.reg_convs.append(reg_convs)
+            self.reg_confs.append(reg_conf)
+
+        self.gfl_cls = nn.ModuleList([
+            nn.Conv2d(
+                self.feat_channels[i], self.cls_out_channels, 3, padding=1)
+            for i in range(len(self.strides))
+        ])
+
+        self.gfl_reg = nn.ModuleList([
+            nn.Conv2d(
+                self.feat_channels[i], 4 * (self.reg_max + 1), 3, padding=1)
+            for i in range(len(self.strides))
+        ])
+
+        self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides])
+
+    def forward(self,
+                xin,
+                labels=None,
+                imgs=None,
+                conf_thre=0.05,
+                nms_thre=0.7):
+        """Run the head on per-level features; returns loss or decoded output."""
+        # prepare labels during training
+        b, c, h, w = xin[0].shape  # unpacked but not used below
+        if labels is not None:
+            gt_bbox_list = []
+            gt_cls_list = []
+            for label in labels:
+                gt_bbox_list.append(label.bbox)
+                gt_cls_list.append((label.get_field('labels')
+                                    - 1).long())  # labels starts from 1
+
+        # prepare priors for label assignment and bbox decode
+        mlvl_priors_list = [
+            self.get_single_level_center_priors(
+                xin[i].shape[0],
+                xin[i].shape[-2:],
+                stride,
+                dtype=torch.float32,
+                device=xin[0].device) for i, stride in enumerate(self.strides)
+        ]
+        mlvl_priors = torch.cat(mlvl_priors_list, dim=1)
+
+        # forward for bboxes and classification prediction
+        cls_scores, bbox_preds = multi_apply(
+            self.forward_single,
+            xin,
+            self.cls_convs,
+            self.reg_convs,
+            self.gfl_cls,
+            self.gfl_reg,
+            self.reg_confs,
+            self.scales,
+        )
+        flatten_cls_scores = torch.cat(cls_scores, dim=1)
+        flatten_bbox_preds = torch.cat(bbox_preds, dim=1)
+
+        # calculating losses or bboxes decoded
+        if self.training:  # NOTE(review): no self.loss defined in this class
+            loss = self.loss(flatten_cls_scores, flatten_bbox_preds,
+                             gt_bbox_list, gt_cls_list, mlvl_priors)
+            return loss
+        else:
+            output = self.get_bboxes(flatten_cls_scores, flatten_bbox_preds,
+                                     mlvl_priors)
+            return output
+
+    def forward_single(self, x, cls_convs, reg_convs, gfl_cls, gfl_reg,
+                       reg_conf, scale):
+        """Forward feature of a single scale level.
+        Returns flattened (N, H*W, C) class scores and bbox regression output.
+        """
+        cls_feat = x
+        reg_feat = x
+
+        for cls_conv in cls_convs:
+            cls_feat = cls_conv(cls_feat)
+        for reg_conv in reg_convs:
+            reg_feat = reg_conv(reg_feat)
+
+        bbox_pred = scale(gfl_reg(reg_feat)).float()
+        N, C, H, W = bbox_pred.size()
+        prob = F.softmax(
+            bbox_pred.reshape(N, 4, self.reg_max + 1, H, W), dim=2)
+        if not self.simlqe:
+            prob_topk, _ = prob.topk(self.reg_topk, dim=2)
+
+            if self.add_mean:
+                stat = torch.cat(
+                    [prob_topk, prob_topk.mean(dim=2, keepdim=True)], dim=2)
+            else:
+                stat = prob_topk
+
+            quality_score = reg_conf(stat.reshape(N, 4 * self.total_dim, H, W))
+        else:
+            quality_score = reg_conf(
+                bbox_pred.reshape(N, 4 * (self.reg_max + 1), H, W))
+
+        cls_score = gfl_cls(cls_feat).sigmoid() * quality_score
+
+        flatten_cls_score = cls_score.flatten(start_dim=2).transpose(1, 2)
+        flatten_bbox_pred = bbox_pred.flatten(start_dim=2).transpose(1, 2)
+        return flatten_cls_score, flatten_bbox_pred
+
+    def get_single_level_center_priors(self, batch_size, featmap_size, stride,
+                                       dtype, device):
+        """Return (batch_size, h*w, 4) priors: (x, y, stride, stride)."""
+        h, w = featmap_size
+        x_range = (torch.arange(0, int(w), dtype=dtype,
+                                device=device)) * stride
+        y_range = (torch.arange(0, int(h), dtype=dtype,
+                                device=device)) * stride
+
+        x = x_range.repeat(h, 1)
+        y = y_range.unsqueeze(-1).repeat(1, w)
+
+        y = y.flatten()
+        x = x.flatten()
+        strides = x.new_full((x.shape[0], ), stride)
+        priors = torch.stack([x, y, strides, strides], dim=-1)
+
+        return priors.unsqueeze(0).repeat(batch_size, 1, 1)
+
+    def sample(self, assign_result, gt_bboxes):
+        """Return positive / negative indices and the matched GT boxes."""
+        pos_inds = torch.nonzero(
+            assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique()
+        neg_inds = torch.nonzero(
+            assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique()
+        pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1
+        if gt_bboxes.numel() == 0:
+            # hack for index error case
+            assert pos_assigned_gt_inds.numel() == 0
+            pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4)
+        else:
+            if len(gt_bboxes.shape) < 2:
+                gt_bboxes = gt_bboxes.view(-1, 4)
+            pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :]
+
+        return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds
+
+    def get_bboxes(self,
+                   cls_preds,
+                   reg_preds,
+                   mlvl_center_priors,
+                   img_meta=None):
+        """Decode distances to xyxy boxes and append the class scores."""
+        dis_preds = self.integral(reg_preds) * mlvl_center_priors[..., 2, None]
+        bboxes = distance2bbox(mlvl_center_priors[..., :2], dis_preds)
+
+        res = torch.cat([bboxes, cls_preds[..., 0:self.num_classes]], dim=-1)
+
+        return res
diff --git a/modelscope/models/cv/tinynas_detection/neck/__init__.py b/modelscope/models/cv/tinynas_detection/neck/__init__.py
new file mode 100644
index 00000000..3c418c29
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/neck/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import copy
+
+from .giraffe_fpn import GiraffeNeck
+from .giraffe_fpn_v2 import GiraffeNeckV2
+
+
+def build_neck(cfg):
+    neck_cfg = copy.deepcopy(cfg)  # keep the caller's config untouched
+    name = neck_cfg.pop('name')  # remaining entries are constructor kwargs
+    necks = {'GiraffeNeck': GiraffeNeck, 'GiraffeNeckV2': GiraffeNeckV2}
+    if name not in necks:  # used to fall through and silently return None
+        raise NotImplementedError('unsupported neck: %s' % name)
+    return necks[name](**neck_cfg)
diff --git a/modelscope/models/cv/tinynas_detection/neck/giraffe_config.py b/modelscope/models/cv/tinynas_detection/neck/giraffe_config.py
new file mode 100644
index 00000000..289fdfd2
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/neck/giraffe_config.py
@@ -0,0 +1,235 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import collections
+import itertools
+import os
+
+import networkx as nx
+from omegaconf import OmegaConf
+
+Node = collections.namedtuple('Node', ['id', 'inputs', 'type'])
+
+
+def get_graph_info(graph):
+    """Return (nodes, input_nodes, output_nodes) for `graph`: a node below
+    all its neighbours is an input (type 0), above all an output (type 1).
+    """
+    records, sources, sinks = [], [], []
+    for node in range(graph.number_of_nodes()):
+        neighbours = sorted(graph.neighbors(node))
+        kind = -1
+        if node < neighbours[0]:
+            sources.append(node)
+            kind = 0
+        if node > neighbours[-1]:
+            sinks.append(node)
+            kind = 1
+        records.append(Node(node, [n for n in neighbours if n < node], kind))
+    return records, sources, sinks
+
+
+def nodeid_trans(id, cur_level, num_levels):
+    """Translate graph node `id` at level `cur_level` into a flat node index
+    (cur_level plus an id-dependent gap).
+    """
+    gap = ((id + 1) // 2) * num_levels * 2
+    if id % 2 != 1:
+        gap = (num_levels - cur_level) * 2 - 1 + gap
+    return cur_level + int(gap)
+
+
+def gen_log2n_graph_file(log2n_graph_file, depth_multiplier):
+    """Write the log2n skip edges as 'src,dst' lines to `log2n_graph_file`."""
+    # The context manager closes the file even if a write fails.
+    with open(log2n_graph_file, 'w') as f:
+        for i in range(depth_multiplier):
+            for j in [1, 2, 4, 8, 16, 32]:
+                if i - j < 0:
+                    break
+                f.write('%d,%d\n' % (i - j, i))
+
+
+def get_log2n_graph(depth_multiplier):
+    """Build the log2n skip graph over nodes 0..depth_multiplier-1.
+
+    Returns (nodes, connections): node i receives an edge (i - j, i) for
+    every hop j in 1, 2, 4, 8, 16, 32 that does not reach below node 0.
+    """
+    hops = (1, 2, 4, 8, 16, 32)
+    nodes = list(range(depth_multiplier))
+    connections = [
+        (dst - hop, dst) for dst in nodes for hop in hops if hop <= dst
+    ]
+    return nodes, connections
+
+
+def get_dense_graph(depth_multiplier):
+    """Build the dense skip graph over nodes 0..depth_multiplier-1.
+
+    Returns (nodes, connections) where every node i is fed by every earlier
+    node j < i, listed as (j, i) grouped by i.
+    """
+    nodes = list(range(depth_multiplier))
+    connections = [(src, dst) for dst in nodes for src in range(dst)]
+    return nodes, connections
+
+
+def giraffeneck_config(min_level,
+                       max_level,
+                       weight_method=None,
+                       depth_multiplier=5,
+                       with_backslash=False,
+                       with_slash=False,
+                       with_skip_connect=False,
+                       skip_connect_type='dense'):
+    """Build the per-node fusion config used to assemble the Giraffe neck.
+
+    A skip-connection graph (``dense`` or ``log2n``) built from
+    ``depth_multiplier`` is replicated for every pyramid level; each resulting
+    node gets an entry listing the ids of the nodes that feed it.
+
+    Args:
+        min_level (int): lowest pyramid level (reduction ``1 << level``).
+        max_level (int): highest pyramid level, inclusive.
+        weight_method (str, optional): fusion method stored on every node;
+            falls back to ``'fastattn'`` when falsy.
+        depth_multiplier (int): size argument of the graph builder.
+        with_backslash (bool): add the backslash-rule input when it exists.
+        with_slash (bool): add the slash-rule input when it exists.
+        with_skip_connect (bool): add the inputs recorded on each graph node.
+        skip_connect_type (str): ``'dense'`` or ``'log2n'``.
+
+    Returns:
+        dict: node id -> dict with ``reduction``, ``inputs_offsets``,
+        ``weight_method`` and ``is_out``, sorted by node id.
+
+    Raises:
+        ValueError: if ``skip_connect_type`` is not a supported graph type.
+    """
+    if skip_connect_type == 'dense':
+        nodes, connections = get_dense_graph(depth_multiplier)
+    elif skip_connect_type == 'log2n':
+        nodes, connections = get_log2n_graph(depth_multiplier)
+    else:
+        # previously fell through and crashed later on an unbound `nodes`
+        raise ValueError(
+            'unknown skip_connect_type {}'.format(skip_connect_type))
+    graph = nx.Graph()
+    graph.add_nodes_from(nodes)
+    graph.add_edges_from(connections)
+
+    drop_node = []
+    nodes, input_nodes, output_nodes = get_graph_info(graph)
+
+    weight_method = weight_method or 'fastattn'
+
+    num_levels = max_level - min_level + 1
+    node_ids = {min_level + i: [i] for i in range(num_levels)}
+
+    pnodes = {}
+
+    def update_drop_node(new_id, input_offsets):
+        # walk back past dropped nodes, inheriting their surviving inputs
+        while new_id in drop_node:
+            if new_id in pnodes:
+                for n in pnodes[new_id]['inputs_offsets']:
+                    if n not in input_offsets and n not in drop_node:
+                        input_offsets.append(n)
+            new_id = new_id - 1
+        if new_id not in input_offsets:
+            input_offsets.append(new_id)
+
+    def cal_diag_node(node_id, first_parity):
+        """Return the slash/backslash input id for ``node_id``, or -1.
+
+        ``first_parity`` is 0 for the backslash rule and 1 for the slash rule.
+        """
+        ind, mod = divmod(node_id, num_levels)
+        if ind % 2 == first_parity:
+            if mod == (num_levels - 1):
+                return -1
+            return (ind - 1) * num_levels + (num_levels - 1 - mod - 1)
+        if mod == 0:
+            return -1
+        return (ind - 1) * num_levels + (num_levels - 1 - mod + 1)
+
+    # top-down layer
+    for i in range(max_level, min_level - 1, -1):
+        for node_idx, node in enumerate(nodes):
+            input_offsets = []
+            if node_idx in input_nodes:
+                input_offsets.append(node_ids[i][0])
+            else:
+                if with_skip_connect:
+                    for input_id in node.inputs:
+                        new_id = nodeid_trans(input_id, i - min_level,
+                                              num_levels)
+                        update_drop_node(new_id, input_offsets)
+
+            # add top2down
+            new_id = nodeid_trans(node_idx, i - min_level, num_levels)
+
+            # add last node
+            last = new_id - 1
+            update_drop_node(last, input_offsets)
+
+            if with_backslash:
+                backslash = cal_diag_node(new_id, 0)
+                if backslash != -1 and backslash not in input_offsets:
+                    input_offsets.append(backslash)
+
+            if with_slash:
+                slash = cal_diag_node(new_id, 1)
+                if slash != -1 and slash not in input_offsets:
+                    input_offsets.append(slash)
+
+            if new_id in drop_node:
+                input_offsets = []
+
+            pnodes[new_id] = {
+                'reduction': 1 << i,
+                'inputs_offsets': input_offsets,
+                'weight_method': weight_method,
+                'is_out': 0,
+            }
+
+        # per-level output node, fed by the graph's output nodes
+        input_offsets = []
+        for out_id in output_nodes:
+            new_id = nodeid_trans(out_id, i - min_level, num_levels)
+            input_offsets.append(new_id)
+
+        pnodes[node_ids[i][0] + num_levels * (len(nodes) + 1)] = {
+            'reduction': 1 << i,
+            'inputs_offsets': input_offsets,
+            'weight_method': weight_method,
+            'is_out': 1,
+        }
+
+    pnodes = dict(sorted(pnodes.items(), key=lambda x: x[0]))
+    return pnodes
+
+
+def get_graph_config(fpn_name,
+                     min_level=3,
+                     max_level=7,
+                     weight_method='concat',
+                     depth_multiplier=5,
+                     with_backslash=False,
+                     with_slash=False,
+                     with_skip_connect=False,
+                     skip_connect_type='dense'):
+    """Return the graph config registered under ``fpn_name``.
+
+    The ``'giraffeneck'`` config is built eagerly from the given arguments
+    (all forwarded unchanged to ``giraffeneck_config``) before the lookup, so
+    any other ``fpn_name`` raises ``KeyError`` only after that build.
+    """
+    # positional order mirrors the signature of giraffeneck_config
+    giraffe_cfg = giraffeneck_config(min_level, max_level, weight_method,
+                                     depth_multiplier, with_backslash,
+                                     with_slash, with_skip_connect,
+                                     skip_connect_type)
+    name_to_config = {'giraffeneck': giraffe_cfg}
+    return name_to_config[fpn_name]
diff --git a/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py b/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py
new file mode 100644
index 00000000..b7087779
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py
@@ -0,0 +1,661 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import logging
+import math
+from collections import OrderedDict
+from functools import partial
+from typing import Callable, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from timm import create_model
+from timm.models.layers import (Swish, create_conv2d, create_pool2d,
+                                get_act_layer)
+
+from ..core.base_ops import CSPLayer, ShuffleBlock, ShuffleCSPLayer
+from .giraffe_config import get_graph_config
+
+_ACT_LAYER = Swish
+
+
+class SequentialList(nn.Sequential):
+    """Sequential container typed list -> list (torchscript workaround)."""
+
+    def __init__(self, *args):
+        super().__init__(*args)
+
+    def forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]:
+        for stage in self:
+            x = stage(x)
+        return x
+
+
+class ConvBnAct2d(nn.Module):
+    """Convolution followed by optional normalization and activation.
+
+    Passing ``None`` as ``norm_layer`` or ``act_layer`` skips that stage.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 dilation=1,
+                 padding='',
+                 bias=False,
+                 norm_layer=nn.BatchNorm2d,
+                 act_layer=_ACT_LAYER):
+        super().__init__()
+
+        conv_opts = dict(
+            stride=stride, dilation=dilation, padding=padding, bias=bias)
+        self.conv = create_conv2d(in_channels, out_channels, kernel_size,
+                                  **conv_opts)
+        self.bn = norm_layer(out_channels) if norm_layer is not None else None
+        self.act = act_layer(inplace=True) if act_layer is not None else None
+
+    def forward(self, x):
+        out = self.conv(x)
+        if self.bn is not None:
+            out = self.bn(out)
+        if self.act is not None:
+            out = self.act(out)
+        return out
+
+
+class SeparableConv2d(nn.Module):
+    """Depthwise conv (``conv_dw``) followed by ``conv_pw``, norm and act.
+
+    The depthwise stage outputs ``int(in_channels * channel_multiplier)``
+    channels; ``bias`` is only passed to ``conv_pw``.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=3,
+                 stride=1,
+                 dilation=1,
+                 padding='',
+                 bias=False,
+                 channel_multiplier=1.0,
+                 pw_kernel_size=1,
+                 norm_layer=nn.BatchNorm2d,
+                 act_layer=_ACT_LAYER):
+        super().__init__()
+        mid_channels = int(in_channels * channel_multiplier)
+
+        self.conv_dw = create_conv2d(
+            in_channels,
+            mid_channels,
+            kernel_size,
+            stride=stride,
+            dilation=dilation,
+            padding=padding,
+            depthwise=True)
+        self.conv_pw = create_conv2d(
+            mid_channels, out_channels, pw_kernel_size, padding=padding,
+            bias=bias)
+
+        self.bn = norm_layer(out_channels) if norm_layer is not None else None
+        self.act = act_layer(inplace=True) if act_layer is not None else None
+
+    def forward(self, x):
+        out = self.conv_pw(self.conv_dw(x))
+        if self.bn is not None:
+            out = self.bn(out)
+        if self.act is not None:
+            out = self.act(out)
+        return out
+
+
+def _init_weight(
+    m,
+    n='',
+):
+    """Weight initialization as per TensorFlow official implementations.
+
+    Args:
+        m (nn.Module): module to initialize; only ``SeparableConv2d``,
+            ``ConvBnAct2d`` and ``nn.BatchNorm2d`` instances are modified.
+        n (str): module name; names containing ``box_net`` or
+            ``class_net`` select the head-specific scheme.
+    """
+
+    def _fan_in_out(w, groups=1):
+        if w.dim() < 2:
+            raise ValueError(
+                'Fan in and fan out can not be computed for tensor with fewer than 2 dimensions'
+            )
+        receptive_field_size = w[0][0].numel() if w.dim() > 2 else 1
+        fan_in = w.size(1) * receptive_field_size
+        fan_out = (w.size(0) * receptive_field_size) // groups
+        return fan_in, fan_out
+
+    def _glorot_uniform(w, gain=1, groups=1):
+        # uniform in [-limit, limit], limit from the average of both fans
+        fan_in, fan_out = _fan_in_out(w, groups)
+        fan_avg = (fan_in + fan_out) / 2.
+        limit = math.sqrt(3.0 * (gain / max(1., fan_avg)))
+        w.data.uniform_(-limit, limit)
+
+    def _variance_scaling(w, gain=1, groups=1):
+        # zero-mean normal with std sqrt(gain / max(1, fan_in))
+        fan_in, _ = _fan_in_out(w, groups)
+        w.data.normal_(std=math.sqrt(gain / max(1., fan_in)))
+
+    def _is_head():
+        return 'box_net' in n or 'class_net' in n
+
+    def _init_bias(bias, head):
+        if bias is None:
+            return
+        if head and 'class_net.predict' in n:
+            # class-prediction bias: -log((1 - p) / p) with p = 0.01
+            bias.data.fill_(-math.log((1 - 0.01) / 0.01))
+        else:
+            bias.data.zero_()
+
+    if isinstance(m, SeparableConv2d):
+        head = _is_head()
+        init_fn = _variance_scaling if head else _glorot_uniform
+        # conv_dw weight first, then conv_pw (same order of random draws)
+        init_fn(m.conv_dw.weight, groups=m.conv_dw.groups)
+        init_fn(m.conv_pw.weight)
+        _init_bias(m.conv_pw.bias, head)
+    elif isinstance(m, ConvBnAct2d):
+        head = _is_head()
+        if head:
+            m.conv.weight.data.normal_(std=.01)
+        else:
+            _glorot_uniform(m.conv.weight)
+        _init_bias(m.conv.bias, head)
+    elif isinstance(m, nn.BatchNorm2d):
+        m.weight.data.fill_(1.0)
+        m.bias.data.zero_()
+
+
+def _init_weight_alt(
+    m,
+    n='',
+):
+    """Alternative init: EfficientNet backbone init w/ class bias addition.
+
+    NOTE: this will likely be removed after some experimentation
+    """
+    if isinstance(m, nn.Conv2d):
+        k_h, k_w = m.kernel_size[0], m.kernel_size[1]
+        fan_out = (k_h * k_w * m.out_channels) // m.groups
+        m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+        if m.bias is not None and 'class_net.predict' in n:
+            m.bias.data.fill_(-math.log((1 - 0.01) / 0.01))
+        elif m.bias is not None:
+            m.bias.data.zero_()
+    elif isinstance(m, nn.BatchNorm2d):
+        m.weight.data.fill_(1.0)
+        m.bias.data.zero_()
+
+
+class Interpolate2d(nn.Module):
+    r"""Module wrapper around ``F.interpolate`` for resampling feature maps.
+
+    The input data is assumed to be of the form
+    `minibatch x channels x [optional depth] x [optional height] x width`.
+    Hence, for spatial inputs, we expect a 4D Tensor and for volumetric
+    inputs, we expect a 5D Tensor.
+
+    The algorithms available for upsampling are nearest neighbor and linear,
+    bilinear, bicubic and trilinear for 3D, 4D and 5D input Tensor,
+    respectively.
+
+    Give either :attr:`scale_factor` or the target output :attr:`size`, not
+    both (that would be ambiguous).
+
+    Args:
+        size (int or Tuple[int, int], optional): output spatial size.
+        scale_factor (float or Tuple[float, float], optional): multiplier for
+            the spatial size; stored as float(s), a falsy scalar becomes
+            ``None``.
+        mode (str, optional): the upsampling algorithm: one of ``'nearest'``,
+            ``'linear'``, ``'bilinear'``, ``'bicubic'`` and ``'trilinear'``.
+            Default: ``'nearest'``
+        align_corners (bool, optional): forwarded to ``F.interpolate``; forced
+            to ``None`` when :attr:`mode` is ``'nearest'``. Default: ``False``
+    """
+    __constants__ = ['size', 'scale_factor', 'mode', 'align_corners', 'name']
+    name: str
+    size: Optional[Union[int, Tuple[int, int]]]
+    scale_factor: Optional[Union[float, Tuple[float, float]]]
+    mode: str
+    align_corners: Optional[bool]
+
+    def __init__(self,
+                 size: Optional[Union[int, Tuple[int, int]]] = None,
+                 scale_factor: Optional[Union[float, Tuple[float,
+                                                           float]]] = None,
+                 mode: str = 'nearest',
+                 align_corners: bool = False) -> None:
+        super().__init__()
+        self.name = type(self).__name__
+        self.size = size
+        if isinstance(scale_factor, tuple):
+            scale_factor = tuple(float(factor) for factor in scale_factor)
+        else:
+            scale_factor = float(scale_factor) if scale_factor else None
+        self.scale_factor = scale_factor
+        self.mode = mode
+        self.align_corners = align_corners if mode != 'nearest' else None
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return F.interpolate(
+            input,
+            size=self.size,
+            scale_factor=self.scale_factor,
+            mode=self.mode,
+            align_corners=self.align_corners,
+            recompute_scale_factor=False)
+
+
+class ResampleFeatureMap(nn.Sequential):
+    """Optional 1x1 conv plus an optional down- or up-sampling stage.
+
+    ``conv`` is added only when ``in_channels != out_channels``,
+    ``downsample`` when ``reduction_ratio > 1`` and ``upsample`` when
+    ``reduction_ratio < 1``.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 reduction_ratio=1.,
+                 pad_type='',
+                 downsample=None,
+                 upsample=None,
+                 norm_layer=nn.BatchNorm2d,
+                 apply_bn=False,
+                 conv_after_downsample=False,
+                 redundant_bias=False):
+        super().__init__()
+        downsample = downsample or 'max'
+        upsample = upsample or 'nearest'
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.reduction_ratio = reduction_ratio
+        self.conv_after_downsample = conv_after_downsample
+
+        conv = None
+        if in_channels != out_channels:
+            conv = ConvBnAct2d(
+                in_channels, out_channels, kernel_size=1, padding=pad_type,
+                norm_layer=norm_layer if apply_bn else None,
+                bias=not apply_bn or redundant_bias, act_layer=None)
+
+        stages = []
+        if reduction_ratio > 1:
+            if downsample in ('max', 'avg'):
+                stride_size = int(reduction_ratio)
+                resample = create_pool2d(
+                    downsample,
+                    kernel_size=stride_size + 1,
+                    stride=stride_size,
+                    padding=pad_type)
+            else:
+                resample = Interpolate2d(
+                    scale_factor=1. / reduction_ratio, mode=downsample)
+            stages.append(('downsample', resample))
+        elif reduction_ratio < 1:
+            scale = int(1 // reduction_ratio)
+            stages.append(
+                ('upsample', Interpolate2d(scale_factor=scale, mode=upsample)))
+        if conv is not None:
+            conv_last = reduction_ratio > 1 and self.conv_after_downsample
+            stages.insert(len(stages) if conv_last else 0, ('conv', conv))
+        for name, module in stages:
+            self.add_module(name, module)
+
+
+class GiraffeCombine(nn.Module):
+    """Resample a node's inputs to the node's resolution and fuse them.
+
+    ``forward`` returns ``None`` when ``inputs_offsets`` is empty.
+
+    Args:
+        feature_info: indexable of dicts with ``num_chs`` and ``reduction``;
+            entry 0 provides the base reduction.
+        fpn_config: node id -> node config; only ``reduction`` is read here.
+        fpn_channels: per-level channel counts, indexed by
+            ``log2(target_reduction // base_reduction)``.
+        inputs_offsets: indices into the feature list given to ``forward``.
+        target_reduction: reduction of the node being built.
+        weight_method: ``'attn'`` (softmax weights), ``'fastattn'`` (ReLU
+            weights divided by their sum), ``'sum'`` or ``'concat'``.
+        The remaining arguments are forwarded to ``ResampleFeatureMap``.
+    """
+
+    def __init__(self,
+                 feature_info,
+                 fpn_config,
+                 fpn_channels,
+                 inputs_offsets,
+                 target_reduction,
+                 pad_type='',
+                 downsample=None,
+                 upsample=None,
+                 norm_layer=nn.BatchNorm2d,
+                 apply_resample_bn=False,
+                 conv_after_downsample=False,
+                 redundant_bias=False,
+                 weight_method='attn'):
+        super().__init__()
+        self.inputs_offsets = inputs_offsets
+        self.weight_method = weight_method
+
+        self.resample = nn.ModuleDict()
+        reduction_base = feature_info[0]['reduction']
+
+        target_channels_idx = int(
+            math.log(target_reduction // reduction_base, 2))
+        for offset in inputs_offsets:
+            if offset < len(feature_info):
+                in_channels = feature_info[offset]['num_chs']
+                input_reduction = feature_info[offset]['reduction']
+            else:
+                input_reduction = fpn_config[offset]['reduction']
+                # channel count is looked up by the level of the reduction
+                input_channels_idx = int(
+                    math.log(input_reduction // reduction_base, 2))
+                in_channels = feature_info[input_channels_idx]['num_chs']
+
+            reduction_ratio = target_reduction / input_reduction
+            # 'concat' keeps each input's own width; every other method
+            # resamples to the target level's width
+            if weight_method == 'concat':
+                resample_out = in_channels
+            else:
+                resample_out = fpn_channels[target_channels_idx]
+            self.resample[str(offset)] = ResampleFeatureMap(
+                in_channels,
+                resample_out,
+                reduction_ratio=reduction_ratio,
+                pad_type=pad_type,
+                downsample=downsample,
+                upsample=upsample,
+                norm_layer=norm_layer,
+                apply_bn=apply_resample_bn,
+                conv_after_downsample=conv_after_downsample,
+                redundant_bias=redundant_bias)
+
+        if weight_method in ('attn', 'fastattn'):
+            self.edge_weights = nn.Parameter(
+                torch.ones(len(inputs_offsets)), requires_grad=True)  # WSM
+        else:
+            self.edge_weights = None
+
+    def forward(self, x: List[torch.Tensor]):
+        dtype = x[0].dtype
+        if len(self.inputs_offsets) == 0:
+            return None
+        nodes = []
+        for offset, resample in zip(self.inputs_offsets,
+                                    self.resample.values()):
+            nodes.append(resample(x[offset]))
+
+        if self.weight_method == 'attn':
+            normalized_weights = torch.softmax(
+                self.edge_weights.to(dtype=dtype), dim=0)
+            stacked = torch.stack(nodes, dim=-1) * normalized_weights
+            return torch.sum(stacked, dim=-1)
+        if self.weight_method == 'fastattn':
+            edge_weights = nn.functional.relu(
+                self.edge_weights.to(dtype=dtype))
+            weights_norm = torch.sum(edge_weights) + 0.0001
+            scaled = [(node * edge_weights[i]) / weights_norm
+                      for i, node in enumerate(nodes)]
+            return torch.sum(torch.stack(scaled, dim=-1), dim=-1)
+        if self.weight_method == 'sum':
+            return torch.sum(torch.stack(nodes, dim=-1), dim=-1)
+        if self.weight_method == 'concat':
+            return torch.cat(nodes, dim=1)
+        raise ValueError('unknown weight_method {}'.format(
+            self.weight_method))
+
+
+class GiraffeNode(nn.Module):
+    """Apply ``combine`` to a feature list, then ``after_combine``.
+
+    ``forward`` returns ``None`` when ``combine`` yields ``None``.
+    """
+
+    def __init__(self, combine: nn.Module, after_combine: nn.Module):
+        super().__init__()
+        self.combine = combine
+        self.after_combine = after_combine
+
+    def forward(self, x: List[torch.Tensor]) -> Optional[torch.Tensor]:
+        fused = self.combine(x)
+        if fused is None:
+            return None
+        return self.after_combine(fused)
+
+
+class GiraffeLayer(nn.Module):
+    """Chain of ``GiraffeNode`` modules, one per ``fpn_config`` entry.
+
+    ``forward`` appends each node's output to the running feature list, so
+    later nodes can consume earlier ones, and returns the last ``num_levels``
+    entries. After construction ``feature_info`` is a list of
+    ``dict(num_chs=..., reduction=...)`` for the last ``num_levels`` entries.
+    """
+
+    def __init__(self,
+                 feature_info,
+                 fpn_config,
+                 inner_fpn_channels,
+                 outer_fpn_channels,
+                 num_levels=5,
+                 pad_type='',
+                 downsample=None,
+                 upsample=None,
+                 norm_layer=nn.BatchNorm2d,
+                 act_layer=_ACT_LAYER,
+                 apply_resample_bn=False,
+                 conv_after_downsample=True,
+                 conv_bn_relu_pattern=False,
+                 separable_conv=True,
+                 redundant_bias=False,
+                 merge_type='conv'):
+        super(GiraffeLayer, self).__init__()
+        self.num_levels = num_levels
+        self.conv_bn_relu_pattern = False
+
+        self.feature_info = dict(enumerate(feature_info))
+
+        self.fnode = nn.ModuleList()
+        reduction_base = feature_info[0]['reduction']
+        for i, fnode_cfg in fpn_config.items():
+            logging.debug('fnode {} : {}'.format(i, fnode_cfg))
+
+            fpn_channels = (outer_fpn_channels if fnode_cfg['is_out'] == 1
+                            else inner_fpn_channels)
+
+            reduction = fnode_cfg['reduction']
+            fpn_channels_idx = int(math.log(reduction // reduction_base, 2))
+            combine = GiraffeCombine(
+                self.feature_info,
+                fpn_config,
+                fpn_channels,
+                tuple(fnode_cfg['inputs_offsets']),
+                target_reduction=reduction,
+                pad_type=pad_type,
+                downsample=downsample,
+                upsample=upsample,
+                norm_layer=norm_layer,
+                apply_resample_bn=apply_resample_bn,
+                conv_after_downsample=conv_after_downsample,
+                redundant_bias=redundant_bias,
+                weight_method=fnode_cfg['weight_method'])
+
+            after_combine = nn.Sequential()
+
+            # input width of after_combine: sum of the inputs' channel counts
+            in_channels = sum(self.feature_info[input_offset]['num_chs']
+                              for input_offset in fnode_cfg['inputs_offsets'])
+            out_channels = fpn_channels[fpn_channels_idx]
+
+            if merge_type == 'csp':
+                after_combine.add_module(
+                    'CspLayer',
+                    CSPLayer(
+                        in_channels,
+                        out_channels,
+                        2,
+                        shortcut=True,
+                        depthwise=False,
+                        act='silu'))
+            elif merge_type == 'shuffle':
+                after_combine.add_module(
+                    'shuffleBlock', ShuffleBlock(in_channels, in_channels))
+                after_combine.add_module(
+                    'conv1x1',
+                    create_conv2d(in_channels, out_channels, kernel_size=1))
+            elif merge_type == 'conv':
+                after_combine.add_module(
+                    'conv1x1',
+                    create_conv2d(in_channels, out_channels, kernel_size=1))
+                conv_kwargs = dict(
+                    in_channels=out_channels,
+                    out_channels=out_channels,
+                    kernel_size=3,
+                    padding=pad_type,
+                    bias=False,
+                    norm_layer=norm_layer,
+                    act_layer=act_layer)
+                if not conv_bn_relu_pattern:
+                    conv_kwargs['bias'] = redundant_bias
+                    conv_kwargs['act_layer'] = None
+                    after_combine.add_module('act', act_layer(inplace=True))
+                after_combine.add_module(
+                    'conv',
+                    SeparableConv2d(**conv_kwargs)
+                    if separable_conv else ConvBnAct2d(**conv_kwargs))
+
+            self.fnode.append(
+                GiraffeNode(combine=combine, after_combine=after_combine))
+            self.feature_info[i] = dict(
+                num_chs=fpn_channels[fpn_channels_idx], reduction=reduction)
+
+        out_node = list(self.feature_info.keys())[-num_levels::]
+        self.out_feature_info = [self.feature_info[k] for k in out_node]
+
+        self.feature_info = self.out_feature_info
+
+    def forward(self, x: List[torch.Tensor]):
+        feats = list(x)  # work on a copy; the caller's list stays untouched
+        for fn in self.fnode:
+            feats.append(fn(feats))
+        return feats[-self.num_levels::]
+
+
+class GiraffeNeck(nn.Module):
+    """Giraffe feature-pyramid neck built from a single ``GiraffeLayer``.
+
+    ``forward`` picks the ``in_features`` inputs, appends downsampled levels
+    until there are ``num_levels`` maps, and returns the fused outputs.
+    """
+
+    def __init__(self, min_level, max_level, num_levels, norm_layer,
+                 norm_kwargs, act_type, fpn_config, fpn_name, fpn_channels,
+                 out_fpn_channels, weight_method, depth_multiplier,
+                 width_multiplier, with_backslash, with_slash,
+                 with_skip_connect, skip_connect_type, separable_conv,
+                 feature_info, merge_type, pad_type, downsample_type,
+                 upsample_type, apply_resample_bn, conv_after_downsample,
+                 redundant_bias, conv_bn_relu_pattern, alternate_init):
+        super(GiraffeNeck, self).__init__()
+
+        self.num_levels = num_levels
+        self.min_level = min_level
+        self.in_features = [0, 1, 2, 3, 4, 5,
+                            6][self.min_level - 1:self.min_level - 1
+                               + num_levels]
+        self.alternate_init = alternate_init
+        norm_layer = norm_layer or nn.BatchNorm2d
+        if norm_kwargs:
+            norm_layer = partial(norm_layer, **norm_kwargs)
+        act_layer = get_act_layer(act_type) or _ACT_LAYER
+        fpn_config = fpn_config or get_graph_config(
+            fpn_name,
+            min_level=min_level,
+            max_level=max_level,
+            weight_method=weight_method,
+            depth_multiplier=depth_multiplier,
+            with_backslash=with_backslash,
+            with_slash=with_slash,
+            with_skip_connect=with_skip_connect,
+            skip_connect_type=skip_connect_type)
+
+        # width scale on a copy, so the caller's list is not rescaled in place
+        fpn_channels = [int(ch * width_multiplier) for ch in fpn_channels]
+        feature_info = list(feature_info)  # extra levels are appended below
+
+        self.resample = nn.ModuleDict()
+        for level in range(num_levels):
+            if level < len(feature_info):
+                in_chs = feature_info[level]['num_chs']
+                reduction = feature_info[level]['reduction']
+            else:
+                # Adds a coarser level by downsampling the last feature map
+                reduction_ratio = 2
+                self.resample[str(level)] = ResampleFeatureMap(
+                    in_channels=in_chs,
+                    out_channels=feature_info[level - 1]['num_chs'],
+                    pad_type=pad_type,
+                    downsample=downsample_type,
+                    upsample=upsample_type,
+                    norm_layer=norm_layer,
+                    reduction_ratio=reduction_ratio,
+                    apply_bn=apply_resample_bn,
+                    conv_after_downsample=conv_after_downsample,
+                    redundant_bias=redundant_bias,
+                )
+                in_chs = feature_info[level - 1]['num_chs']
+                reduction = int(reduction * reduction_ratio)
+                feature_info.append(dict(num_chs=in_chs, reduction=reduction))
+
+        self.cell = SequentialList()
+        logging.debug('building giraffeNeck')
+        giraffe_layer = GiraffeLayer(
+            feature_info=feature_info,
+            fpn_config=fpn_config,
+            inner_fpn_channels=fpn_channels,
+            outer_fpn_channels=out_fpn_channels,
+            num_levels=num_levels,
+            pad_type=pad_type,
+            downsample=downsample_type,
+            upsample=upsample_type,
+            norm_layer=norm_layer,
+            act_layer=act_layer,
+            separable_conv=separable_conv,
+            apply_resample_bn=apply_resample_bn,
+            conv_after_downsample=conv_after_downsample,
+            conv_bn_relu_pattern=conv_bn_relu_pattern,
+            redundant_bias=redundant_bias,
+            merge_type=merge_type)
+        self.cell.add_module('giraffeNeck', giraffe_layer)
+
+    def init_weights(self, pretrained=False):
+        init_fn = _init_weight_alt if self.alternate_init else _init_weight
+        for n, m in self.named_modules():
+            if 'backbone' not in n:
+                init_fn(m, n)
+
+    def forward(self, x: List[torch.Tensor]):
+        x = [x[f] for f in self.in_features]
+        for resample in self.resample.values():
+            x.append(resample(x[-1]))
+        x = self.cell(x)
+        return x
diff --git a/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py b/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py
new file mode 100644
index 00000000..b710572f
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py
@@ -0,0 +1,203 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import torch
+import torch.nn as nn
+
+from ..core.base_ops import BaseConv, CSPLayer, DWConv
+from ..core.neck_ops import CSPStage
+
+
+class GiraffeNeckV2(nn.Module):
+
+    def __init__(
+        self,
+        depth=1.0,
+        width=1.0,
+        in_features=[2, 3, 4],
+        in_channels=[256, 512, 1024],
+        out_channels=[256, 512, 1024],
+        depthwise=False,
+        act='silu',
+        spp=True,
+        reparam_mode=True,
+        block_name='BasicBlock',
+    ):
+        super().__init__()
+        self.in_features = in_features
+        self.in_channels = in_channels
+        Conv = DWConv if depthwise else BaseConv
+
+        reparam_mode = reparam_mode
+
+        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
+
+        # node x3: input x0, x1
+        self.bu_conv13 = Conv(
+            int(in_channels[1] * width),
+            int(in_channels[1] * width),
+            3,
+            2,
+            act=act)
+        if reparam_mode:
+            self.merge_3 = CSPStage(
+                block_name,
+                int((in_channels[1] + in_channels[2]) * width),
+                int(in_channels[2] * width),
+                round(3 * depth),
+                act=act,
+                spp=spp)
+        else:
+            self.merge_3 = CSPLayer(
+                int((in_channels[1] + in_channels[2]) * width),
+                int(in_channels[2] * width),
+                round(3 * depth),
+                False,
+                depthwise=depthwise,
+                act=act)
+
+        # node x4: input x1, x2, x3
+        self.bu_conv24 = Conv(
+            int(in_channels[0] * width),
+            int(in_channels[0] * width),
+            3,
+            2,
+            act=act)
+        if reparam_mode:
+            self.merge_4 = CSPStage(
+                block_name,
+                int((in_channels[0] + in_channels[1] + in_channels[2])
+                    * width),
+                int(in_channels[1] * width),
+                round(3 * depth),
+                act=act,
+                spp=spp)
+        else:
+            self.merge_4 = CSPLayer(
+                int((in_channels[0] + in_channels[1] + in_channels[2])
+                    * width),
+                int(in_channels[1] * width),
+                round(3 * depth),
+                False,
+                depthwise=depthwise,
+                act=act)
+
+        # node x5: input x2, x4
+        if reparam_mode:
+            self.merge_5 = CSPStage(
+                block_name,
+                int((in_channels[1] + in_channels[0]) * width),
+                int(out_channels[0] * width),
+                round(3 * depth),
+                act=act,
+                spp=spp)
+        else:
+            self.merge_5 = CSPLayer(
+                int((in_channels[1] + in_channels[0]) * width),
+                int(out_channels[0] * width),
+                round(3 * depth),
+                False,
+                depthwise=depthwise,
+                act=act)
+
+        # node x7: input x4, x5
+        self.bu_conv57 = Conv(
+            int(out_channels[0] * width),
+            int(out_channels[0] * width),
+            3,
+            2,
+            act=act)
+        if reparam_mode:
+            self.merge_7 = CSPStage(
+                block_name,
+                int((out_channels[0] + in_channels[1]) * width),
+                int(out_channels[1] * width),
+                round(3 * depth),
+                act=act,
+                spp=spp)
+        else:
+            self.merge_7 = CSPLayer(
+                int((out_channels[0] + in_channels[1]) * width),
+                int(out_channels[1] * width),
+                round(3 * depth),
+                False,
+                depthwise=depthwise,
+                act=act)
+
+        # node x6: input x3, x4, x7
+        self.bu_conv46 = Conv(
+            int(in_channels[1] * width),
+            int(in_channels[1] * width),
+            3,
+            2,
+            act=act)
+        self.bu_conv76 = Conv(
+            int(out_channels[1] * width),
+            int(out_channels[1] * width),
+            3,
+            2,
+            act=act)
+        if reparam_mode:
+            self.merge_6 = CSPStage(
+                block_name,
+                int((in_channels[1] + out_channels[1] + in_channels[2])
+                    * width),
+                int(out_channels[2] * width),
+                round(3 * depth),
+                act=act,
+                spp=spp)
+        else:
+            self.merge_6 = CSPLayer(
+                int((in_channels[1] + out_channels[1] + in_channels[2])
+                    * width),
+                int(out_channels[2] * width),
+                round(3 * depth),
+                False,
+                depthwise=depthwise,
+                act=act)
+
+    def init_weights(self):
+        pass
+
+    def forward(self, out_features):
+        """
+        Args:
+            out_features: backbone outputs, indexed with self.in_features.
+
+        Returns:
+            Tuple[Tensor]: FPN feature.
+        """
+
+        #  backbone
+        features = [out_features[f] for f in self.in_features]
+        [x2, x1, x0] = features
+
+        # node x3
+        x13 = self.bu_conv13(x1)
+        x3 = torch.cat([x0, x13], 1)
+        x3 = self.merge_3(x3)
+
+        # node x4
+        x34 = self.upsample(x3)
+        x24 = self.bu_conv24(x2)
+        x4 = torch.cat([x1, x24, x34], 1)
+        x4 = self.merge_4(x4)
+
+        # node x5
+        x45 = self.upsample(x4)
+        x5 = torch.cat([x2, x45], 1)
+        x5 = self.merge_5(x5)
+
+        # node x7
+        x57 = self.bu_conv57(x5)
+        x7 = torch.cat([x4, x57], 1)
+        x7 = self.merge_7(x7)
+
+        # node x6
+        x46 = self.bu_conv46(x4)
+        x76 = self.bu_conv76(x7)
+        x6 = torch.cat([x3, x46, x76], 1)
+        x6 = self.merge_6(x6)
+
+        outputs = (x5, x7, x6)
+        return outputs
diff --git a/modelscope/models/cv/tinynas_detection/tinynas_detector.py b/modelscope/models/cv/tinynas_detection/tinynas_detector.py
new file mode 100644
index 00000000..e6f144df
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/tinynas_detector.py
@@ -0,0 +1,16 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import Tasks
+from .detector import SingleStageDetector
+
+
+@MODELS.register_module(
+    Tasks.image_object_detection, module_name=Models.tinynas_detection)
+class TinynasDetector(SingleStageDetector):
+
+    def __init__(self, model_dir, *args, **kwargs):
+
+        super(TinynasDetector, self).__init__(model_dir, *args, **kwargs)
diff --git a/modelscope/models/cv/tinynas_detection/utils.py b/modelscope/models/cv/tinynas_detection/utils.py
new file mode 100644
index 00000000..d67d3a36
--- /dev/null
+++ b/modelscope/models/cv/tinynas_detection/utils.py
@@ -0,0 +1,30 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
+
+import importlib
+import os
+import sys
+from os.path import dirname, join
+
+
+def get_config_by_file(config_file):
+    try:
+        sys.path.append(os.path.dirname(config_file))
+        current_config = importlib.import_module(
+            os.path.basename(config_file).split('.')[0])
+        exp = current_config.Config()
+    except Exception:
+        raise ImportError(
+            "{} doesn't contains class named 'Config'".format(config_file))
+    return exp
+
+
+def parse_config(config_file):
+    """
+    get config object by file.
+    Args:
+        config_file (str): file path of config.
+    """
+    assert (config_file is not None), 'plz provide config file'
+    if config_file is not None:
+        return get_config_by_file(config_file)
diff --git a/modelscope/models/cv/video_inpainting/__init__.py b/modelscope/models/cv/video_inpainting/__init__.py
new file mode 100644
index 00000000..f5489da9
--- /dev/null
+++ b/modelscope/models/cv/video_inpainting/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .inpainting_model import VideoInpainting
+
+else:
+    _import_structure = {'inpainting_model': ['VideoInpainting']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/video_inpainting/inpainting.py b/modelscope/models/cv/video_inpainting/inpainting.py
new file mode 100644
index 00000000..e2af2ad0
--- /dev/null
+++ b/modelscope/models/cv/video_inpainting/inpainting.py
@@ -0,0 +1,299 @@
+""" VideoInpaintingProcess
+The implementation here is modified based on STTN,
+originally Apache 2.0 License and publicly available at https://github.com/researchmm/STTN
+"""
+
+import os
+import time
+
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+from torchvision import transforms
+
+torch.backends.cudnn.enabled = False
+
+w, h = 192, 96
+ref_length = 300
+neighbor_stride = 20
+default_fps = 24
+MAX_frame = 300
+
+
+def video_process(video_input_path):
+    video_input = cv2.VideoCapture(video_input_path)
+    success, frame = video_input.read()
+    if success is False:
+        decode_error = 'decode_error'
+        w, h, fps = 0, 0, 0
+    else:
+        decode_error = None
+        h, w = frame.shape[0:2]
+        fps = video_input.get(cv2.CAP_PROP_FPS)
+    video_input.release()
+
+    return decode_error, fps, w, h
+
+
+class Stack(object):
+
+    def __init__(self, roll=False):
+        self.roll = roll
+
+    def __call__(self, img_group):
+        mode = img_group[0].mode
+        if mode == '1':
+            img_group = [img.convert('L') for img in img_group]
+            mode = 'L'
+        if mode == 'L':
+            return np.stack([np.expand_dims(x, 2) for x in img_group], axis=2)
+        elif mode == 'RGB':
+            if self.roll:
+                return np.stack([np.array(x)[:, :, ::-1] for x in img_group],
+                                axis=2)
+            else:
+                return np.stack(img_group, axis=2)
+        else:
+            raise NotImplementedError(f'Image mode {mode}')
+
+
+class ToTorchFormatTensor(object):
+    """ Converts a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255]
+    to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] """
+
+    def __init__(self, div=True):
+        self.div = div
+
+    def __call__(self, pic):
+        if isinstance(pic, np.ndarray):
+            img = torch.from_numpy(pic).permute(2, 3, 0, 1).contiguous()
+        else:
+            img = torch.ByteTensor(
+                torch.ByteStorage.from_buffer(pic.tobytes()))
+            img = img.view(pic.size[1], pic.size[0], len(pic.mode))
+            img = img.transpose(0, 1).transpose(0, 2).contiguous()
+        img = img.float().div(255) if self.div else img.float()
+        return img
+
+
+_to_tensors = transforms.Compose([Stack(), ToTorchFormatTensor()])
+
+
+def get_crop_mask_v1(mask):
+    orig_h, orig_w, _ = mask.shape
+    if (mask == 255).all():
+        return mask, (0, int(orig_h), 0,
+                      int(orig_w)), [0, int(orig_h), 0,
+                                     int(orig_w)
+                                     ], [0, int(orig_h), 0,
+                                         int(orig_w)]
+
+    hs = np.min(np.where(mask == 0)[0])
+    he = np.max(np.where(mask == 0)[0])
+    ws = np.min(np.where(mask == 0)[1])
+    we = np.max(np.where(mask == 0)[1])
+    crop_box = [ws, hs, we, he]
+
+    mask_h = round(int(orig_h / 2) / 4) * 4
+    mask_w = round(int(orig_w / 2) / 4) * 4
+
+    if (hs < mask_h) and (he < mask_h) and (ws < mask_w) and (we < mask_w):
+        crop_mask = mask[:mask_h, :mask_w, :]
+        res_pix = (0, mask_h, 0, mask_w)
+    elif (hs < mask_h) and (he < mask_h) and (ws > mask_w) and (we > mask_w):
+        crop_mask = mask[:mask_h, orig_w - mask_w:orig_w, :]
+        res_pix = (0, mask_h, orig_w - mask_w, int(orig_w))
+    elif (hs > mask_h) and (he > mask_h) and (ws < mask_w) and (we < mask_w):
+        crop_mask = mask[orig_h - mask_h:orig_h, :mask_w, :]
+        res_pix = (orig_h - mask_h, int(orig_h), 0, mask_w)
+    elif (hs > mask_h) and (he > mask_h) and (ws > mask_w) and (we > mask_w):
+        crop_mask = mask[orig_h - mask_h:orig_h, orig_w - mask_w:orig_w, :]
+        res_pix = (orig_h - mask_h, int(orig_h), orig_w - mask_w, int(orig_w))
+
+    elif (hs < mask_h) and (he < mask_h) and (ws < mask_w) and (we > mask_w):
+        crop_mask = mask[:mask_h, :, :]
+        res_pix = (0, mask_h, 0, int(orig_w))
+    elif (hs < mask_h) and (he > mask_h) and (ws < mask_w) and (we < mask_w):
+        crop_mask = mask[:, :mask_w, :]
+        res_pix = (0, int(orig_h), 0, mask_w)
+    elif (hs > mask_h) and (he > mask_h) and (ws < mask_w) and (we > mask_w):
+        crop_mask = mask[orig_h - mask_h:orig_h, :, :]
+        res_pix = (orig_h - mask_h, int(orig_h), 0, int(orig_w))
+    elif (hs < mask_h) and (he > mask_h) and (ws > mask_w) and (we > mask_w):
+        crop_mask = mask[:, orig_w - mask_w:orig_w, :]
+        res_pix = (0, int(orig_h), orig_w - mask_w, int(orig_w))
+    else:
+        crop_mask = mask
+        res_pix = (0, int(orig_h), 0, int(orig_w))
+    a = ws - res_pix[2]
+    b = hs - res_pix[0]
+    c = we - res_pix[2]
+    d = he - res_pix[0]
+    return crop_mask, res_pix, crop_box, [a, b, c, d]
+
+
+def get_ref_index(neighbor_ids, length):
+    """Return every ``ref_length``-th index below ``length`` that is not
+    already present in ``neighbor_ids``."""
+    return [
+        idx for idx in range(0, length, ref_length) if idx not in neighbor_ids
+    ]
+
+
+def read_mask_oneImage(mpath):
+    masks = []
+    print('mask_path: {}'.format(mpath))
+    start = int(mpath.split('/')[-1].split('mask_')[1].split('_')[0])
+    end = int(
+        mpath.split('/')[-1].split('mask_')[1].split('_')[1].split('.')[0])
+    m = Image.open(mpath)
+    m = np.array(m.convert('L'))
+    m = np.array(m > 0).astype(np.uint8)
+    m = 1 - m
+    for i in range(start - 1, end + 1):
+        masks.append(Image.fromarray(m * 255))
+    return masks
+
+
+def check_size(h, w):
+    """Report whether an ``h`` x ``w`` frame deviates from 240 x 432.
+
+    Args:
+        h: height to check.
+        w: width to check.
+    Returns: True unless ``h == 240`` and ``w == 432``.
+    """
+    return h != 240 or w != 432
+
+
+def get_mask_list(mask_path):
+    mask_names = os.listdir(mask_path)
+    mask_names.sort()
+
+    abs_mask_path = []
+    mask_list = []
+    begin_list = []
+    end_list = []
+
+    for mask_name in mask_names:
+        mask_name_tmp = mask_name.split('mask_')[1]
+        begin_list.append(int(mask_name_tmp.split('_')[0]))
+        end_list.append(int(mask_name_tmp.split('_')[1].split('.')[0]))
+        abs_mask_path.append(os.path.join(mask_path, mask_name))
+        mask = cv2.imread(os.path.join(mask_path, mask_name))
+        mask_list.append(mask)
+    return mask_list, begin_list, end_list, abs_mask_path
+
+
+def inpainting_by_model_balance(model, video_inputPath, mask_path,
+                                video_savePath, fps, w_ori, h_ori):
+
+    video_ori = cv2.VideoCapture(video_inputPath)
+
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    video_save = cv2.VideoWriter(video_savePath, fourcc, fps, (w_ori, h_ori))
+
+    mask_list, begin_list, end_list, abs_mask_path = get_mask_list(mask_path)
+
+    img_npy = []
+
+    for index, mask in enumerate(mask_list):
+
+        masks = read_mask_oneImage(abs_mask_path[index])
+
+        mask, res_pix, crop_for_oriimg, crop_for_inpimg = get_crop_mask_v1(
+            mask)
+        mask_h, mask_w = mask.shape[0:2]
+        is_resize = check_size(mask.shape[0], mask.shape[1])
+
+        begin = begin_list[index]
+        end = end_list[index]
+        print('begin: {}'.format(begin))
+        print('end: {}'.format(end))
+
+        for i in range(begin, end + 1, MAX_frame):
+            begin_time = time.time()
+            if i + MAX_frame <= end:
+                video_length = MAX_frame
+            else:
+                video_length = end - i + 1
+
+            for frame_count in range(video_length):
+                _, frame = video_ori.read()
+                img_npy.append(frame)
+            frames_temp = []
+            for f in img_npy:
+                f = Image.fromarray(f)
+                i_temp = f.crop(
+                    (res_pix[2], res_pix[0], res_pix[3], res_pix[1]))
+                a = i_temp.resize((w, h), Image.NEAREST)
+                frames_temp.append(a)
+            feats_temp = _to_tensors(frames_temp).unsqueeze(0) * 2 - 1
+            frames_temp = [np.array(f).astype(np.uint8) for f in frames_temp]
+            masks_temp = []
+            for m in masks[i - begin:i + video_length - begin]:
+
+                m_temp = m.crop(
+                    (res_pix[2], res_pix[0], res_pix[3], res_pix[1]))
+                b = m_temp.resize((w, h), Image.NEAREST)
+                masks_temp.append(b)
+            binary_masks_temp = [
+                np.expand_dims((np.array(m) != 0).astype(np.uint8), 2)
+                for m in masks_temp
+            ]
+            masks_temp = _to_tensors(masks_temp).unsqueeze(0)
+            if torch.cuda.is_available():
+                feats_temp, masks_temp = feats_temp.cuda(), masks_temp.cuda()
+            comp_frames = [None] * video_length
+            model.eval()
+            with torch.no_grad():
+                feats_out = feats_temp * (1 - masks_temp).float()
+                feats_out = feats_out.view(video_length, 3, h, w)
+                feats_out = model.model.encoder(feats_out)
+                _, c, feat_h, feat_w = feats_out.size()
+                feats_out = feats_out.view(1, video_length, c, feat_h, feat_w)
+
+            for f in range(0, video_length, neighbor_stride):
+                neighbor_ids = [
+                    i for i in range(
+                        max(0, f - neighbor_stride),
+                        min(video_length, f + neighbor_stride + 1))
+                ]
+                ref_ids = get_ref_index(neighbor_ids, video_length)
+                with torch.no_grad():
+                    pred_feat = model.model.infer(
+                        feats_out[0, neighbor_ids + ref_ids, :, :, :],
+                        masks_temp[0, neighbor_ids + ref_ids, :, :, :])
+                    pred_img = torch.tanh(
+                        model.model.decoder(
+                            pred_feat[:len(neighbor_ids), :, :, :])).detach()
+                    pred_img = (pred_img + 1) / 2
+                    pred_img = pred_img.cpu().permute(0, 2, 3, 1).numpy() * 255
+                    for j in range(len(neighbor_ids)):
+                        idx = neighbor_ids[j]
+                        img = np.array(pred_img[j]).astype(
+                            np.uint8) * binary_masks_temp[idx] + frames_temp[
+                                idx] * (1 - binary_masks_temp[idx])
+                        if comp_frames[idx] is None:
+                            comp_frames[idx] = img
+                        else:
+                            comp_frames[idx] = comp_frames[idx].astype(
+                                np.float32) * 0.5 + img.astype(
+                                    np.float32) * 0.5
+            print('inpainting time:', time.time() - begin_time)
+            for f in range(video_length):
+                comp = np.array(comp_frames[f]).astype(
+                    np.uint8) * binary_masks_temp[f] + frames_temp[f] * (
+                        1 - binary_masks_temp[f])
+                if is_resize:
+                    comp = cv2.resize(comp, (mask_w, mask_h))
+                complete_frame = img_npy[f]
+                a1, b1, c1, d1 = crop_for_oriimg
+                a2, b2, c2, d2 = crop_for_inpimg
+                complete_frame[b1:d1, a1:c1] = comp[b2:d2, a2:c2]
+                video_save.write(complete_frame)
+
+            img_npy = []
+
+    video_ori.release()
diff --git a/modelscope/models/cv/video_inpainting/inpainting_model.py b/modelscope/models/cv/video_inpainting/inpainting_model.py
new file mode 100644
index 00000000..ffecde67
--- /dev/null
+++ b/modelscope/models/cv/video_inpainting/inpainting_model.py
@@ -0,0 +1,381 @@
+""" VideoInpaintingProcess
+The implementation here is modified based on STTN,
+ originally Apache 2.0 License and publicly available at https://github.com/researchmm/STTN
+"""
+
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.models as models
+
+from modelscope.metainfo import Models
+from modelscope.models import Model
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class BaseNetwork(nn.Module):
+
+    def __init__(self):
+        super(BaseNetwork, self).__init__()
+
+    def print_network(self):
+        if isinstance(self, list):
+            self = self[0]
+        num_params = 0
+        for param in self.parameters():
+            num_params += param.numel()
+        print(
+            'Network [%s] was created. Total number of parameters: %.1f million. '
+            'To see the architecture, do print(network).' %
+            (type(self).__name__, num_params / 1000000))
+
+    def init_weights(self, init_type='normal', gain=0.02):
+        '''
+        initialize network's weights
+        init_type: normal | xavier | kaiming | orthogonal
+        https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/9451e70673400885567d08a9e97ade2524c700d0/models/networks.py#L39
+        '''
+
+        def init_func(m):
+            classname = m.__class__.__name__
+            if classname.find('InstanceNorm2d') != -1:
+                if hasattr(m, 'weight') and m.weight is not None:
+                    nn.init.constant_(m.weight.data, 1.0)
+                if hasattr(m, 'bias') and m.bias is not None:
+                    nn.init.constant_(m.bias.data, 0.0)
+            elif hasattr(m, 'weight') and (classname.find('Conv') != -1
+                                           or classname.find('Linear') != -1):
+                if init_type == 'normal':
+                    nn.init.normal_(m.weight.data, 0.0, gain)
+                elif init_type == 'xavier':
+                    nn.init.xavier_normal_(m.weight.data, gain=gain)
+                elif init_type == 'xavier_uniform':
+                    nn.init.xavier_uniform_(m.weight.data, gain=1.0)
+                elif init_type == 'kaiming':
+                    nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
+                elif init_type == 'orthogonal':
+                    nn.init.orthogonal_(m.weight.data, gain=gain)
+                elif init_type == 'none':
+                    m.reset_parameters()
+                else:
+                    raise NotImplementedError(
+                        'initialization method [%s] is not implemented'
+                        % init_type)
+                if hasattr(m, 'bias') and m.bias is not None:
+                    nn.init.constant_(m.bias.data, 0.0)
+
+        self.apply(init_func)
+
+        for m in self.children():
+            if hasattr(m, 'init_weights'):
+                m.init_weights(init_type, gain)
+
+
+@MODELS.register_module(
+    Tasks.video_inpainting, module_name=Models.video_inpainting)
+class VideoInpainting(TorchModel):
+
+    def __init__(self, model_dir, device_id=0, *args, **kwargs):
+        super().__init__(
+            model_dir=model_dir, device_id=device_id, *args, **kwargs)
+        self.model = InpaintGenerator()
+        if torch.cuda.is_available():
+            device = 'cuda'
+        else:
+            device = 'cpu'
+        pretrained_params = torch.load(
+            '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
+            map_location=device)
+        self.model.load_state_dict(pretrained_params['netG'])
+        self.model.eval()
+        self.device_id = device_id
+        if self.device_id >= 0 and torch.cuda.is_available():
+            self.model.to('cuda:{}'.format(self.device_id))
+            logger.info('Use GPU: {}'.format(self.device_id))
+        else:
+            self.device_id = -1
+            logger.info('Use CPU for inference')
+
+
+class InpaintGenerator(BaseNetwork):
+
+    def __init__(self, init_weights=True):
+        super(InpaintGenerator, self).__init__()
+        channel = 256
+        stack_num = 6
+        patchsize = [(48, 24), (16, 8), (8, 4), (4, 2)]
+        blocks = []
+        for _ in range(stack_num):
+            blocks.append(TransformerBlock(patchsize, hidden=channel))
+        self.transformer = nn.Sequential(*blocks)
+
+        self.encoder = nn.Sequential(
+            nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv2d(128, channel, kernel_size=3, stride=1, padding=1),
+            nn.LeakyReLU(0.2, inplace=True),
+        )
+
+        self.decoder = nn.Sequential(
+            deconv(channel, 128, kernel_size=3, padding=1),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1),
+            nn.LeakyReLU(0.2, inplace=True),
+            deconv(64, 64, kernel_size=3, padding=1),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv2d(64, 3, kernel_size=3, stride=1, padding=1))
+
+        if init_weights:
+            self.init_weights()
+
+    def forward(self, masked_frames, masks):
+        b, t, c, h, w = masked_frames.size()
+        masks = masks.view(b * t, 1, h, w)
+        enc_feat = self.encoder(masked_frames.view(b * t, c, h, w))
+        _, c, h, w = enc_feat.size()
+        masks = F.interpolate(masks, scale_factor=1.0 / 4)
+        enc_feat = self.transformer({
+            'x': enc_feat,
+            'm': masks,
+            'b': b,
+            'c': c
+        })['x']
+        output = self.decoder(enc_feat)
+        output = torch.tanh(output)
+        return output
+
+    def infer(self, feat, masks):
+        t, c, h, w = masks.size()
+        masks = masks.view(t, c, h, w)
+        masks = F.interpolate(masks, scale_factor=1.0 / 4)
+        t, c, _, _ = feat.size()
+        enc_feat = self.transformer({
+            'x': feat,
+            'm': masks,
+            'b': 1,
+            'c': c
+        })['x']
+        return enc_feat
+
+
+class deconv(nn.Module):
+
+    def __init__(self,
+                 input_channel,
+                 output_channel,
+                 kernel_size=3,
+                 padding=0):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            input_channel,
+            output_channel,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=padding)
+
+    def forward(self, x):
+        x = F.interpolate(
+            x, scale_factor=2, mode='bilinear', align_corners=True)
+        x = self.conv(x)
+        return x
+
+
+class Attention(nn.Module):
+    """
+    Compute 'Scaled Dot Product Attention': softmax(Q K^T / sqrt(d)) V.
+    """
+
+    def forward(self, query, key, value, m):
+        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(
+            query.size(-1))
+        scores.masked_fill(m, -1e9)  # NOTE(review): not in-place; m unused
+        p_attn = F.softmax(scores, dim=-1)
+        p_val = torch.matmul(p_attn, value)
+        return p_val, p_attn
+
+
+class MultiHeadedAttention(nn.Module):
+    """
+    Take in model size and number of heads.
+    """
+
+    def __init__(self, patchsize, d_model):
+        super().__init__()
+        self.patchsize = patchsize
+        self.query_embedding = nn.Conv2d(
+            d_model, d_model, kernel_size=1, padding=0)
+        self.value_embedding = nn.Conv2d(
+            d_model, d_model, kernel_size=1, padding=0)
+        self.key_embedding = nn.Conv2d(
+            d_model, d_model, kernel_size=1, padding=0)
+        self.output_linear = nn.Sequential(
+            nn.Conv2d(d_model, d_model, kernel_size=3, padding=1),
+            nn.LeakyReLU(0.2, inplace=True))
+        self.attention = Attention()
+
+    def forward(self, x, m, b, c):
+        bt, _, h, w = x.size()
+        t = bt // b
+        d_k = c // len(self.patchsize)
+        output = []
+        _query = self.query_embedding(x)
+        _key = self.key_embedding(x)
+        _value = self.value_embedding(x)
+        for (width, height), query, key, value in zip(
+                self.patchsize,
+                torch.chunk(_query, len(self.patchsize), dim=1),
+                torch.chunk(_key, len(self.patchsize), dim=1),
+                torch.chunk(_value, len(self.patchsize), dim=1)):
+            out_w, out_h = w // width, h // height
+            mm = m.view(b, t, 1, out_h, height, out_w, width)
+            mm = mm.permute(0, 1, 3, 5, 2, 4,
+                            6).contiguous().view(b, t * out_h * out_w,
+                                                 height * width)
+            mm = (mm.mean(-1) > 0.5).unsqueeze(1).repeat(
+                1, t * out_h * out_w, 1)
+            query = query.view(b, t, d_k, out_h, height, out_w, width)
+            query = query.permute(0, 1, 3, 5, 2, 4,
+                                  6).contiguous().view(b, t * out_h * out_w,
+                                                       d_k * height * width)
+            key = key.view(b, t, d_k, out_h, height, out_w, width)
+            key = key.permute(0, 1, 3, 5, 2, 4,
+                              6).contiguous().view(b, t * out_h * out_w,
+                                                   d_k * height * width)
+            value = value.view(b, t, d_k, out_h, height, out_w, width)
+            value = value.permute(0, 1, 3, 5, 2, 4,
+                                  6).contiguous().view(b, t * out_h * out_w,
+                                                       d_k * height * width)
+            y, _ = self.attention(query, key, value, mm)
+            y = y.view(b, t, out_h, out_w, d_k, height, width)
+            y = y.permute(0, 1, 4, 2, 5, 3, 6).contiguous().view(bt, d_k, h, w)
+            output.append(y)
+        output = torch.cat(output, 1)
+        x = self.output_linear(output)
+        return x
+
+
+class FeedForward(nn.Module):
+
+    def __init__(self, d_model):
+        super(FeedForward, self).__init__()
+        self.conv = nn.Sequential(
+            nn.Conv2d(d_model, d_model, kernel_size=3, padding=2, dilation=2),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv2d(d_model, d_model, kernel_size=3, padding=1),
+            nn.LeakyReLU(0.2, inplace=True))
+
+    def forward(self, x):
+        x = self.conv(x)
+        return x
+
+
+class TransformerBlock(nn.Module):
+    """
+    Transformer = MultiHead_Attention + Feed_Forward with sublayer connection
+    """
+
+    def __init__(self, patchsize, hidden=128):  # hidden=128
+        super().__init__()
+        self.attention = MultiHeadedAttention(patchsize, d_model=hidden)
+        self.feed_forward = FeedForward(hidden)
+
+    def forward(self, x):
+        x, m, b, c = x['x'], x['m'], x['b'], x['c']
+        x = x + self.attention(x, m, b, c)
+        x = x + self.feed_forward(x)
+        return {'x': x, 'm': m, 'b': b, 'c': c}
+
+
+class Discriminator(BaseNetwork):
+
+    def __init__(self,
+                 in_channels=3,
+                 use_sigmoid=False,
+                 use_spectral_norm=True,
+                 init_weights=True):
+        super(Discriminator, self).__init__()
+        self.use_sigmoid = use_sigmoid
+        nf = 64
+
+        self.conv = nn.Sequential(
+            spectral_norm(
+                nn.Conv3d(
+                    in_channels=in_channels,
+                    out_channels=nf * 1,
+                    kernel_size=(3, 5, 5),
+                    stride=(1, 2, 2),
+                    padding=1,
+                    bias=not use_spectral_norm), use_spectral_norm),
+            nn.LeakyReLU(0.2, inplace=True),
+            spectral_norm(
+                nn.Conv3d(
+                    nf * 1,
+                    nf * 2,
+                    kernel_size=(3, 5, 5),
+                    stride=(1, 2, 2),
+                    padding=(1, 2, 2),
+                    bias=not use_spectral_norm), use_spectral_norm),
+            nn.LeakyReLU(0.2, inplace=True),
+            spectral_norm(
+                nn.Conv3d(
+                    nf * 2,
+                    nf * 4,
+                    kernel_size=(3, 5, 5),
+                    stride=(1, 2, 2),
+                    padding=(1, 2, 2),
+                    bias=not use_spectral_norm), use_spectral_norm),
+            nn.LeakyReLU(0.2, inplace=True),
+            spectral_norm(
+                nn.Conv3d(
+                    nf * 4,
+                    nf * 4,
+                    kernel_size=(3, 5, 5),
+                    stride=(1, 2, 2),
+                    padding=(1, 2, 2),
+                    bias=not use_spectral_norm), use_spectral_norm),
+            nn.LeakyReLU(0.2, inplace=True),
+            spectral_norm(
+                nn.Conv3d(
+                    nf * 4,
+                    nf * 4,
+                    kernel_size=(3, 5, 5),
+                    stride=(1, 2, 2),
+                    padding=(1, 2, 2),
+                    bias=not use_spectral_norm), use_spectral_norm),
+            nn.LeakyReLU(0.2, inplace=True),
+            nn.Conv3d(
+                nf * 4,
+                nf * 4,
+                kernel_size=(3, 5, 5),
+                stride=(1, 2, 2),
+                padding=(1, 2, 2)))
+
+        if init_weights:
+            self.init_weights()
+
+    def forward(self, xs):
+        xs_t = torch.transpose(xs, 0, 1)
+        xs_t = xs_t.unsqueeze(0)
+        feat = self.conv(xs_t)
+        if self.use_sigmoid:
+            feat = torch.sigmoid(feat)
+        out = torch.transpose(feat, 1, 2)
+        return out
+
+
+def spectral_norm(module, mode=True):  # mode=False leaves module untouched
+    if mode:
+        return nn.utils.spectral_norm(module)  # torch.nn.utils wrapper
+    return module
diff --git a/modelscope/models/cv/video_single_object_tracking/config/ostrack.py b/modelscope/models/cv/video_single_object_tracking/config/ostrack.py
index 8be07928..6805c503 100644
--- a/modelscope/models/cv/video_single_object_tracking/config/ostrack.py
+++ b/modelscope/models/cv/video_single_object_tracking/config/ostrack.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 from easydict import EasyDict as edict
 
 cfg = edict()
diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py b/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py
index 00eb7e1c..e245c821 100644
--- a/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py
+++ b/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 import torch.nn as nn
 
 
diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py b/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py
index 3505d5e1..702c84f1 100644
--- a/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py
+++ b/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 import math
 
 import torch
diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/head.py b/modelscope/models/cv/video_single_object_tracking/models/layers/head.py
index 77706dbc..e0dc7b59 100644
--- a/modelscope/models/cv/video_single_object_tracking/models/layers/head.py
+++ b/modelscope/models/cv/video_single_object_tracking/models/layers/head.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 import torch
 import torch.nn as nn
 
diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py b/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py
index b1099fdf..c001663f 100644
--- a/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py
+++ b/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 import torch.nn as nn
 from timm.models.layers import to_2tuple
 
diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py
index de3a7b83..20d73422 100644
--- a/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py
+++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 import torch.nn as nn
 from timm.models.layers import to_2tuple
 
diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py
index 40ed54f1..52704a6c 100644
--- a/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py
+++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 import torch
 from torch import nn
 
diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py
index e1130069..46e7c18a 100644
--- a/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py
+++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 import torch
 
 
diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py
index 9f010332..f186cf89 100644
--- a/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py
+++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 from functools import partial
 
 import torch
diff --git a/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py b/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py
index 02f4c79e..5093a72d 100644
--- a/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py
+++ b/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 import torch
 
 from modelscope.models.cv.video_single_object_tracking.config.ostrack import \
diff --git a/modelscope/models/cv/video_single_object_tracking/utils/utils.py b/modelscope/models/cv/video_single_object_tracking/utils/utils.py
index 51911957..90513a2a 100644
--- a/modelscope/models/cv/video_single_object_tracking/utils/utils.py
+++ b/modelscope/models/cv/video_single_object_tracking/utils/utils.py
@@ -1,4 +1,5 @@
-# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/
+# The implementation is adopted from OSTrack,
+# made publicly available under the MIT License at https://github.com/botaoye/OSTrack/
 import math
 from typing import Optional
 
@@ -237,3 +238,10 @@ def check_box(box: list, image_height, image_width) -> bool:
     if box[3] < 0 or box[3] >= image_height:
         return False
     return True
+
+
+def timestamp_format(seconds):
+    m, s = divmod(seconds, 60)
+    h, m = divmod(m, 60)
+    time = '%02d:%02d:%06.3f' % (h, m, s)
+    return time
diff --git a/modelscope/models/multi_modal/__init__.py b/modelscope/models/multi_modal/__init__.py
index 112b3a58..0053da43 100644
--- a/modelscope/models/multi_modal/__init__.py
+++ b/modelscope/models/multi_modal/__init__.py
@@ -7,23 +7,29 @@ if TYPE_CHECKING:
 
     from .clip import CLIPForMultiModalEmbedding
     from .gemm import GEMMForMultiModalEmbedding
+    from .team import TEAMForMultiModalSimilarity
     from .diffusion import DiffusionForTextToImageSynthesis
     from .mmr import VideoCLIPForMultiModalEmbedding
     from .mplug_for_all_tasks import MPlugForAllTasks
     from .ofa_for_all_tasks import OfaForAllTasks
     from .ofa_for_text_to_image_synthesis_model import \
         OfaForTextToImageSynthesis
+    from .multi_stage_diffusion import \
+        MultiStageDiffusionForTextToImageSynthesis
 
 else:
     _import_structure = {
         'clip': ['CLIPForMultiModalEmbedding'],
         'diffusion': ['DiffusionForTextToImageSynthesis'],
         'gemm': ['GEMMForMultiModalEmbedding'],
+        'team': ['TEAMForMultiModalSimilarity'],
         'mmr': ['VideoCLIPForMultiModalEmbedding'],
         'mplug_for_all_tasks': ['MPlugForAllTasks'],
         'ofa_for_all_tasks': ['OfaForAllTasks'],
         'ofa_for_text_to_image_synthesis_model':
-        ['OfaForTextToImageSynthesis']
+        ['OfaForTextToImageSynthesis'],
+        'multi_stage_diffusion':
+        ['MultiStageDiffusionForTextToImageSynthesis']
     }
 
     import sys
diff --git a/modelscope/models/multi_modal/diffusion/diffusion.py b/modelscope/models/multi_modal/diffusion/diffusion.py
index d71fe0ae..bfe7baf7 100644
--- a/modelscope/models/multi_modal/diffusion/diffusion.py
+++ b/modelscope/models/multi_modal/diffusion/diffusion.py
@@ -1,3 +1,6 @@
+# Part of the implementation is borrowed and modified from latent-diffusion,
+# publicly available at https://github.com/CompVis/latent-diffusion.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/multi_modal/diffusion/model.py b/modelscope/models/multi_modal/diffusion/model.py
index 8617b8dd..4229391f 100644
--- a/modelscope/models/multi_modal/diffusion/model.py
+++ b/modelscope/models/multi_modal/diffusion/model.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/models/multi_modal/diffusion/unet_generator.py b/modelscope/models/multi_modal/diffusion/unet_generator.py
index 9b507223..539d3996 100644
--- a/modelscope/models/multi_modal/diffusion/unet_generator.py
+++ b/modelscope/models/multi_modal/diffusion/unet_generator.py
@@ -1,3 +1,6 @@
+# Part of the implementation is borrowed and modified from latent-diffusion,
+# publicly available at https://github.com/CompVis/latent-diffusion.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py b/modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py
index 1c66b2fe..38cff6a2 100644
--- a/modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py
+++ b/modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py
@@ -1,3 +1,6 @@
+# Part of the implementation is borrowed and modified from latent-diffusion,
+# publicly available at https://github.com/CompVis/latent-diffusion.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 
 import torch
diff --git a/modelscope/models/multi_modal/diffusion/unet_upsampler_256.py b/modelscope/models/multi_modal/diffusion/unet_upsampler_256.py
index 0da8b805..ca5cd7d6 100644
--- a/modelscope/models/multi_modal/diffusion/unet_upsampler_256.py
+++ b/modelscope/models/multi_modal/diffusion/unet_upsampler_256.py
@@ -1,3 +1,6 @@
+# Part of the implementation is borrowed and modified from latent-diffusion,
+# publicly available at https://github.com/CompVis/latent-diffusion.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 from functools import partial
 
diff --git a/modelscope/models/multi_modal/gemm/gemm_base.py b/modelscope/models/multi_modal/gemm/gemm_base.py
index db928212..09ef2480 100644
--- a/modelscope/models/multi_modal/gemm/gemm_base.py
+++ b/modelscope/models/multi_modal/gemm/gemm_base.py
@@ -1,9 +1,14 @@
-""" Generative Multimodal Model
-Base modules are adapted from https://github.com/openai/CLIP/,
-originally MIT License, Copyright (c) 2021 OpenAI,
-and adapted from https://github.com/lucidrains/CoCa-pytorch/,
-originally MIT License, Copyright (c) 2022 Phil Wang.
-"""
+# Copyright 2021 The OpenAI Team Authors.
+# Copyright 2022 Phil Wang.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+#
+# The implementation here is modified based on OpenAI CLIP,
+# originally MIT License, Copyright (c) 2021 OpenAI,
+# and publicly available at https://github.com/openai/CLIP/.
+# The implementation here is modified based on Coca-pytorch,
+# originally MIT License, Copyright (c) 2022 Phil Wang,
+# and publicly available at https://github.com/lucidrains/CoCa-pytorch/.
+""" Generative Multimodal Model Architecture."""
 
 import os
 from collections import OrderedDict
diff --git a/modelscope/models/multi_modal/gemm/gemm_model.py b/modelscope/models/multi_modal/gemm/gemm_model.py
index 356dc8d3..c90b35d4 100644
--- a/modelscope/models/multi_modal/gemm/gemm_model.py
+++ b/modelscope/models/multi_modal/gemm/gemm_model.py
@@ -1,3 +1,5 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+""" Generative Multimodal Model Wrapper."""
 import os.path as osp
 from typing import Any, Dict
 
@@ -65,7 +67,7 @@ class GEMMForMultiModalEmbedding(TorchModel):
         return img_tensor
 
     def parse_text(self, text_str):
-        if text_str is None:
+        if text_str is None or len(text_str) == 0:
             return None
         if isinstance(text_str, str):
             text_ids_tensor = self.gemm_model.tokenize(text_str)
@@ -77,9 +79,12 @@ class GEMMForMultiModalEmbedding(TorchModel):
         return text_ids_tensor.view(1, -1)
 
     def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
-        image = self.parse_image(input.get('image', input.get('img', None)))
-        text = self.parse_text(input.get('text', input.get('txt', None)))
-        captioning = input.get('captioning', False) is True
+        image_input = input.get('image', input.get('img', None))
+        text_input = input.get('text', input.get('txt', None))
+        captioning_input = input.get('captioning', None)
+        image = self.parse_image(image_input)
+        text = self.parse_text(text_input)
+        captioning = captioning_input is True or text_input == ''
         out = self.gemm_model(image, text, captioning)
         output = {
             OutputKeys.IMG_EMBEDDING: out.get('image_feature', None),
diff --git a/modelscope/models/multi_modal/gemm/tokenizer.py b/modelscope/models/multi_modal/gemm/tokenizer.py
index af962ceb..8b7cc094 100644
--- a/modelscope/models/multi_modal/gemm/tokenizer.py
+++ b/modelscope/models/multi_modal/gemm/tokenizer.py
@@ -1,7 +1,11 @@
-""" CLIP Tokenizer
-Adapted from https://github.com/openai/CLIP.
-Originally MIT License, Copyright (c) 2021 OpenAI.
-"""
+# Copyright 2021 The OpenAI Team Authors.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+#
+# The implementation here is modified based on OpenAI CLIP,
+# originally MIT License, Copyright (c) 2021 OpenAI,
+# and publicly available at https://github.com/openai/CLIP/.
+""" CLIP Tokenizer."""
+
 import gzip
 import html
 import os
diff --git a/modelscope/models/multi_modal/mmr/__init__.py b/modelscope/models/multi_modal/mmr/__init__.py
index c5fb7419..9dac8409 100644
--- a/modelscope/models/multi_modal/mmr/__init__.py
+++ b/modelscope/models/multi_modal/mmr/__init__.py
@@ -1 +1,3 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
 from .models import VideoCLIPForMultiModalEmbedding
diff --git a/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py b/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py
index eab1189f..c7ac3f94 100644
--- a/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py
+++ b/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py
@@ -1,3 +1,6 @@
+# The implementation is adopted from Huaishao Luo,
+# made publicly available under the MIT License at https://github.com/ArrowLuo/CLIP4Clip
+
 import cv2
 import numpy as np
 import torch as th
diff --git a/modelscope/models/multi_modal/mmr/models/__init__.py b/modelscope/models/multi_modal/mmr/models/__init__.py
index 6cd06bcd..da832719 100644
--- a/modelscope/models/multi_modal/mmr/models/__init__.py
+++ b/modelscope/models/multi_modal/mmr/models/__init__.py
@@ -1 +1,3 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
 from .clip_for_mm_video_embedding import VideoCLIPForMultiModalEmbedding
diff --git a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py
index 4e959a17..5e8e2e7a 100644
--- a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py
+++ b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py
@@ -1,3 +1,6 @@
+# The implementation is adopted from the CLIP4Clip implementation,
+# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip
+
 import random
 from os.path import exists
 from typing import Any, Dict
@@ -42,7 +45,10 @@ class VideoCLIPForMultiModalEmbedding(TorchModel):
         self.max_frames = model_config['max_frames']
         self.feature_framerate = model_config['feature_framerate']
         self.image_resolution = 224
-        self.device = model_config['device']
+        if torch.cuda.is_available():
+            self.device = model_config['device']
+        else:
+            self.device = 'cpu'
         self.init_model = f'{model_dir}/{ModelFile.TORCH_MODEL_BIN_FILE}'
 
         self.tokenizer = ClipTokenizer(model_dir)
diff --git a/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py b/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py
index 572f44bc..253a847c 100644
--- a/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py
+++ b/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py
@@ -1,3 +1,6 @@
+# The implementation is adopted from the CLIP4Clip implementation,
+# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip
+
 import numpy as np
 
 
diff --git a/modelscope/models/multi_modal/mmr/models/modeling.py b/modelscope/models/multi_modal/mmr/models/modeling.py
index 214e65c7..dc6510bf 100644
--- a/modelscope/models/multi_modal/mmr/models/modeling.py
+++ b/modelscope/models/multi_modal/mmr/models/modeling.py
@@ -1,3 +1,5 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
 import os
 import platform
 from collections import OrderedDict
@@ -85,9 +87,6 @@ class CLIP4Clip(nn.Module):
             linear_patch=config['linear_patch'],
             use_gc=config['use_gc']).float()
 
-        if (platform.system() != 'Darwin'):
-            convert_weights(self.clip)  # fp16
-
         if backbone in ['ViT-B/32', 'ViT-B/16']:
             cross_config = SimpleNamespace(**{
                 'hidden_size': 512,
diff --git a/modelscope/models/multi_modal/mmr/models/module_clip.py b/modelscope/models/multi_modal/mmr/models/module_clip.py
index 36e56196..53501720 100644
--- a/modelscope/models/multi_modal/mmr/models/module_clip.py
+++ b/modelscope/models/multi_modal/mmr/models/module_clip.py
@@ -1,4 +1,5 @@
-# Part of the implementation is borrowed and modified from The OpenAI CLIP project.
+# The implementation is adopted from the CLIP4Clip implementation,
+# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip
 
 import hashlib
 import os
diff --git a/modelscope/models/multi_modal/mmr/models/module_cross.py b/modelscope/models/multi_modal/mmr/models/module_cross.py
index 05edb853..b958d5bc 100644
--- a/modelscope/models/multi_modal/mmr/models/module_cross.py
+++ b/modelscope/models/multi_modal/mmr/models/module_cross.py
@@ -1,3 +1,6 @@
+# The implementation is adopted from the CLIP4Clip implementation,
+# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip
+
 from __future__ import absolute_import, division, print_function
 import logging
 from collections import OrderedDict
diff --git a/modelscope/models/multi_modal/mmr/models/tokenization_clip.py b/modelscope/models/multi_modal/mmr/models/tokenization_clip.py
index ee60f857..4e2c9b15 100644
--- a/modelscope/models/multi_modal/mmr/models/tokenization_clip.py
+++ b/modelscope/models/multi_modal/mmr/models/tokenization_clip.py
@@ -1,3 +1,6 @@
+# The implementation is adopted from the CLIP4Clip implementation,
+# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip
+
 import gzip
 import html
 import os
diff --git a/modelscope/models/multi_modal/mplug/clip/__init__.py b/modelscope/models/multi_modal/mplug/clip/__init__.py
index 05826f46..e6007a04 100644
--- a/modelscope/models/multi_modal/mplug/clip/__init__.py
+++ b/modelscope/models/multi_modal/mplug/clip/__init__.py
@@ -1 +1,3 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from .clip import load_from_config
diff --git a/modelscope/models/multi_modal/mplug/configuration_mplug.py b/modelscope/models/multi_modal/mplug/configuration_mplug.py
index c275ed15..914678c5 100644
--- a/modelscope/models/multi_modal/mplug/configuration_mplug.py
+++ b/modelscope/models/multi_modal/mplug/configuration_mplug.py
@@ -64,6 +64,10 @@ class MPlugConfig(PretrainedConfig):
             clip_transformer_width=768,
             clip_transformer_heads=12,
             clip_transformer_layers=12,
+            # retrieval
+            queue_size=65536,
+            embed_dim=256,
+            temp=0.07,
             **kwargs):
 
         super().__init__(**kwargs)
@@ -99,6 +103,10 @@ class MPlugConfig(PretrainedConfig):
         self.clip_transformer_width = clip_transformer_width
         self.clip_transformer_heads = clip_transformer_heads
         self.clip_transformer_layers = clip_transformer_layers
+        # retrieval
+        self.queue_size = queue_size
+        self.embed_dim = embed_dim
+        self.temp = temp
 
     @classmethod
     def from_yaml_file(cls, yaml_file: Union[str,
diff --git a/modelscope/models/multi_modal/mplug/modeling_mplug.py b/modelscope/models/multi_modal/mplug/modeling_mplug.py
index 50622cc0..ec491f1d 100755
--- a/modelscope/models/multi_modal/mplug/modeling_mplug.py
+++ b/modelscope/models/multi_modal/mplug/modeling_mplug.py
@@ -1855,7 +1855,8 @@ class MPlug(PreTrainedModel):
 
         task_mapping = {
             Tasks.visual_question_answering: MPlugForVisualQuestionAnswering,
-            Tasks.image_captioning: MPLUGForImageCaption
+            Tasks.image_captioning: MPlugForImageCaption,
+            Tasks.image_text_retrieval: MPlugForImageTextRetrieval,
         }
         config = cls.config_class.from_yaml_file(
             os.path.join(model_dir, CONFIG_NAME))
@@ -1866,11 +1867,15 @@ class MPlug(PreTrainedModel):
                                            ModelFile.TORCH_MODEL_BIN_FILE)
             checkpoint = torch.load(checkpoint_path, map_location='cpu')
             if 'model' in checkpoint:
-                state_dict = checkpoint['model']
-            else:
-                state_dict = checkpoint['module']
+                checkpoint = checkpoint['model']
+            if 'module' in checkpoint:
+                checkpoint = checkpoint['module']
+            checkpoint = {
+                k.replace('model.', ''): v
+                for k, v in checkpoint.items()
+            }
 
-            msg = model.load_state_dict(state_dict, strict=False)
+            msg = model.load_state_dict(checkpoint, strict=False)
             print('load checkpoint from %s' % checkpoint_path)
             print(msg)
         return model
@@ -1915,6 +1920,33 @@ class MPlug(PreTrainedModel):
         clip_model.visual.positional_embedding = pos_embed
         return clip_model
 
+    def init_distill(self, config):
+        self.distill = config.distill
+        if self.distill:
+            self.visual_encoder_m = self._initialize_clip(config)
+            self.text_encoder_m = BertModel(
+                self.config_encoder, add_pooling_layer=False)
+            self.fusion_encoder_m = FusionModel(
+                self.config_fusion, add_pooling_layer=False)
+            self.text_decoder_m = BertLMHeadModel(self.config_decoder)
+            self.model_pairs = [
+                [self.visual_encoder, self.visual_encoder_m],
+                [self.text_encoder, self.text_encoder_m],
+                [self.text_decoder, self.text_decoder_m],
+            ]
+            if self.config_encoder.hidden_size != config.vision_width:
+                self.visn_fc_m = nn.Linear(config.vision_width,
+                                           self.config_encoder.hidden_size)
+                self.visn_layer_norm_m = nn.LayerNorm(
+                    self.config_encoder.hidden_size, eps=1e-12)
+                self.dropout_m = nn.Dropout(
+                    self.config_encoder.hidden_dropout_prob)
+                self.model_pairs.extend(
+                    [[self.visn_fc, self.visn_fc_m],
+                     [self.visn_layer_norm, self.visn_layer_norm_m]])
+            self.copy_params()
+            self.momentum = 0.995
+
     def forward(self, *args, **kwargs):
         raise NotImplementedError
 
@@ -1969,71 +2001,6 @@ class MPlug(PreTrainedModel):
                 [init_dim * np.arange(n_tile) + i for i in range(init_dim)]))
         return torch.index_select(x, dim, order_index.to(x.device))
 
-    def rank_answer(self, question_states, question_atts, answer_ids,
-                    answer_atts, k):
-
-        num_ques = question_states.size(0)
-        start_ids = answer_ids[0, 0].repeat(num_ques, 1)  # bos token
-
-        start_output = self.text_decoder(
-            start_ids,
-            encoder_hidden_states=question_states,
-            encoder_attention_mask=question_atts,
-            return_dict=True,
-            reduction='none')
-        logits = start_output.logits[:, 0, :]  # first token's logit
-
-        # topk_probs: top-k probability
-        # topk_ids: [num_question, k]
-        answer_first_token = answer_ids[:, 1]
-        prob_first_token = F.softmax(
-            logits, dim=1).index_select(
-                dim=1, index=answer_first_token)
-        topk_probs, topk_ids = prob_first_token.topk(k, dim=1)
-
-        # answer input: [num_question*k, answer_len]
-        input_ids = []
-        input_atts = []
-        for b, topk_id in enumerate(topk_ids):
-            input_ids.append(answer_ids.index_select(dim=0, index=topk_id))
-            input_atts.append(answer_atts.index_select(dim=0, index=topk_id))
-        input_ids = torch.cat(input_ids, dim=0)
-        input_atts = torch.cat(input_atts, dim=0)
-
-        targets_ids = input_ids.masked_fill(
-            input_ids == self.tokenizer.pad_token_id, -100)
-
-        # repeat encoder's output for top-k answers
-        question_states = self._tile(question_states, 0, k)
-        question_atts = self._tile(question_atts, 0, k)
-
-        output = self.text_decoder(
-            input_ids,
-            attention_mask=input_atts,
-            encoder_hidden_states=question_states,
-            encoder_attention_mask=question_atts,
-            labels=targets_ids,
-            return_dict=True,
-            reduction='none')
-
-        answer_loss = output.loss
-        answer_loss = answer_loss.view(input_ids.size(0), -1)
-
-        # topk_prob: first token probability
-        topk_probs = topk_probs.view(-1, 1)
-        log_probs = torch.cat([topk_probs.log(), -answer_loss], dim=1)
-
-        # re-calculate log probabilities for the answer sequences using chain rule
-        log_probs_sum = log_probs.sum(1)
-        log_probs_sum = log_probs_sum.view(num_ques, k)
-
-        topk_probs = F.softmax(log_probs_sum, dim=-1)
-        # get top-k after re-ranking
-        topk_probs, rerank_id = topk_probs.topk(k, dim=1)
-        topk_ids = torch.gather(topk_ids, 1, rerank_id)
-
-        return topk_ids, topk_probs
-
 
 class MPlugForVisualQuestionAnswering(MPlug):
 
@@ -2043,33 +2010,6 @@ class MPlugForVisualQuestionAnswering(MPlug):
         self.beam_generator = TextGenerator(config, self.text_decoder)
         self.init_distill(config)
 
-    def init_distill(self, config):
-        self.distill = config.distill
-        if self.distill:
-            self.visual_encoder_m = self._initialize_clip(config)
-            self.text_encoder_m = BertModel(
-                self.config_encoder, add_pooling_layer=False)
-            self.fusion_encoder_m = FusionModel(
-                self.config_fusion, add_pooling_layer=False)
-            self.text_decoder_m = BertLMHeadModel(self.config_decoder)
-            self.model_pairs = [
-                [self.visual_encoder, self.visual_encoder_m],
-                [self.text_encoder, self.text_encoder_m],
-                [self.text_decoder, self.text_decoder_m],
-            ]
-            if self.config_encoder.hidden_size != config.vision_width:
-                self.visn_fc_m = nn.Linear(config.vision_width,
-                                           self.config_encoder.hidden_size)
-                self.visn_layer_norm_m = nn.LayerNorm(
-                    self.config_encoder.hidden_size, eps=1e-12)
-                self.dropout_m = nn.Dropout(
-                    self.config_encoder.hidden_dropout_prob)
-                self.model_pairs.extend(
-                    [[self.visn_fc, self.visn_fc_m],
-                     [self.visn_layer_norm, self.visn_layer_norm_m]])
-            self.copy_params()
-            self.momentum = 0.995
-
     def forward(self,
                 image,
                 question,
@@ -2111,6 +2051,8 @@ class MPlugForVisualQuestionAnswering(MPlug):
             merge_text_attention = torch.cat(
                 [image_atts, question.attention_mask], 1)
 
+            if k is None:
+                k = [1] * question_output.shape[0]
             question_states = []
             question_atts = []
             for b, n in enumerate(k):
@@ -2177,6 +2119,8 @@ class MPlugForVisualQuestionAnswering(MPlug):
                     return_dict=True,
                     reduction='none',
                 )
+            if weights is None:
+                weights = 1
             loss = weights * answer_output.loss
             loss = loss.sum() / image.size(0)
 
@@ -2203,7 +2147,7 @@ class MPlugForVisualQuestionAnswering(MPlug):
             return topk_ids, topk_probs
 
 
-class MPLUGForImageCaption(MPlug):
+class MPlugForImageCaption(MPlug):
 
     def __init__(self, config):
         super().__init__(config)
@@ -2262,50 +2206,278 @@ class MPLUGForImageCaption(MPlug):
         if train:
             answer_targets = answer.input_ids.masked_fill(
                 answer.input_ids == self.tokenizer.pad_token_id, -100)
-            text_output = self.text_encoder(
-                question.input_ids,
-                attention_mask=question.attention_mask,
-                return_dict=True)
-            text_embeds = text_output.last_hidden_state
-            fusion_output = self.fusion_encoder(
-                encoder_embeds=text_embeds,
-                attention_mask=question.attention_mask,
-                encoder_hidden_states=image_embeds,
-                encoder_attention_mask=image_atts,
-                return_dict=False)
-
-            image_output, question_output = fusion_output
-
-            question_output = torch.cat([image_output, question_output], 1)
-            merge_text_attention = torch.cat(
-                [image_atts, question.attention_mask], 1)
-
             answer_output = self.text_decoder(
                 answer.input_ids,
                 attention_mask=answer.attention_mask,
-                encoder_hidden_states=question_output,
-                encoder_attention_mask=merge_text_attention,
+                encoder_hidden_states=image_embeds,
+                encoder_attention_mask=image_atts,
                 labels=answer_targets,
                 return_dict=True,
                 reduction='none')
             loss = answer_output.loss
+
             return loss
         else:
+            topk_ids, topk_probs = self.generation(image_embeds, image_atts)
+            return topk_ids, topk_probs
+
+
+class MPlugForImageTextRetrieval(MPlug):
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.embed_dim = config.embed_dim
+        self.temp = nn.Parameter(torch.ones([]) * config.temp)
+        self.queue_size = config.queue_size
+        self.momentum = config.momentum
+        self.alpha = config.alpha
+
+        self.queue_size = config.queue_size
+        self.text_width = self.config_encoder.hidden_size
+        self.embed_dim = config.embed_dim
+
+        self.vision_proj = nn.Linear(self.text_width, self.embed_dim)
+        self.text_proj = nn.Linear(self.text_width, self.embed_dim)
+        self.itm_head = nn.Linear(self.text_width, 2)
+
+        self.register_buffer('image_queue',
+                             torch.randn(self.embed_dim, self.queue_size))
+        self.register_buffer('text_queue',
+                             torch.randn(self.embed_dim, self.queue_size))
+        self.register_buffer('idx_queue', torch.full((1, self.queue_size),
+                                                     -100))
+        self.register_buffer('queue_ptr', torch.zeros(1, dtype=torch.long))
+
+        self.image_queue = F.normalize(self.image_queue, dim=0)
+        self.text_queue = F.normalize(self.text_queue, dim=0)
+        self.init_distill(config)
+
+    def init_distill(self, config):
+        self.distill = config.distill
+        if self.distill:
+            self.visual_encoder_m = self._initialize_clip(config)
+            self.text_encoder_m = BertModel(
+                self.config_encoder, add_pooling_layer=False)
+            self.fusion_encoder_m = FusionModel(
+                self.config_fusion, add_pooling_layer=False)
+            self.vision_proj_m = nn.Linear(self.text_width, self.embed_dim)
+            self.text_proj_m = nn.Linear(self.text_width, self.embed_dim)
+            self.model_pairs = [
+                [self.visual_encoder, self.visual_encoder_m],
+                [self.text_encoder, self.text_encoder_m],
+                [self.text_proj, self.text_proj_m],
+                [self.vision_proj, self.vision_proj_m],
+            ]
+            if self.config_encoder.hidden_size != config.vision_width:
+                self.visn_fc_m = nn.Linear(config.vision_width,
+                                           self.config_encoder.hidden_size)
+                self.visn_layer_norm_m = nn.LayerNorm(
+                    self.config_encoder.hidden_size, eps=1e-12)
+                self.dropout_m = nn.Dropout(
+                    self.config_encoder.hidden_dropout_prob)
+                self.model_pairs.extend(
+                    [[self.visn_fc, self.visn_fc_m],
+                     [self.visn_layer_norm, self.visn_layer_norm_m]])
+            self.copy_params()
+            self.momentum = 0.995
+
+    @torch.no_grad()
+    def _dequeue_and_enqueue(self, image_feat, text_feat, idx):
+
+        def concat_all_gather(tensor):
+            """
+            Performs all_gather operation on the provided tensors.
+            *** Warning ***: torch.distributed.all_gather has no gradient.
+            """
+            if not torch.distributed.is_initialized():
+                return tensor
+            tensors_gather = [
+                torch.ones_like(tensor)
+                for _ in range(torch.distributed.get_world_size())
+            ]
+            torch.distributed.all_gather(
+                tensors_gather, tensor, async_op=False)
+
+            output = torch.cat(tensors_gather, dim=0)
+            return output
+
+        # gather keys before updating queue
+        image_feats = concat_all_gather(image_feat)
+        text_feats = concat_all_gather(text_feat)
+        idxs = concat_all_gather(idx)
+
+        batch_size = image_feats.shape[0]
+
+        ptr = int(self.queue_ptr)
+        # assert self.queue_size % batch_size == 0  # for simplicity
+
+        # replace the keys at ptr (dequeue and enqueue)
+        self.image_queue[:, ptr:ptr + batch_size] = image_feats.T
+        self.text_queue[:, ptr:ptr + batch_size] = text_feats.T
+        self.idx_queue[:, ptr:ptr + batch_size] = idxs.T
+        ptr = (ptr + batch_size) % self.queue_size  # move pointer
+
+        self.queue_ptr[0] = ptr
+
+    def forward(self, image, text, idx=None, train=True):
+        if train:
+            image_embeds = self.visual_encoder.visual(
+                image, skip_last_layer=True)
+            if self.large:
+                image_embeds = self.dropout(
+                    self.visn_layer_norm(self.visn_fc(image_embeds)))
+            image_atts = torch.ones(
+                image_embeds.size()[:-1], dtype=torch.long).to(image.device)
+
+            image_feat = F.normalize(
+                self.vision_proj(image_embeds[:, 0, :]), dim=-1)
             text_output = self.text_encoder(
-                question.input_ids,
-                attention_mask=question.attention_mask,
+                text.input_ids,
+                attention_mask=text.attention_mask,
                 return_dict=True)
             text_embeds = text_output.last_hidden_state
-            fusion_output = self.fusion_encoder(
+            text_feat = F.normalize(
+                self.text_proj(text_embeds[:, 0, :]), dim=-1)
+
+            idx = idx.view(-1, 1)
+            idx_all = torch.cat(
+                [idx.t(), self.idx_queue.clone().detach()], dim=1)
+            pos_idx = torch.eq(idx, idx_all).float()
+            sim_targets = pos_idx / pos_idx.sum(1, keepdim=True)
+
+            with torch.no_grad():
+                self._momentum_update()
+                image_embeds_m = self.visual_encoder_m.visual(
+                    image, skip_last_layer=True)
+                if self.large:
+                    image_embeds_m = self.dropout_m(
+                        self.visn_layer_norm_m(self.visn_fc_m(image_embeds_m)))
+                image_feat_m = F.normalize(
+                    self.vision_proj_m(image_embeds_m[:, 0, :]), dim=-1)
+                image_feat_all = torch.cat(
+                    [image_feat_m.t(),
+                     self.image_queue.clone().detach()],
+                    dim=1)
+                text_output_m = self.text_encoder_m(
+                    text.input_ids,
+                    attention_mask=text.attention_mask,
+                    return_dict=True)
+                text_feat_m = F.normalize(
+                    self.text_proj_m(text_output_m.last_hidden_state[:, 0, :]),
+                    dim=-1)
+                text_feat_all = torch.cat(
+                    [text_feat_m.t(),
+                     self.text_queue.clone().detach()], dim=1)
+
+                if self.distill:
+                    sim_i2t_m = image_feat_m @ text_feat_all / self.temp
+                    sim_t2i_m = text_feat_m @ image_feat_all / self.temp
+
+                    sim_i2t_targets = self.alpha * F.softmax(
+                        sim_i2t_m, dim=1) + (1 - self.alpha) * sim_targets
+                    sim_t2i_targets = self.alpha * F.softmax(
+                        sim_t2i_m, dim=1) + (1 - self.alpha) * sim_targets
+
+            sim_i2t = image_feat @ text_feat_all / self.temp
+            sim_t2i = text_feat @ image_feat_all / self.temp
+
+            if self.distill:
+                loss_i2t = -torch.sum(
+                    F.log_softmax(sim_i2t, dim=1) * sim_i2t_targets,
+                    dim=1).mean()
+                loss_t2i = -torch.sum(
+                    F.log_softmax(sim_t2i, dim=1) * sim_t2i_targets,
+                    dim=1).mean()
+            else:
+                loss_i2t = -torch.sum(
+                    F.log_softmax(sim_i2t, dim=1) * sim_targets, dim=1).mean()
+                loss_t2i = -torch.sum(
+                    F.log_softmax(sim_t2i, dim=1) * sim_targets, dim=1).mean()
+
+            loss_ita = (loss_i2t + loss_t2i) / 2
+
+            self._dequeue_and_enqueue(image_feat_m, text_feat_m, idx)
+
+            # forward the positive image-text pair
+            _, output_pos = self.fusion_encoder(
                 encoder_embeds=text_embeds,
-                attention_mask=question.attention_mask,
+                attention_mask=text.attention_mask,
                 encoder_hidden_states=image_embeds,
                 encoder_attention_mask=image_atts,
-                return_dict=False)
-            image_output, question_output = fusion_output
-            question_output = torch.cat([image_output, question_output], 1)
-            merge_text_attention = torch.cat(
-                [image_atts, question.attention_mask], 1)
-            topk_ids, topk_probs = self.generation(question_output,
-                                                   merge_text_attention)
-            return topk_ids, topk_probs
+                return_dict=False,
+            )
+            with torch.no_grad():
+                bs = image.size(0)
+                weights_i2t = F.softmax(sim_i2t[:, :bs], dim=1)
+                weights_t2i = F.softmax(sim_t2i[:, :bs], dim=1)
+
+                mask = torch.eq(idx, idx.T)
+                weights_i2t.masked_fill_(mask, 0)
+                weights_t2i.masked_fill_(mask, 0)
+
+            # select a negative image for each text
+            image_embeds_neg = []
+            for b in range(bs):
+                neg_idx = torch.multinomial(weights_t2i[b], 1).item()
+                image_embeds_neg.append(image_embeds[neg_idx])
+            image_embeds_neg = torch.stack(image_embeds_neg, dim=0)
+
+            # select a negative text for each image
+            text_embeds_neg = []
+            text_atts_neg = []
+            for b in range(bs):
+                neg_idx = torch.multinomial(weights_i2t[b], 1).item()
+                text_embeds_neg.append(text_embeds[neg_idx])
+                text_atts_neg.append(text.attention_mask[neg_idx])
+            text_embeds_neg = torch.stack(text_embeds_neg, dim=0)
+            text_atts_neg = torch.stack(text_atts_neg, dim=0)
+
+            text_embeds_all = torch.cat([text_embeds, text_embeds_neg], dim=0)
+            text_atts_all = torch.cat([text.attention_mask, text_atts_neg],
+                                      dim=0)
+
+            image_embeds_all = torch.cat([image_embeds_neg, image_embeds],
+                                         dim=0)
+            image_atts_all = torch.cat([image_atts, image_atts], dim=0)
+
+            _, output_neg = self.fusion_encoder(
+                encoder_embeds=text_embeds_all,
+                attention_mask=text_atts_all,
+                encoder_hidden_states=image_embeds_all,
+                encoder_attention_mask=image_atts_all,
+                return_dict=False,
+            )
+
+            vl_embeddings = torch.cat(
+                [output_pos[:, 0, :], output_neg[:, 0, :]], dim=0)
+            vl_output = self.itm_head(vl_embeddings)
+
+            ones_tmp = torch.ones(bs, dtype=torch.long)
+            zeros_tmp = torch.zeros(2 * bs, dtype=torch.long)
+            itm_labels = torch.cat([ones_tmp, zeros_tmp],
+                                   dim=0).to(image.device)
+            loss_itm = F.cross_entropy(vl_output, itm_labels)
+
+            return loss_ita + loss_itm
+        else:
+            text_output = self.text_encoder(
+                text.input_ids, attention_mask=text.attention_mask)
+            text_feat = text_output.last_hidden_state
+            image_feat = self.visual_encoder.visual(
+                image, skip_last_layer=True)
+            image_feat = self.visn_layer_norm(self.visn_fc(image_feat))
+            image_att = torch.ones(
+                image_feat.size()[:-1],
+                dtype=torch.long,
+                device=image_feat.device)
+            _, output = self.fusion_encoder(
+                encoder_embeds=text_feat,
+                attention_mask=text.attention_mask,
+                encoder_hidden_states=image_feat,
+                encoder_attention_mask=image_att,
+                return_dict=False,
+            )
+            scores = self.itm_head(output[:, 0, :])
+            scores = F.softmax(scores, dim=-1)
+
+            return scores
diff --git a/modelscope/models/multi_modal/mplug/predictor.py b/modelscope/models/multi_modal/mplug/predictor.py
index c976baa1..6375d1d7 100755
--- a/modelscope/models/multi_modal/mplug/predictor.py
+++ b/modelscope/models/multi_modal/mplug/predictor.py
@@ -1,3 +1,19 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from __future__ import print_function
 
 import torch
diff --git a/modelscope/models/multi_modal/mplug_for_all_tasks.py b/modelscope/models/multi_modal/mplug_for_all_tasks.py
index bb5a9c46..64a7dd7b 100644
--- a/modelscope/models/multi_modal/mplug_for_all_tasks.py
+++ b/modelscope/models/multi_modal/mplug_for_all_tasks.py
@@ -1,10 +1,15 @@
-from typing import Dict
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os.path as osp
+from typing import Dict, List
 
 from modelscope.metainfo import Models
 from modelscope.models import TorchModel
 from modelscope.models.base import Tensor
 from modelscope.models.builder import MODELS
-from modelscope.utils.constant import Tasks
+from modelscope.outputs import OutputKeys
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
 
 __all__ = ['MPlugForAllTasks']
 
@@ -12,6 +17,7 @@ __all__ = ['MPlugForAllTasks']
 @MODELS.register_module(
     Tasks.visual_question_answering, module_name=Models.mplug)
 @MODELS.register_module(Tasks.image_captioning, module_name=Models.mplug)
+@MODELS.register_module(Tasks.image_text_retrieval, module_name=Models.mplug)
 class MPlugForAllTasks(TorchModel):
 
     def __init__(self, model_dir: str, *args, **kwargs):
@@ -25,12 +31,6 @@ class MPlugForAllTasks(TorchModel):
         self.model = MPlug.from_pretrained(model_dir)
         self.tokenizer = self.model.tokenizer
 
-    def train(self):
-        return self.model.train()
-
-    def eval(self):
-        return self.model.eval()
-
     def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
         """return the result by the model
 
@@ -45,13 +45,61 @@ class MPlugForAllTasks(TorchModel):
                     }
         """
 
-        topk_ids, _ = self.model(**input)
         replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
                                ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
                                ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))
 
-        pred_string = self.tokenizer.decode(topk_ids[0][0])
-        for _old, _new in replace_tokens_bert:
-            pred_string = pred_string.replace(_old, _new)
-        pred_string = pred_string.strip()
-        return pred_string
+        # get task from config file
+        task = Config.from_file(
+            osp.join(self.model_dir, ModelFile.CONFIGURATION)).task
+
+        # inference
+        if not self.training and 'question' in input:
+            output = self.model(input['image'], input['question'], train=False)
+            if task == Tasks.image_text_retrieval:
+                return {OutputKeys.SCORES: output[0].tolist()}
+            topk_ids, _ = output
+            pred_string: str = \
+                self.tokenizer.decode(topk_ids[0][0])
+            for _old, _new in replace_tokens_bert:
+                pred_string = pred_string.replace(_old, _new)
+            pred_string = pred_string.strip()
+            output_key = OutputKeys.CAPTION \
+                if task == Tasks.image_captioning else OutputKeys.TEXT
+            return {output_key: pred_string}
+
+        # train and evaluate
+        import addict
+        image = input['image']
+        answer = addict.Dict(
+            input_ids=input['answer_input_ids'],
+            attention_mask=input['answer_attention_mask'])
+        if 'index' not in input:
+            question = addict.Dict(
+                input_ids=input['question_input_ids'],
+                attention_mask=input['question_attention_mask'])
+            output = self.model(image, question, answer, train=self.training)
+        else:
+            index = input['index']
+            output = self.model(image, answer, index, train=self.training)
+        if self.training:
+            return {OutputKeys.LOSS: output}
+
+        # evaluate: decode top-1 predictions and targets, then clean both
+        topk_ids, _ = output
+        preds: List[str] = [
+            self.tokenizer.decode(batch[0]) for batch in topk_ids
+        ]
+        for i in range(len(preds)):
+            for _old, _new in replace_tokens_bert:
+                preds[i] = preds[i].replace(_old, _new)
+            preds[i] = preds[i].strip()
+        tgts: List[str] = [
+            self.tokenizer.decode(batch)
+            for batch in input['answer_input_ids'].cpu().numpy().tolist()
+        ]
+        for i in range(len(tgts)):
+            for _old, _new in replace_tokens_bert:
+                tgts[i] = tgts[i].replace(_old, _new)
+            tgts[i] = tgts[i].strip()
+        return {'preds': preds, 'tgts': tgts}
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/__init__.py b/modelscope/models/multi_modal/multi_stage_diffusion/__init__.py
new file mode 100644
index 00000000..accbb56e
--- /dev/null
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/__init__.py
@@ -0,0 +1 @@
+from .model import MultiStageDiffusionForTextToImageSynthesis
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/clip.py b/modelscope/models/multi_modal/multi_stage_diffusion/clip.py
new file mode 100644
index 00000000..98727066
--- /dev/null
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/clip.py
@@ -0,0 +1,319 @@
+# Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+__all__ = ['CLIP']
+
+
+def to_fp16(m):
+    if isinstance(m, (nn.Linear, nn.Conv2d)):
+        m.weight.data = m.weight.data.half()
+        if m.bias is not None:
+            m.bias.data = m.bias.data.half()
+    elif hasattr(m, 'head'):
+        p = getattr(m, 'head')
+        p.data = p.data.half()
+
+
+class QuickGELU(nn.Module):
+
+    def forward(self, x):
+        return x * torch.sigmoid(1.702 * x)
+
+
+class LayerNorm(nn.LayerNorm):
+    r"""Subclass of nn.LayerNorm to handle fp16.
+    """
+
+    def forward(self, x):
+        return super(LayerNorm, self).forward(x.float()).type_as(x)
+
+
+class SelfAttention(nn.Module):
+
+    def __init__(self, dim, num_heads, attn_dropout=0.0, proj_dropout=0.0):
+        assert dim % num_heads == 0
+        super(SelfAttention, self).__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = 1.0 / math.sqrt(self.head_dim)
+
+        # layers
+        self.to_qkv = nn.Linear(dim, dim * 3)
+        self.attn_dropout = nn.Dropout(attn_dropout)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_dropout = nn.Dropout(proj_dropout)
+
+    def forward(self, x, mask=None):
+        r"""x:      [B, L, C].
+            mask:   [*, L, L].
+        """
+        b, l, _, n = *x.size(), self.num_heads
+
+        # compute query, key, and value
+        q, k, v = self.to_qkv(x.transpose(0, 1)).chunk(3, dim=-1)
+        q = q.reshape(l, b * n, -1).transpose(0, 1)
+        k = k.reshape(l, b * n, -1).transpose(0, 1)
+        v = v.reshape(l, b * n, -1).transpose(0, 1)
+
+        # compute attention
+        attn = self.scale * torch.bmm(q, k.transpose(1, 2))
+        if mask is not None:
+            attn = attn.masked_fill(mask[:, :l, :l] == 0, float('-inf'))
+        attn = F.softmax(attn.float(), dim=-1).type_as(attn)
+        attn = self.attn_dropout(attn)
+
+        # gather context
+        x = torch.bmm(attn, v)
+        x = x.view(b, n, l, -1).transpose(1, 2).reshape(b, l, -1)
+
+        # output
+        x = self.proj(x)
+        x = self.proj_dropout(x)
+        return x
+
+
+class AttentionBlock(nn.Module):
+
+    def __init__(self, dim, num_heads, attn_dropout=0.0, proj_dropout=0.0):
+        super(AttentionBlock, self).__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+
+        # layers
+        self.norm1 = LayerNorm(dim)
+        self.attn = SelfAttention(dim, num_heads, attn_dropout, proj_dropout)
+        self.norm2 = LayerNorm(dim)
+        self.mlp = nn.Sequential(
+            nn.Linear(dim, dim * 4), QuickGELU(), nn.Linear(dim * 4, dim),
+            nn.Dropout(proj_dropout))
+
+    def forward(self, x, mask=None):
+        x = x + self.attn(self.norm1(x), mask)
+        x = x + self.mlp(self.norm2(x))
+        return x
+
+
+class VisionTransformer(nn.Module):
+
+    def __init__(self,
+                 image_size=224,
+                 patch_size=16,
+                 dim=768,
+                 out_dim=512,
+                 num_heads=12,
+                 num_layers=12,
+                 attn_dropout=0.0,
+                 proj_dropout=0.0,
+                 embedding_dropout=0.0):
+        assert image_size % patch_size == 0
+        super(VisionTransformer, self).__init__()
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.dim = dim
+        self.out_dim = out_dim
+        self.num_heads = num_heads
+        self.num_layers = num_layers
+        self.num_patches = (image_size // patch_size)**2
+
+        # embeddings
+        gain = 1.0 / math.sqrt(dim)
+        self.patch_embedding = nn.Conv2d(
+            3, dim, kernel_size=patch_size, stride=patch_size, bias=False)
+        self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
+        self.pos_embedding = nn.Parameter(
+            gain * torch.randn(1, self.num_patches + 1, dim))
+        self.dropout = nn.Dropout(embedding_dropout)
+
+        # transformer
+        self.pre_norm = LayerNorm(dim)
+        self.transformer = nn.Sequential(*[
+            AttentionBlock(dim, num_heads, attn_dropout, proj_dropout)
+            for _ in range(num_layers)
+        ])
+        self.post_norm = LayerNorm(dim)
+
+        # head
+        self.head = nn.Parameter(gain * torch.randn(dim, out_dim))
+
+    def forward(self, x):
+        b, dtype = x.size(0), self.head.dtype
+        x = x.type(dtype)
+
+        # patch-embedding
+        x = self.patch_embedding(x).flatten(2).permute(0, 2, 1)  # [b, n, c]
+        x = torch.cat([self.cls_embedding.repeat(b, 1, 1).type(dtype), x],
+                      dim=1)
+        x = self.dropout(x + self.pos_embedding.type(dtype))
+        x = self.pre_norm(x)
+
+        # transformer
+        x = self.transformer(x)
+
+        # head
+        x = self.post_norm(x)
+        x = torch.mm(x[:, 0, :], self.head)
+        return x
+
+    def fp16(self):
+        return self.apply(to_fp16)
+
+
+class TextTransformer(nn.Module):
+
+    def __init__(self,
+                 vocab_size,
+                 text_len,
+                 dim=512,
+                 out_dim=512,
+                 num_heads=8,
+                 num_layers=12,
+                 attn_dropout=0.0,
+                 proj_dropout=0.0,
+                 embedding_dropout=0.0):
+        super(TextTransformer, self).__init__()
+        self.vocab_size = vocab_size
+        self.text_len = text_len
+        self.dim = dim
+        self.out_dim = out_dim
+        self.num_heads = num_heads
+        self.num_layers = num_layers
+
+        # embeddings
+        self.token_embedding = nn.Embedding(vocab_size, dim)
+        self.pos_embedding = nn.Parameter(0.01 * torch.randn(1, text_len, dim))
+        self.dropout = nn.Dropout(embedding_dropout)
+
+        # transformer
+        self.transformer = nn.ModuleList([
+            AttentionBlock(dim, num_heads, attn_dropout, proj_dropout)
+            for _ in range(num_layers)
+        ])
+        self.norm = LayerNorm(dim)
+
+        # head
+        gain = 1.0 / math.sqrt(dim)
+        self.head = nn.Parameter(gain * torch.randn(dim, out_dim))
+
+        # causal attention mask
+        self.register_buffer('attn_mask',
+                             torch.tril(torch.ones(1, text_len, text_len)))
+
+    def forward(self, x):
+        eot, dtype = x.argmax(dim=-1), self.head.dtype
+
+        # embeddings
+        x = self.dropout(
+            self.token_embedding(x).type(dtype)
+            + self.pos_embedding.type(dtype))
+
+        # transformer
+        for block in self.transformer:
+            x = block(x, self.attn_mask)
+
+        # head
+        x = self.norm(x)
+        x = torch.mm(x[torch.arange(x.size(0)), eot], self.head)
+        return x
+
+    def fp16(self):
+        return self.apply(to_fp16)
+
+
+class CLIP(nn.Module):
+
+    def __init__(self,
+                 embed_dim=512,
+                 image_size=224,
+                 patch_size=16,
+                 vision_dim=768,
+                 vision_heads=12,
+                 vision_layers=12,
+                 vocab_size=49408,
+                 text_len=77,
+                 text_dim=512,
+                 text_heads=8,
+                 text_layers=12,
+                 attn_dropout=0.0,
+                 proj_dropout=0.0,
+                 embedding_dropout=0.0):
+        super(CLIP, self).__init__()
+        self.embed_dim = embed_dim
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.vision_dim = vision_dim
+        self.vision_heads = vision_heads
+        self.vision_layers = vision_layers
+        self.vocab_size = vocab_size
+        self.text_len = text_len
+        self.text_dim = text_dim
+        self.text_heads = text_heads
+        self.text_layers = text_layers
+
+        # models
+        self.visual = VisionTransformer(
+            image_size=image_size,
+            patch_size=patch_size,
+            dim=vision_dim,
+            out_dim=embed_dim,
+            num_heads=vision_heads,
+            num_layers=vision_layers,
+            attn_dropout=attn_dropout,
+            proj_dropout=proj_dropout,
+            embedding_dropout=embedding_dropout)
+        self.textual = TextTransformer(
+            vocab_size=vocab_size,
+            text_len=text_len,
+            dim=text_dim,
+            out_dim=embed_dim,
+            num_heads=text_heads,
+            num_layers=text_layers,
+            attn_dropout=attn_dropout,
+            proj_dropout=proj_dropout,
+            embedding_dropout=embedding_dropout)
+        self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))
+
+    def forward(self, imgs, txt_tokens):
+        r"""imgs:       [B, C, H, W] of torch.float32.
+            txt_tokens: [B, T] of torch.long.
+        """
+        xi = self.visual(imgs)
+        xt = self.textual(txt_tokens)
+
+        # normalize features
+        xi = F.normalize(xi, p=2, dim=1)
+        xt = F.normalize(xt, p=2, dim=1)
+
+        # logits
+        scale = self.log_scale.exp()
+        logits_i2t = scale * torch.mm(xi, xt.t())
+        logits_t2i = scale * torch.mm(xt, xi.t())
+        return logits_i2t, logits_t2i
+
+    def init_weights(self):
+        r"""Re-initialize embedding, attention and MLP weights in place."""
+        nn.init.normal_(self.textual.token_embedding.weight, std=0.02)
+        nn.init.normal_(self.visual.patch_embedding.weight, std=0.1)
+
+        # attentions: gains use each tower's own width and block count
+        for modality in ['visual', 'textual']:
+            dim = self.vision_dim if modality == 'visual' else self.text_dim
+            transformer = getattr(self, modality).transformer
+            proj_gain = (1.0 / math.sqrt(dim)) * (
+                1.0 / math.sqrt(2 * len(transformer)))
+            attn_gain = 1.0 / math.sqrt(dim)
+            mlp_gain = 1.0 / math.sqrt(2.0 * dim)
+            for block in transformer:
+                nn.init.normal_(block.attn.to_qkv.weight, std=attn_gain)
+                nn.init.normal_(block.attn.proj.weight, std=proj_gain)
+                nn.init.normal_(block.mlp[0].weight, std=mlp_gain)
+                nn.init.normal_(block.mlp[2].weight, std=proj_gain)
+
+    def fp16(self):
+        return self.apply(to_fp16)
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py b/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py
new file mode 100644
index 00000000..eb52a48b
--- /dev/null
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py
@@ -0,0 +1,322 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+__all__ = ['Decoder']
+
+
+def sinusoidal_embedding(timesteps, dim):
+    r"""Build sinusoidal embeddings for a 1-D tensor of timesteps.
+
+    Args:
+        timesteps: 1-D tensor of timesteps; cast to float before use.
+        dim: width of the returned embedding.
+
+    Returns:
+        A tensor with one row per timestep and ``dim`` columns: the cosines
+        of ``dim // 2`` frequencies, then the matching sines, plus a single
+        zero column when ``dim`` is odd.
+    """
+    num_freqs = dim // 2
+    timesteps = timesteps.float()
+
+    # frequencies 10000^(-k / num_freqs) for k = 0 .. num_freqs - 1
+    exponents = -torch.arange(num_freqs).to(timesteps).div(num_freqs)
+    angles = torch.outer(timesteps, torch.pow(10000, exponents))
+
+    # cosine half first, sine half second
+    parts = [torch.cos(angles), torch.sin(angles)]
+    if dim % 2 != 0:
+        # odd widths are padded with one zero column
+        parts.append(torch.zeros_like(angles[:, :1]))
+    return torch.cat(parts, dim=1)
+
+
+class Resample(nn.Module):
+    r"""Spatial resampling by a factor of 0.5, 1.0 or 2.0.
+
+    - ``scale_factor == 2.0``: nearest-neighbour upsampling, followed by a
+      3x3 convolution when ``use_conv`` is set (identity otherwise).
+    - ``scale_factor == 0.5``: a stride-2 3x3 convolution when ``use_conv``
+      is set, otherwise 2x2 average pooling.
+    - ``scale_factor == 1.0``: identity.
+    """
+
+    def __init__(self, in_dim, out_dim, scale_factor, use_conv=False):
+        assert scale_factor in [0.5, 1.0, 2.0]
+        super(Resample, self).__init__()
+        self.in_dim, self.out_dim = in_dim, out_dim
+        self.scale_factor, self.use_conv = scale_factor, use_conv
+
+        # pick the resampling operator
+        if scale_factor == 1.0:
+            op = nn.Identity()
+        elif scale_factor == 2.0:
+            if use_conv:
+                smooth = nn.Conv2d(in_dim, out_dim, 3, padding=1)
+            else:
+                smooth = nn.Identity()
+            op = nn.Sequential(
+                nn.Upsample(scale_factor=scale_factor, mode='nearest'),
+                smooth)
+        elif use_conv:
+            op = nn.Conv2d(in_dim, out_dim, 3, stride=2, padding=1)
+        else:
+            op = nn.AvgPool2d(kernel_size=2, stride=2)
+        self.resample = op
+
+    def forward(self, x):
+        r"""Apply the configured resampling operator to ``x``."""
+        return self.resample(x)
+
+
+class ResidualBlock(nn.Module):
+    r"""Residual block conditioned on an embedding, with optional resampling.
+
+    The input goes through GroupNorm -> SiLU -> (resample) -> 3x3 conv, is
+    combined with a projection of the embedding, and then goes through
+    GroupNorm -> SiLU -> Dropout -> 3x3 conv. The (resampled) input is added
+    back through a shortcut, which is a 1x1 conv when the channel count
+    changes and an identity otherwise.
+    """
+
+    def __init__(self,
+                 in_dim,
+                 embed_dim,
+                 out_dim,
+                 use_scale_shift_norm=True,
+                 scale_factor=1.0,
+                 dropout=0.0):
+        super(ResidualBlock, self).__init__()
+        self.in_dim = in_dim
+        self.embed_dim = embed_dim
+        self.out_dim = out_dim
+        self.use_scale_shift_norm = use_scale_shift_norm
+        self.scale_factor = scale_factor
+
+        # scale-shift conditioning needs two values per output channel
+        cond_dim = out_dim * 2 if use_scale_shift_norm else out_dim
+
+        # layers
+        self.layer1 = nn.Sequential(
+            nn.GroupNorm(32, in_dim),
+            nn.SiLU(),
+            nn.Conv2d(in_dim, out_dim, 3, padding=1),
+        )
+        self.resample = Resample(in_dim, in_dim, scale_factor, use_conv=False)
+        self.embedding = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(embed_dim, cond_dim),
+        )
+        self.layer2 = nn.Sequential(
+            nn.GroupNorm(32, out_dim),
+            nn.SiLU(),
+            nn.Dropout(dropout),
+            nn.Conv2d(out_dim, out_dim, 3, padding=1),
+        )
+        if in_dim == out_dim:
+            self.shortcut = nn.Identity()
+        else:
+            self.shortcut = nn.Conv2d(in_dim, out_dim, 1)
+
+        # the last conv starts with zero weights
+        nn.init.zeros_(self.layer2[-1].weight)
+
+    def forward(self, x, e):
+        r"""Apply the block.
+
+        Args:
+            x: feature map with ``in_dim`` channels.
+            e: embedding whose last axis has ``embed_dim`` entries; two
+                trailing singleton axes are appended to its projection so it
+                broadcasts over the spatial axes.
+        """
+        skip = self.resample(x)
+
+        # norm + activation, then resample, then the first conv
+        pre_conv, conv_in = self.layer1[:-1], self.layer1[-1]
+        h = conv_in(self.resample(pre_conv(x)))
+
+        cond = self.embedding(e)[..., None, None].type(h.dtype)
+        if self.use_scale_shift_norm:
+            # modulate the normalized features, then run the rest of layer2
+            scale, shift = cond.chunk(2, dim=1)
+            h = self.layer2[1:](self.layer2[0](h) * (1 + scale) + shift)
+        else:
+            h = self.layer2(h + cond)
+        return h + self.shortcut(skip)
+
+
+class AttentionBlock(nn.Module):
+    r"""Multi-head self-attention over the spatial positions of a feature
+    map, with optional extra keys/values projected from a context sequence.
+
+    ``head_dim`` takes precedence over ``num_heads`` when both are given.
+    The output projection starts with zero weights.
+    """
+
+    def __init__(self, dim, context_dim=None, num_heads=None, head_dim=None):
+        # consider head_dim first, then num_heads
+        if head_dim:
+            num_heads = dim // head_dim
+        head_dim = dim // num_heads
+        assert num_heads * head_dim == dim
+        super(AttentionBlock, self).__init__()
+        self.dim = dim
+        self.context_dim = context_dim
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        # applied to both queries and keys, i.e. head_dim^-0.5 on the logits
+        self.scale = math.pow(head_dim, -0.25)
+
+        # layers
+        self.norm = nn.GroupNorm(32, dim)
+        self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
+        if context_dim is not None:
+            self.context_kv = nn.Linear(context_dim, dim * 2)
+        self.proj = nn.Conv2d(dim, dim, 1)
+
+        # zero out the last layer params
+        nn.init.zeros_(self.proj.weight)
+
+    def forward(self, x, context=None):
+        r"""x:       [B, C, H, W].
+            context: [B, L, C] or None.
+        """
+        b, c, h, w = x.size()
+        n, d = self.num_heads, self.head_dim
+        residual = x
+
+        # queries, keys and values: each [B, n, d, H * W]
+        qkv = self.to_qkv(self.norm(x)).view(b, n * 3, d, h * w)
+        q, k, v = qkv.chunk(3, dim=1)
+        if context is not None:
+            # context keys/values are prepended along the position axis
+            ctx = self.context_kv(context).reshape(b, -1, n * 2, d)
+            ck, cv = ctx.permute(0, 2, 3, 1).chunk(2, dim=1)
+            k = torch.cat([ck, k], dim=-1)
+            v = torch.cat([cv, v], dim=-1)
+
+        # attention weights over keys for every query position
+        logits = torch.matmul(q.transpose(-1, -2) * self.scale, k * self.scale)
+        weights = F.softmax(logits, dim=-1)
+
+        # weighted sum of values, folded back into a feature map
+        out = torch.matmul(v, weights.transpose(-1, -2)).reshape(b, c, h, w)
+        return self.proj(out) + residual
+
+
+class Decoder(nn.Module):
+    r"""Encoder / middle / decoder network of residual and attention blocks,
+    conditioned on a timestep ``t`` and a vector ``y``.
+
+    The output of every encoder stage is kept and later concatenated along
+    the channel axis to the input of a decoder stage, in reverse order.
+    """
+
+    def __init__(self,
+                 in_dim=3,
+                 dim=512,
+                 y_dim=512,
+                 context_dim=512,
+                 out_dim=6,
+                 dim_mult=[1, 2, 3, 4],
+                 num_heads=None,
+                 head_dim=64,
+                 num_res_blocks=3,
+                 attn_scales=[1 / 2, 1 / 4, 1 / 8],
+                 resblock_resample=True,
+                 use_scale_shift_norm=True,
+                 dropout=0.1):
+        r"""Build the embedding layers, encoder, middle, decoder and head.
+
+        Args:
+            in_dim: number of channels of the input feature map.
+            dim: base channel width; also the width of the sinusoidal
+                timestep embedding.
+            y_dim: width of the conditioning vector ``y``.
+            context_dim: width of each of the 4 context tokens derived
+                from ``y``.
+            out_dim: number of channels produced by the head.
+            dim_mult: per-level multipliers applied to ``dim``.
+            num_heads, head_dim: forwarded to every ``AttentionBlock``.
+            num_res_blocks: residual blocks per encoder level (each decoder
+                level uses one more).
+            attn_scales: relative resolutions (1.0 = input resolution) at
+                which attention blocks are added in the encoder and decoder.
+            resblock_resample: resample with a ``ResidualBlock`` when true,
+                otherwise with a convolutional ``Resample``.
+            use_scale_shift_norm: forwarded to every ``ResidualBlock``.
+            dropout: forwarded to every ``ResidualBlock``.
+        """
+        embed_dim = dim * 4
+        super(Decoder, self).__init__()
+        self.in_dim = in_dim
+        self.dim = dim
+        self.y_dim = y_dim
+        self.context_dim = context_dim
+        self.embed_dim = embed_dim
+        self.out_dim = out_dim
+        self.dim_mult = dim_mult
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.num_res_blocks = num_res_blocks
+        self.attn_scales = attn_scales
+        self.resblock_resample = resblock_resample
+        self.use_scale_shift_norm = use_scale_shift_norm
+
+        # params
+        # channel widths entering/leaving each encoder and decoder level
+        enc_dims = [dim * u for u in [1] + dim_mult]
+        dec_dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
+        # channel width of every encoder output; popped by the decoder
+        shortcut_dims = []
+        # current resolution relative to the input; halved on each downsample
+        scale = 1.0
+
+        # embeddings
+        self.time_embedding = nn.Sequential(
+            nn.Linear(dim, embed_dim), nn.SiLU(),
+            nn.Linear(embed_dim, embed_dim))
+        self.y_embedding = nn.Sequential(
+            nn.Linear(y_dim, embed_dim), nn.SiLU(),
+            nn.Linear(embed_dim, embed_dim))
+        # produces context_dim * 4 values, viewed as 4 tokens in forward()
+        self.context_embedding = nn.Sequential(
+            nn.Linear(y_dim, embed_dim), nn.SiLU(),
+            nn.Linear(embed_dim, context_dim * 4))
+
+        # encoder
+        self.encoder = nn.ModuleList(
+            [nn.Conv2d(self.in_dim, dim, 3, padding=1)])
+        shortcut_dims.append(dim)
+        # NOTE: the loop variables below shadow the in_dim / out_dim
+        # arguments; the constructor values stay available as self.in_dim
+        # and self.out_dim.
+        for i, (in_dim,
+                out_dim) in enumerate(zip(enc_dims[:-1], enc_dims[1:])):
+            for j in range(num_res_blocks):
+                # residual (+attention) blocks
+                block = nn.ModuleList([
+                    ResidualBlock(in_dim, embed_dim, out_dim,
+                                  use_scale_shift_norm, 1.0, dropout)
+                ])
+                if scale in attn_scales:
+                    block.append(
+                        AttentionBlock(out_dim, context_dim, num_heads,
+                                       head_dim))
+                in_dim = out_dim
+                self.encoder.append(block)
+                shortcut_dims.append(out_dim)
+
+                # downsample
+                # (after the last residual block of every level but the last)
+                if i != len(dim_mult) - 1 and j == num_res_blocks - 1:
+                    if resblock_resample:
+                        downsample = ResidualBlock(out_dim, embed_dim, out_dim,
+                                                   use_scale_shift_norm, 0.5,
+                                                   dropout)
+                    else:
+                        downsample = Resample(
+                            out_dim, out_dim, 0.5, use_conv=True)
+                    shortcut_dims.append(out_dim)
+                    scale /= 2.0
+                    self.encoder.append(downsample)
+
+        # middle
+        # (out_dim here is the width left over from the last encoder level)
+        self.middle = nn.ModuleList([
+            ResidualBlock(out_dim, embed_dim, out_dim, use_scale_shift_norm,
+                          1.0, dropout),
+            AttentionBlock(out_dim, context_dim, num_heads, head_dim),
+            ResidualBlock(out_dim, embed_dim, out_dim, use_scale_shift_norm,
+                          1.0, dropout)
+        ])
+
+        # decoder
+        self.decoder = nn.ModuleList()
+        for i, (in_dim,
+                out_dim) in enumerate(zip(dec_dims[:-1], dec_dims[1:])):
+            for j in range(num_res_blocks + 1):
+                # residual (+attention) blocks
+                # (input width grows by the width of the popped shortcut)
+                block = nn.ModuleList([
+                    ResidualBlock(in_dim + shortcut_dims.pop(), embed_dim,
+                                  out_dim, use_scale_shift_norm, 1.0, dropout)
+                ])
+                if scale in attn_scales:
+                    block.append(
+                        AttentionBlock(out_dim, context_dim, num_heads,
+                                       head_dim))
+                in_dim = out_dim
+
+                # upsample
+                # (appended to the last block of every level but the last)
+                if i != len(dim_mult) - 1 and j == num_res_blocks:
+                    if resblock_resample:
+                        upsample = ResidualBlock(out_dim, embed_dim, out_dim,
+                                                 use_scale_shift_norm, 2.0,
+                                                 dropout)
+                    else:
+                        upsample = Resample(
+                            out_dim, out_dim, 2.0, use_conv=True)
+                    scale *= 2.0
+                    block.append(upsample)
+                self.decoder.append(block)
+
+        # head
+        self.head = nn.Sequential(
+            nn.GroupNorm(32, out_dim), nn.SiLU(),
+            nn.Conv2d(out_dim, self.out_dim, 3, padding=1))
+
+        # zero out the last layer params
+        nn.init.zeros_(self.head[-1].weight)
+
+    def forward(self, x, t, y):
+        r"""Run the network.
+
+        Args:
+            x: feature map with ``self.in_dim`` channels.
+            t: 1-D tensor of timesteps, one per sample.
+            y: conditioning vectors whose last axis has ``y_dim`` entries.
+
+        Returns:
+            The output of the head, with ``self.out_dim`` channels.
+        """
+        # embeddings
+        e = self.time_embedding(sinusoidal_embedding(
+            t, self.dim)) + self.y_embedding(y)
+        # y is also turned into 4 context tokens for the attention blocks
+        context = self.context_embedding(y).view(-1, 4, self.context_dim)
+
+        # encoder
+        xs = []
+        for block in self.encoder:
+            x = self._forward_single(block, x, e, context)
+            xs.append(x)
+
+        # middle
+        for block in self.middle:
+            x = self._forward_single(block, x, e, context)
+
+        # decoder
+        for block in self.decoder:
+            # consume encoder outputs in reverse order
+            x = torch.cat([x, xs.pop()], dim=1)
+            x = self._forward_single(block, x, e, context)
+
+        # head
+        x = self.head(x)
+        return x
+
+    def _forward_single(self, module, x, e, context):
+        r"""Apply ``module`` to ``x`` with the arguments its type expects:
+        residual blocks receive ``e``, attention blocks receive ``context``,
+        module lists are applied element by element, and anything else is
+        called with ``x`` alone.
+        """
+        if isinstance(module, ResidualBlock):
+            x = module(x, e)
+        elif isinstance(module, AttentionBlock):
+            x = module(x, context)
+        elif isinstance(module, nn.ModuleList):
+            for block in module:
+                x = self._forward_single(block, x, e, context)
+        else:
+            x = module(x)
+        return x
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py b/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py
new file mode 100644
index 00000000..9677d7c4
--- /dev/null
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py
@@ -0,0 +1,642 @@
+# Part of the implementation is borrowed and modified from latent-diffusion,
+# publicly available at https://github.com/CompVis/latent-diffusion.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+import math
+
+import torch
+
+__all__ = ['GaussianDiffusion', 'beta_schedule']
+
+
+def kl_divergence(mu1, logvar1, mu2, logvar2):
+    r"""Element-wise KL divergence between two diagonal Gaussians,
+    KL(N(mu1, exp(logvar1)) || N(mu2, exp(logvar2))).
+    """
+    # contribution of the variances
+    var_term = -1.0 + logvar2 - logvar1 + torch.exp(logvar1 - logvar2)
+    # contribution of the distance between the means
+    mean_term = ((mu1 - mu2)**2) * torch.exp(-logvar2)
+    return 0.5 * (var_term + mean_term)
+
+
+def standard_normal_cdf(x):
+    r"""Fast tanh-based approximation of the standard normal cumulative
+    distribution function, evaluated element-wise on ``x``.
+    """
+    cubic = x + 0.044715 * torch.pow(x, 3)
+    squashed = torch.tanh(math.sqrt(2.0 / math.pi) * cubic)
+    return 0.5 * (1.0 + squashed)
+
+
+def discretized_gaussian_log_likelihood(x0, mean, log_scale):
+    r"""Log-likelihood of ``x0`` under a Gaussian discretized into bins of
+    half-width 1/255 around each value.
+
+    Values below -0.999 use the mass left of the upper bin edge, values above
+    0.999 use the mass right of the lower bin edge, and everything else uses
+    the mass inside the bin. Probabilities are clamped at 1e-12 before the
+    logarithm. All three arguments must have the same shape.
+    """
+    assert x0.shape == mean.shape == log_scale.shape
+    centered = x0 - mean
+    inv_std = torch.exp(-log_scale)
+
+    # CDF at the upper and lower edge of the bin
+    upper = standard_normal_cdf(inv_std * (centered + 1.0 / 255.0))
+    lower = standard_normal_cdf(inv_std * (centered - 1.0 / 255.0))
+
+    log_upper = torch.log(upper.clamp(min=1e-12))
+    log_tail = torch.log((1.0 - lower).clamp(min=1e-12))
+    log_bin = torch.log((upper - lower).clamp(min=1e-12))
+
+    # open-ended bins at both extremes, regular bins in between
+    interior = torch.where(x0 > 0.999, log_tail, log_bin)
+    log_probs = torch.where(x0 < -0.999, log_upper, interior)
+    assert log_probs.shape == x0.shape
+    return log_probs
+
+
+def _i(tensor, t, x):
+    r"""Index tensor using t and format the output according to x.
+
+    Args:
+        tensor: tensor of per-timestep values, indexed along its first axis.
+        t: long tensor of timestep indices; it must select ``x.size(0)``
+            values in total.
+        x: reference tensor that fixes the batch size, rank, dtype and device
+            of the result.
+
+    Returns:
+        ``tensor[t]`` viewed as ``[x.size(0), 1, ..., 1]`` (same rank as
+        ``x``) and converted to the dtype and device of ``x``.
+    """
+    shape = (x.size(0), ) + (1, ) * (x.ndim - 1)
+    # The indices may live on a different device than the indexed tensor
+    # (e.g. CUDA timesteps with a CPU schedule), which PyTorch rejects, so
+    # move them to the tensor's device before indexing.
+    return tensor[t.to(tensor.device)].view(shape).to(x)
+
+
+def beta_schedule(schedule,
+                  num_timesteps=1000,
+                  init_beta=None,
+                  last_beta=None):
+    r"""Return ``num_timesteps`` betas as a float64 tensor.
+
+    Args:
+        schedule: ``'linear'``, ``'quadratic'`` or ``'cosine'``.
+        num_timesteps: number of betas to produce.
+        init_beta: first beta of the linear/quadratic schedules; a falsy
+            value selects the schedule's default.
+        last_beta: last beta of the linear/quadratic schedules; a falsy
+            value selects the schedule's default.
+
+    Raises:
+        ValueError: if ``schedule`` is not one of the supported names.
+    """
+    if schedule == 'linear':
+        # defaults are rescaled so that 1000 steps gives 1e-4 .. 2e-2
+        ratio = 1000.0 / num_timesteps
+        first = init_beta or ratio * 0.0001
+        last = last_beta or ratio * 0.02
+        return torch.linspace(first, last, num_timesteps, dtype=torch.float64)
+
+    if schedule == 'quadratic':
+        # linear in sqrt(beta), then squared
+        first = init_beta or 0.0015
+        last = last_beta or 0.0195
+        roots = torch.linspace(
+            first**0.5, last**0.5, num_timesteps, dtype=torch.float64)
+        return roots**2
+
+    if schedule == 'cosine':
+
+        def squared_cosine(frac):
+            return math.cos((frac + 0.008) / 1.008 * math.pi / 2)**2
+
+        # beta = 1 - ratio of consecutive squared cosines, capped at 0.999
+        betas = [
+            min(
+                1.0 - squared_cosine((step + 1) / num_timesteps)
+                / squared_cosine(step / num_timesteps), 0.999)
+            for step in range(num_timesteps)
+        ]
+        return torch.tensor(betas, dtype=torch.float64)
+
+    raise ValueError(f'Unsupported schedule: {schedule}')
+
+
+class GaussianDiffusion(object):
+
+    def __init__(self,
+                 betas,
+                 mean_type='eps',
+                 var_type='learned_range',
+                 loss_type='mse',
+                 rescale_timesteps=False):
+        r"""Validate the configuration and precompute the schedule tensors.
+
+        Args:
+            betas: per-timestep betas in (0, 1]; converted to a float64
+                tensor unless it already is a ``torch.DoubleTensor``.
+            mean_type: ``'x0'``, ``'x_{t-1}'`` or ``'eps'``.
+            var_type: ``'learned'``, ``'learned_range'``, ``'fixed_large'``
+                or ``'fixed_small'``.
+            loss_type: ``'mse'``, ``'rescaled_mse'``, ``'kl'``,
+                ``'rescaled_kl'``, ``'l1'`` or ``'rescaled_l1'``.
+            rescale_timesteps: stored on the instance as given.
+        """
+        # check input
+        if not isinstance(betas, torch.DoubleTensor):
+            betas = torch.tensor(betas, dtype=torch.float64)
+        assert min(betas) > 0 and max(betas) <= 1
+        assert mean_type in ['x0', 'x_{t-1}', 'eps']
+        assert var_type in [
+            'learned', 'learned_range', 'fixed_large', 'fixed_small'
+        ]
+        assert loss_type in [
+            'mse', 'rescaled_mse', 'kl', 'rescaled_kl', 'l1', 'rescaled_l1'
+        ]
+        self.betas = betas
+        self.num_timesteps = len(betas)
+        self.mean_type = mean_type
+        self.var_type = var_type
+        self.loss_type = loss_type
+        self.rescale_timesteps = rescale_timesteps
+
+        # cumulative products of alpha, plus copies shifted by one step
+        alphas = 1 - self.betas
+        cumprod = torch.cumprod(alphas, dim=0)
+        cumprod_prev = torch.cat([alphas.new_ones([1]), cumprod[:-1]])
+        cumprod_next = torch.cat([cumprod[1:], alphas.new_zeros([1])])
+        self.alphas_cumprod = cumprod
+        self.alphas_cumprod_prev = cumprod_prev
+        self.alphas_cumprod_next = cumprod_next
+
+        # q(x_t | x_{t-1})
+        self.sqrt_alphas_cumprod = torch.sqrt(cumprod)
+        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - cumprod)
+        self.log_one_minus_alphas_cumprod = torch.log(1.0 - cumprod)
+        self.sqrt_recip_alphas_cumprod = torch.sqrt(1.0 / cumprod)
+        self.sqrt_recipm1_alphas_cumprod = torch.sqrt(1.0 / cumprod - 1)
+
+        # q(x_{t-1} | x_t, x_0)
+        variance = betas * (1.0 - cumprod_prev) / (1.0 - cumprod)
+        self.posterior_variance = variance
+        # clamped before the log so that tiny variances stay finite
+        self.posterior_log_variance_clipped = torch.log(variance.clamp(1e-20))
+        self.posterior_mean_coef1 = betas * torch.sqrt(cumprod_prev) / (
+            1.0 - cumprod)
+        self.posterior_mean_coef2 = (1.0 - cumprod_prev) * torch.sqrt(
+            alphas) / (1.0 - cumprod)
+
+    def q_sample(self, x0, t, noise=None):
+        r"""Sample from q(x_t | x_0).
+
+        Fresh Gaussian noise shaped like ``x0`` is drawn when ``noise`` is
+        not supplied.
+        """
+        if noise is None:
+            noise = torch.randn_like(x0)
+        signal = _i(self.sqrt_alphas_cumprod, t, x0) * x0
+        perturbation = _i(self.sqrt_one_minus_alphas_cumprod, t, x0) * noise
+        return signal + perturbation
+
+    def q_mean_variance(self, x0, t):
+        r"""Distribution of q(x_t | x_0).
+
+        Returns the tuple ``(mean, variance, log variance)``.
+        """
+        mean = _i(self.sqrt_alphas_cumprod, t, x0) * x0
+        variance = _i(1.0 - self.alphas_cumprod, t, x0)
+        log_variance = _i(self.log_one_minus_alphas_cumprod, t, x0)
+        return mean, variance, log_variance
+
+    def q_posterior_mean_variance(self, x0, xt, t):
+        r"""Distribution of q(x_{t-1} | x_t, x_0).
+
+        Returns the tuple ``(mean, variance, clipped log variance)``.
+        """
+        weight_x0 = _i(self.posterior_mean_coef1, t, xt)
+        weight_xt = _i(self.posterior_mean_coef2, t, xt)
+        mean = weight_x0 * x0 + weight_xt * xt
+        variance = _i(self.posterior_variance, t, xt)
+        log_variance = _i(self.posterior_log_variance_clipped, t, xt)
+        return mean, variance, log_variance
+
+    @torch.no_grad()
+    def p_sample(self,
+                 xt,
+                 t,
+                 model,
+                 model_kwargs={},
+                 clamp=None,
+                 percentile=None,
+                 condition_fn=None,
+                 guide_scale=None):
+        r"""Sample from p(x_{t-1} | x_t).
+            - condition_fn: for classifier-based guidance (guided-diffusion).
+            - guide_scale: for classifier-free guidance (glide/dalle-2).
+
+        Returns the tuple ``(x_{t-1}, x0)``, where ``x0`` is the estimate
+        returned by ``p_mean_variance``.
+        """
+        # predicted mean / variance of p(x_{t-1} | x_t) and the x0 estimate
+        mean, variance, log_variance, x0 = self.p_mean_variance(
+            xt, t, model, model_kwargs, clamp, percentile, guide_scale)
+
+        # noise is switched off for samples that are already at t == 0
+        noise = torch.randn_like(xt)
+        trailing = (1, ) * (xt.ndim - 1)
+        keep_noise = t.ne(0).float().view((-1, *trailing))
+
+        # optional classifier guidance shifts the mean along the gradient
+        if condition_fn is not None:
+            grad = condition_fn(xt, self._scale_timesteps(t), **model_kwargs)
+            mean = mean.float() + variance * grad.float()
+
+        std = torch.exp(0.5 * log_variance)
+        return mean + keep_noise * std * noise, x0
+
+    @torch.no_grad()
+    def p_sample_loop(self,
+                      noise,
+                      model,
+                      model_kwargs={},
+                      clamp=None,
+                      percentile=None,
+                      condition_fn=None,
+                      guide_scale=None):
+        r"""Sample from p(x_{t-1} | x_t) p(x_{t-2} | x_{t-1}) ... p(x_0 | x_1).
+
+        Starts from ``noise`` and calls ``p_sample`` once for every timestep,
+        from ``num_timesteps - 1`` down to 0; returns the final sample.
+        """
+        batch = noise.size(0)
+        sample = noise
+
+        # walk the chain from the last timestep back to the first
+        descending = torch.arange(self.num_timesteps).flip(0)
+        for step in descending:
+            t = torch.full(
+                (batch, ), step, dtype=torch.long, device=sample.device)
+            sample, _ = self.p_sample(sample, t, model, model_kwargs, clamp,
+                                      percentile, condition_fn, guide_scale)
+        return sample
+
+    def p_mean_variance(self,
+                        xt,
+                        t,
+                        model,
+                        model_kwargs={},
+                        clamp=None,
+                        percentile=None,
+                        guide_scale=None):
+        r"""Distribution of p(x_{t-1} | x_t).
+
+        Args:
+            xt: current sample.
+            t: long tensor of timesteps, one per sample.
+            model: callable invoked as ``model(xt, timesteps, **kwargs)``.
+            model_kwargs: keyword arguments for ``model``; must be a list of
+                two dicts (conditional, non-conditional) when ``guide_scale``
+                is given.
+            clamp: if set (and ``percentile`` is not), the returned ``x0`` is
+                clamped to ``[-clamp, clamp]``.
+            percentile: if set, the returned ``x0`` is thresholded per sample
+                at this quantile of ``|x0|`` (at least 1.0) and divided by
+                that threshold.
+            guide_scale: scale for classifier-free guidance, or ``None``.
+
+        Returns:
+            The tuple ``(mu, var, log_var, x0)``.
+
+        NOTE(review): ``mu`` is computed before ``x0`` is restricted, so
+        ``clamp`` / ``percentile`` only affect the returned ``x0`` -- confirm
+        that this is intended.
+        """
+        # predict distribution
+        if guide_scale is None:
+            out = model(xt, self._scale_timesteps(t), **model_kwargs)
+        else:
+            # classifier-free guidance
+            # (model_kwargs[0]: conditional kwargs; model_kwargs[1]: non-conditional kwargs)
+            assert isinstance(model_kwargs, list) and len(model_kwargs) == 2
+            y_out = model(xt, self._scale_timesteps(t), **model_kwargs[0])
+            u_out = model(xt, self._scale_timesteps(t), **model_kwargs[1])
+            # with a fixed variance every output channel is guided; otherwise
+            # only the first half is, and the remaining channels are taken
+            # from the conditional output unchanged
+            cond = self.var_type.startswith('fixed')
+            dim = y_out.size(1) if cond else y_out.size(1) // 2
+            u1 = u_out[:, :dim]
+            u2 = guide_scale * (y_out[:, :dim] - u_out[:, :dim])
+            out = torch.cat([u1 + u2, y_out[:, dim:]], dim=1)
+
+        # compute variance
+        if self.var_type == 'learned':
+            # second half of the channels is the log variance itself
+            out, log_var = out.chunk(2, dim=1)
+            var = torch.exp(log_var)
+        elif self.var_type == 'learned_range':
+            # second half of the channels interpolates (in log space) between
+            # the clipped posterior variance and beta; [-1, 1] -> [0, 1]
+            out, fraction = out.chunk(2, dim=1)
+            min_log_var = _i(self.posterior_log_variance_clipped, t, xt)
+            max_log_var = _i(torch.log(self.betas), t, xt)
+            fraction = (fraction + 1) / 2.0
+            log_var = fraction * max_log_var + (1 - fraction) * min_log_var
+            var = torch.exp(log_var)
+        elif self.var_type == 'fixed_large':
+            # beta_t, except that index 0 uses posterior_variance[1]
+            var = _i(
+                torch.cat([self.posterior_variance[1:2], self.betas[1:]]), t,
+                xt)
+            log_var = torch.log(var)
+        elif self.var_type == 'fixed_small':
+            var = _i(self.posterior_variance, t, xt)
+            log_var = _i(self.posterior_log_variance_clipped, t, xt)
+
+        # compute mean and x0
+        if self.mean_type == 'x_{t-1}':
+            # the model predicts the mean; invert the posterior mean for x0
+            mu = out  # x_{t-1}
+            u1 = _i(1.0 / self.posterior_mean_coef1, t, xt) * mu
+            u2 = _i(self.posterior_mean_coef2 / self.posterior_mean_coef1, t,
+                    xt) * xt
+            x0 = u1 - u2
+        elif self.mean_type == 'x0':
+            x0 = out
+            mu, _, _ = self.q_posterior_mean_variance(x0, xt, t)
+        elif self.mean_type == 'eps':
+            # the model predicts the noise; recover x0 from xt and eps
+            u1 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt
+            u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt) * out
+            x0 = u1 - u2
+            mu, _, _ = self.q_posterior_mean_variance(x0, xt, t)
+
+        # restrict the range of x0
+        if percentile is not None:
+            assert percentile > 0 and percentile <= 1  # e.g., 0.995
+            # per-sample threshold; the view assumes a 4-D x0
+            s = torch.quantile(
+                x0.flatten(1).abs(), percentile,
+                dim=1).clamp_(1.0).view(-1, 1, 1, 1)
+            x0 = torch.min(s, torch.max(-s, x0)) / s
+        elif clamp is not None:
+            x0 = x0.clamp(-clamp, clamp)
+        return mu, var, log_var, x0
+
+    @torch.no_grad()
+    def ddim_sample(self,
+                    xt,
+                    t,
+                    model,
+                    model_kwargs={},
+                    clamp=None,
+                    percentile=None,
+                    condition_fn=None,
+                    guide_scale=None,
+                    ddim_timesteps=20,
+                    eta=0.0):
+        r"""Sample from p(x_{t-1} | x_t) using DDIM.
+            - condition_fn: for classifier-based guidance (guided-diffusion).
+            - guide_scale: for classifier-free guidance (glide/dalle-2).
+            - ddim_timesteps: number of sampling steps; one step moves back by
+              ``num_timesteps // ddim_timesteps`` timesteps (clamped at 0).
+            - eta: multiplier of the noise standard deviation; the default
+              of 0.0 adds no noise.
+
+        Returns the tuple ``(x_{t-stride}, x0)``.
+        """
+        stride = self.num_timesteps // ddim_timesteps
+
+        # predict distribution of p(x_{t-1} | x_t)
+        # (only the x0 estimate is used here)
+        _, _, _, x0 = self.p_mean_variance(xt, t, model, model_kwargs, clamp,
+                                           percentile, guide_scale)
+        if condition_fn is not None:
+            # x0 -> eps
+            alpha = _i(self.alphas_cumprod, t, xt)
+            u1 = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0)
+            u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
+            eps = u1 / u2
+            # shift eps against the guidance gradient
+            eps = eps - (1 - alpha).sqrt() * condition_fn(
+                xt, self._scale_timesteps(t), **model_kwargs)
+
+            # eps -> x0
+            u1 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt
+            u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt) * eps
+            x0 = u1 - u2
+
+        # derive variables
+        # (eps is re-derived from the possibly restricted / guided x0)
+        u1 = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0)
+        u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
+        eps = u1 / u2
+        alphas = _i(self.alphas_cumprod, t, xt)
+        alphas_prev = _i(self.alphas_cumprod, (t - stride).clamp(0), xt)
+        u1 = (1 - alphas_prev) / (1 - alphas)
+        u2 = (1 - alphas / alphas_prev)
+        sigmas = eta * torch.sqrt(u1 * u2)
+
+        # random sample
+        noise = torch.randn_like(xt)
+        direction = torch.sqrt(1 - alphas_prev - sigmas**2) * eps
+        # no noise is added for samples that are at t == 0
+        mask = t.ne(0).float().view(-1, *((1, ) * (xt.ndim - 1)))
+        xt_1 = torch.sqrt(alphas_prev) * x0 + direction + mask * sigmas * noise
+        return xt_1, x0
+
+    @torch.no_grad()
+    def ddim_sample_loop(self,
+                         noise,
+                         model,
+                         model_kwargs={},
+                         clamp=None,
+                         percentile=None,
+                         condition_fn=None,
+                         guide_scale=None,
+                         ddim_timesteps=20,
+                         eta=0.0):
+        r"""Run DDIM sampling from ``noise`` and return the final sample.
+
+        ``ddim_sample`` is called once per selected timestep, from the
+        largest to the smallest.
+        """
+        batch = noise.size(0)
+        sample = noise
+
+        # diffusion process (TODO: clamp is inaccurate! Consider replacing the stride by explicit prev/next steps)
+        # timesteps 1, 1 + stride, ... capped at the last index, descending
+        stride = self.num_timesteps // ddim_timesteps
+        ascending = 1 + torch.arange(0, self.num_timesteps, stride)
+        descending = ascending.clamp(0, self.num_timesteps - 1).flip(0)
+        for step in descending:
+            t = torch.full(
+                (batch, ), step, dtype=torch.long, device=sample.device)
+            sample, _ = self.ddim_sample(sample, t, model, model_kwargs, clamp,
+                                         percentile, condition_fn, guide_scale,
+                                         ddim_timesteps, eta)
+        return sample
+
+    @torch.no_grad()
+    def ddim_reverse_sample(self,
+                            xt,
+                            t,
+                            model,
+                            model_kwargs={},
+                            clamp=None,
+                            percentile=None,
+                            guide_scale=None,
+                            ddim_timesteps=20):
+        r"""Sample from p(x_{t+1} | x_t) using DDIM reverse ODE (deterministic).
+
+        Returns the tuple ``(x_{t+stride}, x0)`` with
+        ``stride = num_timesteps // ddim_timesteps``.
+        """
+        stride = self.num_timesteps // ddim_timesteps
+
+        # x0 estimate from the model at the current timestep
+        x0 = self.p_mean_variance(xt, t, model, model_kwargs, clamp,
+                                  percentile, guide_scale)[3]
+
+        # noise implied by xt and the x0 estimate
+        numerator = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0
+        denominator = _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
+        eps = numerator / denominator
+
+        # cumulative alpha at the next step; a trailing zero is appended so
+        # that index num_timesteps is valid
+        padded = torch.cat(
+            [self.alphas_cumprod,
+             self.alphas_cumprod.new_zeros([1])])
+        next_index = (t + stride).clamp(0, self.num_timesteps)
+        alphas_next = _i(padded, next_index, xt)
+
+        # deterministic step towards more noise
+        stepped = torch.sqrt(alphas_next) * x0 + torch.sqrt(
+            1 - alphas_next) * eps
+        return stepped, x0
+
+    @torch.no_grad()
+    def ddim_reverse_sample_loop(self,
+                                 x0,
+                                 model,
+                                 model_kwargs={},
+                                 clamp=None,
+                                 percentile=None,
+                                 guide_scale=None,
+                                 ddim_timesteps=20):
+        r"""Run the DDIM reverse process starting from ``x0``.
+
+        ``ddim_reverse_sample`` is called once per selected timestep, in
+        ascending order starting at 0; the final sample is returned.
+        """
+        batch = x0.size(0)
+        sample = x0
+
+        # timesteps 0, stride, 2 * stride, ...
+        stride = self.num_timesteps // ddim_timesteps
+        for step in torch.arange(0, self.num_timesteps, stride):
+            t = torch.full(
+                (batch, ), step, dtype=torch.long, device=sample.device)
+            sample, _ = self.ddim_reverse_sample(sample, t, model,
+                                                 model_kwargs, clamp,
+                                                 percentile, guide_scale,
+                                                 ddim_timesteps)
+        return sample
+
+    @torch.no_grad()
+    def plms_sample(self,
+                    xt,
+                    t,
+                    model,
+                    model_kwargs={},
+                    clamp=None,
+                    percentile=None,
+                    condition_fn=None,
+                    guide_scale=None,
+                    plms_timesteps=20):
+        r"""Sample from p(x_{t-1} | x_t) using PLMS.
+            - condition_fn: for classifier-based guidance (guided-diffusion).
+            - guide_scale: for classifier-free guidance (glide/dalle-2).
+        """
+        stride = self.num_timesteps // plms_timesteps
+
+        # function for compute eps
+        def compute_eps(xt, t):
+            # predict distribution of p(x_{t-1} | x_t)
+            _, _, _, x0 = self.p_mean_variance(xt, t, model, model_kwargs,
+                                               clamp, percentile, guide_scale)
+
+            # condition
+            if condition_fn is not None:
+                # x0 -> eps
+                alpha = _i(self.alphas_cumprod, t, xt)
+                u1 = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0)
+                u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
+                eps = u1 / u2
+                eps = eps - (1 - alpha).sqrt() * condition_fn(
+                    xt, self._scale_timesteps(t), **model_kwargs)
+
+                # eps -> x0
+                u1 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt
+                u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt) * eps
+                x0 = u1 - u2
+
+            # derive eps
+            u1 = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0)
+            u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
+            eps = u1 / u2
+            return eps
+
+        # function for compute x_0 and x_{t-1}
+        def compute_x0(eps, t):
+            # eps -> x0
+            u1 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt
+            u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt) * eps
+            x0 = u1 - u2
+
+            # deterministic sample
+            alphas_prev = _i(self.alphas_cumprod, (t - stride).clamp(0), xt)
+            direction = torch.sqrt(1 - alphas_prev) * eps
+            # mask = t.ne(0).float().view(-1, *((1, ) * (xt.ndim - 1)))
+            xt_1 = torch.sqrt(alphas_prev) * x0 + direction
+            return xt_1, x0
+
+        # PLMS sample
+        eps = compute_eps(xt, t)
+        if len(eps_cache) == 0:
+            # 2nd order pseudo improved Euler
+            xt_1, x0 = compute_x0(eps, t)
+            eps_next = compute_eps(xt_1, (t - stride).clamp(0))
+            eps_prime = (eps + eps_next) / 2.0
+        elif len(eps_cache) == 1:
+            # 2nd order pseudo linear multistep (Adams-Bashforth)
+            eps_prime = (3 * eps - eps_cache[-1]) / 2.0
+        elif len(eps_cache) == 2:
+            # 3rd order pseudo linear multistep (Adams-Bashforth)
+            eps_prime = (23 * eps - 16 * eps_cache[-1]
+                         + 5 * eps_cache[-2]) / 12.0
+        elif len(eps_cache) >= 3:
+            # 4th order pseudo linear multistep (Adams-Bashforth)
+            eps_prime = (55 * eps - 59 * eps_cache[-1] + 37 * eps_cache[-2]
+                         - 9 * eps_cache[-3]) / 24.0
+        xt_1, x0 = compute_x0(eps_prime, t)
+        return xt_1, x0, eps
+
+    @torch.no_grad()
+    def plms_sample_loop(self,
+                         noise,
+                         model,
+                         model_kwargs={},
+                         clamp=None,
+                         percentile=None,
+                         condition_fn=None,
+                         guide_scale=None,
+                         plms_timesteps=20):
+        r"""Run the whole PLMS sampling loop, starting from ``noise``.
+
+        Timesteps spaced ``self.num_timesteps // plms_timesteps`` apart are
+        visited from high to low. Each step calls ``self.plms_sample`` and is
+        given the eps estimates of at most the three preceding steps.
+
+        Args:
+            noise: initial sample; its first dimension is the batch size.
+            model, model_kwargs, clamp, percentile, condition_fn, guide_scale:
+                forwarded unchanged to ``self.plms_sample``.
+            plms_timesteps: divisor used to derive the stride between steps.
+
+        Returns:
+            The sample returned by the last PLMS step.
+        """
+        batch_size = noise.size(0)
+        sample = noise
+
+        # strided schedule, shifted by one, kept in range and reversed
+        stride = self.num_timesteps // plms_timesteps
+        schedule = 1 + torch.arange(0, self.num_timesteps, stride)
+        schedule = schedule.clamp(0, self.num_timesteps - 1).flip(0)
+
+        history = []
+        for step in schedule:
+            t = torch.full((batch_size, ),
+                           step,
+                           dtype=torch.long,
+                           device=sample.device)
+            sample, _, eps = self.plms_sample(sample, t, model, model_kwargs,
+                                              clamp, percentile, condition_fn,
+                                              guide_scale, plms_timesteps,
+                                              history)
+
+            # remember only the three most recent eps estimates
+            history.append(eps)
+            if len(history) > 3:
+                del history[0]
+        return sample
+
+    def loss(self, x0, t, model, model_kwargs={}, noise=None, input_x0=None):
+        r"""Compute the per-sample training loss at timesteps ``t``.
+
+        ``input_x0`` (``x0`` when omitted) is diffused with ``noise`` through
+        ``self.q_sample`` to obtain ``xt``; ``x0`` itself is used as the
+        regression / VLB reference.
+
+        Args:
+            x0: clean reference samples.
+            t: timestep indices handed to ``q_sample`` and to the model.
+            model: callable invoked as ``model(xt, scaled_t, **model_kwargs)``.
+            model_kwargs: extra keyword arguments forwarded to ``model``.
+            noise: optional noise; ``torch.randn_like(x0)`` is used if None.
+            input_x0: optional sample to diffuse instead of ``x0``.
+
+        Returns:
+            The loss averaged over every dimension except the first.
+
+        Raises:
+            ValueError: if ``self.loss_type`` is not a supported loss type.
+            KeyError: if ``self.mean_type`` is not a supported mean type.
+        """
+        noise = torch.randn_like(x0) if noise is None else noise
+        input_x0 = x0 if input_x0 is None else input_x0
+        xt = self.q_sample(input_x0, t, noise=noise)
+
+        # compute loss
+        if self.loss_type in ['kl', 'rescaled_kl']:
+            loss, _ = self.variational_lower_bound(x0, xt, t, model,
+                                                   model_kwargs)
+            if self.loss_type == 'rescaled_kl':
+                loss = loss * self.num_timesteps
+        elif self.loss_type in ['mse', 'rescaled_mse', 'l1', 'rescaled_l1']:
+            out = model(xt, self._scale_timesteps(t), **model_kwargs)
+
+            # VLB for variation
+            loss_vlb = 0.0
+            if self.var_type in ['learned', 'learned_range']:
+                out, var = out.chunk(2, dim=1)
+                frozen = torch.cat([
+                    out.detach(), var
+                ], dim=1)  # learn var without affecting the prediction of mean
+                loss_vlb, _ = self.variational_lower_bound(
+                    x0, xt, t, model=lambda *args, **kwargs: frozen)
+                if self.loss_type.startswith('rescaled_'):
+                    loss_vlb = loss_vlb * self.num_timesteps / 1000.0
+
+            # MSE/L1 for x0/eps; only the selected target is evaluated, so
+            # the posterior mean is not computed for 'eps' / 'x0' models
+            if self.mean_type == 'eps':
+                target = noise
+            elif self.mean_type == 'x0':
+                target = x0
+            elif self.mean_type == 'x_{t-1}':
+                target = self.q_posterior_mean_variance(x0, xt, t)[0]
+            else:
+                raise KeyError(self.mean_type)
+            loss = (out - target).pow(1 if self.loss_type.endswith('l1') else 2
+                                      ).abs().flatten(1).mean(dim=1)
+
+            # total loss
+            loss = loss + loss_vlb
+        else:
+            # fail with a clear message instead of an unbound-variable error
+            raise ValueError(f'Unsupported loss_type: {self.loss_type}')
+        return loss
+
+    def variational_lower_bound(self,
+                                x0,
+                                xt,
+                                t,
+                                model,
+                                model_kwargs={},
+                                clamp=None,
+                                percentile=None):
+        r"""Compute one term of the variational lower bound in bits per dim.
+
+        The term is the KL divergence between the distribution returned by
+        ``q_posterior_mean_variance`` and the one returned by
+        ``p_mean_variance``, except where ``t == 0``, where it is the
+        discretized Gaussian negative log-likelihood of ``x0``.
+
+        Args:
+            x0: ground-truth clean samples.
+            xt: noised samples at timesteps ``t``.
+            t: timestep indices, one per sample.
+            model, model_kwargs, clamp, percentile: forwarded to
+                ``self.p_mean_variance``.
+
+        Returns:
+            Tuple ``(vlb, pred_x0)``: the per-sample bound term and the x0
+            estimate returned by ``self.p_mean_variance``.
+        """
+        # compute groundtruth and predicted distributions
+        mu1, _, log_var1 = self.q_posterior_mean_variance(x0, xt, t)
+        # keep the model's estimate under its own name: the NLL below must
+        # score the ground-truth x0, not the prediction
+        mu2, _, log_var2, pred_x0 = self.p_mean_variance(
+            xt, t, model, model_kwargs, clamp, percentile)
+
+        # compute KL loss
+        kl = kl_divergence(mu1, log_var1, mu2, log_var2)
+        kl = kl.flatten(1).mean(dim=1) / math.log(2.0)
+
+        # compute discretized NLL loss (for p(x0 | x1) only)
+        nll = -discretized_gaussian_log_likelihood(
+            x0, mean=mu2, log_scale=0.5 * log_var2)
+        nll = nll.flatten(1).mean(dim=1) / math.log(2.0)
+
+        # NLL for p(x0 | x1) and KL otherwise
+        vlb = torch.where(t == 0, nll, kl)
+        return vlb, pred_x0
+
+    @torch.no_grad()
+    def variational_lower_bound_loop(self,
+                                     x0,
+                                     model,
+                                     model_kwargs={},
+                                     clamp=None,
+                                     percentile=None):
+        r"""Compute the entire variational lower bound, measured in bits-per-dim.
+
+        Every timestep is visited from the last one down to 0. At each step
+        ``x0`` is noised with fresh Gaussian noise and the VLB term, the
+        error of the predicted x0 and the error of the implied eps are
+        recorded.
+
+        Args:
+            x0: clean samples; the first dimension is the batch size.
+            model, model_kwargs, clamp, percentile: forwarded to
+                ``self.variational_lower_bound``.
+
+        Returns:
+            dict with ``'vlb'``, ``'mse'`` and ``'x0_mse'`` (per-step values
+            stacked along dim 1 in visiting order), ``'prior_bits_per_dim'``
+            and ``'total_bits_per_dim'``.
+        """
+        # prepare input and output
+        b = x0.size(0)
+        metrics = {'vlb': [], 'mse': [], 'x0_mse': []}
+
+        # loop
+        for step in torch.arange(self.num_timesteps).flip(0):
+            # compute VLB
+            t = torch.full((b, ), step, dtype=torch.long, device=x0.device)
+            noise = torch.randn_like(x0)
+            xt = self.q_sample(x0, t, noise)
+            vlb, pred_x0 = self.variational_lower_bound(
+                x0, xt, t, model, model_kwargs, clamp, percentile)
+
+            # derive eps from the *predicted* x0; using the ground truth
+            # would simply recover the injected noise
+            u1 = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - pred_x0)
+            u2 = _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
+            eps = u1 / u2
+
+            # collect metrics
+            metrics['vlb'].append(vlb)
+            metrics['x0_mse'].append(
+                (pred_x0 - x0).square().flatten(1).mean(dim=1))
+            metrics['mse'].append(
+                (eps - noise).square().flatten(1).mean(dim=1))
+        metrics = {k: torch.stack(v, dim=1) for k, v in metrics.items()}
+
+        # compute the prior KL term for VLB, measured in bits-per-dim; it is
+        # evaluated at the final timestep (the loop above ends with t == 0)
+        t = torch.full((b, ),
+                       self.num_timesteps - 1,
+                       dtype=torch.long,
+                       device=x0.device)
+        mu, _, log_var = self.q_mean_variance(x0, t)
+        kl_prior = kl_divergence(mu, log_var, torch.zeros_like(mu),
+                                 torch.zeros_like(log_var))
+        kl_prior = kl_prior.flatten(1).mean(dim=1) / math.log(2.0)
+
+        # update metrics
+        metrics['prior_bits_per_dim'] = kl_prior
+        metrics['total_bits_per_dim'] = metrics['vlb'].sum(dim=1) + kl_prior
+        return metrics
+
+    def _scale_timesteps(self, t):
+        r"""Return ``t`` mapped onto a 1000-step scale when
+        ``self.rescale_timesteps`` is set, otherwise ``t`` unchanged.
+        """
+        if not self.rescale_timesteps:
+            return t
+        return t.float() * 1000.0 / self.num_timesteps
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/model.py b/modelscope/models/multi_modal/multi_stage_diffusion/model.py
new file mode 100644
index 00000000..59bd837d
--- /dev/null
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/model.py
@@ -0,0 +1,265 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+import math
+import os.path as osp
+from typing import Any, Dict
+
+import json
+import numpy as np
+import torch
+import torch.cuda.amp as amp
+import torch.nn as nn
+import torch.nn.functional as F
+from PIL import Image
+
+from modelscope.metainfo import Models
+from modelscope.models import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.multi_modal.multi_stage_diffusion.clip import CLIP
+from modelscope.models.multi_modal.multi_stage_diffusion.decoder import Decoder
+from modelscope.models.multi_modal.multi_stage_diffusion.gaussian_diffusion import (
+    GaussianDiffusion, beta_schedule)
+from modelscope.models.multi_modal.multi_stage_diffusion.prior import Prior
+from modelscope.models.multi_modal.multi_stage_diffusion.tokenizer import (
+    CLIPTokenizer, XGLMTokenizer)
+from modelscope.models.multi_modal.multi_stage_diffusion.upsampler import (
+    Upsampler256, Upsampler1024)
+from modelscope.models.multi_modal.multi_stage_diffusion.xglm import XGLM
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['MultiStageDiffusionForTextToImageSynthesis']
+
+
+def make_diffusion(schedule,
+                   num_timesteps=1000,
+                   init_beta=None,
+                   last_beta=None,
+                   mean_type='eps',
+                   var_type='fixed_small'):
+    r"""Create a ``GaussianDiffusion`` from a named beta schedule.
+
+    The first four arguments are passed to ``beta_schedule``; ``mean_type``
+    and ``var_type`` are passed to the ``GaussianDiffusion`` constructor.
+    """
+    return GaussianDiffusion(
+        beta_schedule(schedule, num_timesteps, init_beta, last_beta),
+        mean_type=mean_type,
+        var_type=var_type)
+
+
+class UnCLIP(nn.Module):
+    r"""Cascaded text-to-image generator.
+
+    Bundles two text encoders (CLIP and XGLM), a prior, a 64x64 decoder and
+    two upsamplers (to 256 and to 1024 pixels), each diffusion stage paired
+    with its own diffusion process built from the configuration file found
+    in ``model_dir``.
+    """
+
+    def __init__(self, model_dir):
+        r"""Build all sub-modules, diffusion processes and tokenizers.
+
+        Args:
+            model_dir: directory holding the configuration file, the CLIP
+                BPE vocabulary and the XGLM tokenizer files.
+        """
+        super(UnCLIP, self).__init__()
+        self.model_dir = model_dir
+        # use a context manager so the configuration file is closed
+        with open(f'{model_dir}/{ModelFile.CONFIGURATION}') as f:
+            self.config = json.load(f)
+
+        # modules
+        self.clip = CLIP(**self.config['clip']).fp16()
+        self.xglm = XGLM(**self.config['xglm'])
+        self.prior = Prior(**self.config['prior'])
+        self.decoder = Decoder(**self.config['decoder'])
+        self.upsampler256 = Upsampler256(**self.config['upsampler256'])
+        self.upsampler1024 = Upsampler1024(**self.config['upsampler1024'])
+
+        # diffusions
+        self.prior_diffusion = make_diffusion(**self.config['prior_diffusion'])
+        self.decoder_diffusion = make_diffusion(
+            **self.config['decoder_diffusion'])
+        self.upsampler256_diffusion = make_diffusion(
+            **self.config['upsampler256_diffusion'])
+        self.upsampler1024_diffusion = make_diffusion(
+            **self.config['upsampler1024_diffusion'])
+
+        # tokenizers
+        self.clip_tokenizer = CLIPTokenizer(
+            bpe_path=f'{model_dir}/bpe_simple_vocab_16e6.txt.gz')
+        self.xglm_tokenizer = XGLMTokenizer(model_dir=model_dir)
+
+    def forward(self, *args, **kwargs):
+        r"""Not supported; call ``synthesis`` instead."""
+        raise NotImplementedError(
+            '"forward" is not implemented. Use "synthesis" instead.')
+
+    def _embed_text(self, texts, tokenizer, device):
+        r"""Encode ``texts`` with the selected text encoder and L2-normalize
+        the result along dim 1.
+
+        Raises:
+            ValueError: if ``tokenizer`` is neither ``'clip'`` nor ``'xglm'``.
+        """
+        if tokenizer == 'clip':
+            feats = self.clip.textual(self.clip_tokenizer(texts).to(device))
+        elif tokenizer == 'xglm':
+            # the XGLM tokenizer returns (input_ids, attention_mask); move
+            # both tensors to the target device before encoding
+            feats = self.xglm(
+                *(u.to(device) for u in self.xglm_tokenizer(texts)))
+        else:
+            raise ValueError(
+                f'Expected tokenizer to be one of "clip" or "xglm", but got {tokenizer}'
+            )
+        return F.normalize(feats, p=2, dim=1)
+
+    @torch.no_grad()
+    def synthesis(self,
+                  text='A photo of a confused grizzly bear in calculus class.',
+                  tokenizer='clip',
+                  batch_size=4,
+                  timesteps_prior=100,
+                  timesteps_64=50,
+                  timesteps_256=20,
+                  timesteps_1024=20,
+                  guide_prior=3.0,
+                  guide_64=7.0,
+                  guide_256=3.0,
+                  guide_1024=3.0,
+                  eta_prior=0.0,
+                  eta_64=0.0,
+                  eta_256=0.0,
+                  eta_1024=0.0):
+        r"""Generate ``batch_size`` images for ``text`` with DDIM sampling.
+
+        Runs four stages in sequence: the prior, the 64x64 decoder, and the
+        two upsamplers, each fed a 4x bilinear upscaling of the previous
+        stage's images. Every stage is sampled with guidance between the
+        text condition and an empty/zero condition.
+
+        Args:
+            text: prompt to synthesize.
+            tokenizer: ``'clip'`` or ``'xglm'``; selects the text encoder.
+            batch_size: number of images, within [1, 16].
+            timesteps_*: DDIM steps per stage, each within (0, 1000].
+            guide_*: guidance scale per stage, each within (1, 15).
+            eta_*: DDIM eta per stage, each within [0, 1].
+
+        Returns:
+            A list of ``batch_size`` PIL images.
+
+        Raises:
+            AssertionError: if a numeric argument is out of range.
+            ValueError: if ``tokenizer`` is not supported.
+        """
+        device = next(self.parameters()).device
+
+        # check params
+        assert all([
+            t > 0 and t <= 1000 for t in
+            [timesteps_prior, timesteps_64, timesteps_256, timesteps_1024]
+        ])
+        assert all([
+            g > 1 and g < 15
+            for g in [guide_prior, guide_64, guide_256, guide_1024]
+        ])
+        assert all([
+            e >= 0 and e <= 1.0
+            for e in [eta_prior, eta_64, eta_256, eta_1024]
+        ])
+        assert batch_size >= 1 and batch_size <= 16
+
+        # tokenize and encode the prompt and the empty prompt
+        y = self._embed_text([text], tokenizer, device)
+        zero_y = self._embed_text([''], tokenizer, device)
+        y = math.sqrt(y.size(1)) * y.repeat(batch_size, 1)
+        zero_y = math.sqrt(zero_y.size(1)) * zero_y.repeat(batch_size, 1)
+
+        # synthesis
+        with amp.autocast(enabled=True):
+            # prior
+            x0 = self.prior_diffusion.ddim_sample_loop(
+                noise=torch.randn_like(y),
+                model=self.prior,
+                model_kwargs=[{
+                    'y': y
+                }, {
+                    'y': zero_y
+                }],
+                guide_scale=guide_prior,
+                ddim_timesteps=timesteps_prior,
+                eta=eta_prior)
+
+            # decoder
+            imgs64 = self.decoder_diffusion.ddim_sample_loop(
+                noise=torch.randn(batch_size, 3, 64, 64).to(device),
+                model=self.decoder,
+                model_kwargs=[{
+                    'y': x0
+                }, {
+                    'y': torch.zeros_like(x0)
+                }],
+                guide_scale=guide_64,
+                percentile=0.995,
+                ddim_timesteps=timesteps_64,
+                eta=eta_64).clamp_(-1, 1)
+
+            # upsampler256
+            imgs256 = F.interpolate(
+                imgs64, scale_factor=4.0, mode='bilinear', align_corners=False)
+            imgs256 = self.upsampler256_diffusion.ddim_sample_loop(
+                noise=torch.randn_like(imgs256),
+                model=self.upsampler256,
+                model_kwargs=[{
+                    'y': y,
+                    'concat': imgs256
+                }, {
+                    'y': zero_y,
+                    'concat': imgs256
+                }],
+                guide_scale=guide_256,
+                percentile=0.995,
+                ddim_timesteps=timesteps_256,
+                eta=eta_256).clamp_(-1, 1)
+
+            # upsampler1024
+            imgs1024 = F.interpolate(
+                imgs256,
+                scale_factor=4.0,
+                mode='bilinear',
+                align_corners=False)
+            imgs1024 = self.upsampler1024_diffusion.ddim_sample_loop(
+                noise=torch.randn_like(imgs1024),
+                model=self.upsampler1024,
+                model_kwargs=[{
+                    'y': y,
+                    'concat': imgs1024
+                }, {
+                    'y': zero_y,
+                    'concat': imgs1024
+                }],
+                guide_scale=guide_1024,
+                percentile=0.995,
+                ddim_timesteps=timesteps_1024,
+                eta=eta_1024).clamp_(-1, 1)
+
+        # output: map [-1, 1] to [0, 255] and reorder to [B, H, W, C]
+        imgs1024 = imgs1024.add_(1).mul_(255 / 2.0).permute(0, 2, 3, 1).cpu()
+        imgs1024 = [
+            Image.fromarray(np.array(u, dtype=np.uint8)) for u in imgs1024
+        ]
+        return imgs1024
+
+
+@MODELS.register_module(
+    Tasks.text_to_image_synthesis, module_name=Models.multi_stage_diffusion)
+class MultiStageDiffusionForTextToImageSynthesis(TorchModel):
+    r"""Registered text-to-image model that wraps an ``UnCLIP`` pipeline."""
+
+    def __init__(self, model_dir, device_id=-1):
+        r"""Load the pipeline weights from ``model_dir``.
+
+        Args:
+            model_dir: directory with the configuration and checkpoint.
+            device_id: CUDA device index; a negative value keeps the model
+                on CPU.
+        """
+        super().__init__(model_dir=model_dir, device_id=device_id)
+        unclip = UnCLIP(model_dir=model_dir)
+        checkpoint = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
+        unclip.load_state_dict(torch.load(checkpoint, 'cpu'))
+        unclip.eval()
+
+        self.device_id = device_id
+        if self.device_id < 0:
+            self.device = torch.device('cpu')
+            logger.info('Use CPU for inference')
+        else:
+            self.device = torch.device(f'cuda:{self.device_id}')
+            unclip.to('cuda:{}'.format(self.device_id))
+            logger.info('Use GPU: {}'.format(self.device_id))
+        self.model = unclip
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        r"""Synthesize images for ``input['text']``.
+
+        Optional keys of ``input`` override the sampling defaults listed
+        below. The result is a list of arrays, one per image, with the last
+        axis of each image reversed.
+
+        Raises:
+            ValueError: if ``input`` is not a dict or lacks ``'text'``.
+        """
+        if not isinstance(input, dict):
+            raise ValueError(
+                f'Expected the input to be a dictionary, but got {type(input)}'
+            )
+        if 'text' not in input:
+            raise ValueError('input should contain "text", but not found')
+
+        # ddim sampling options and their defaults
+        defaults = {
+            'tokenizer': 'clip',
+            'batch_size': 4,
+            'timesteps_prior': 100,
+            'timesteps_64': 50,
+            'timesteps_256': 20,
+            'timesteps_1024': 20,
+            'guide_prior': 3.0,
+            'guide_64': 7.0,
+            'guide_256': 3.0,
+            'guide_1024': 3.0,
+            'eta_prior': 0.0,
+            'eta_64': 0.0,
+            'eta_256': 0.0,
+            'eta_1024': 0.0
+        }
+        options = {
+            key: input.get(key, value)
+            for key, value in defaults.items()
+        }
+        pictures = self.model.synthesis(text=input.get('text'), **options)
+        return [np.array(u)[..., ::-1] for u in pictures]
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/prior.py b/modelscope/models/multi_modal/multi_stage_diffusion/prior.py
new file mode 100644
index 00000000..9f4ef2d5
--- /dev/null
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/prior.py
@@ -0,0 +1,170 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+__all__ = ['Prior']
+
+
+def sinusoidal_embedding(timesteps, dim):
+    r"""Build a ``[len(timesteps), dim]`` sinusoidal embedding.
+
+    The first ``dim // 2`` columns hold cosines and the next ``dim // 2``
+    hold sines; an odd ``dim`` is padded with one trailing zero column.
+    """
+    half = dim // 2
+    timesteps = timesteps.float()
+
+    # frequencies 10000^(-k / half) for k = 0 .. half - 1
+    exponents = -torch.arange(half).to(timesteps).div(half)
+    angles = torch.outer(timesteps, torch.pow(10000, exponents))
+    embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=1)
+    if dim % 2 == 0:
+        return embedding
+    padding = torch.zeros_like(embedding[:, :1])
+    return torch.cat([embedding, padding], dim=1)
+
+
+class SelfAttention(nn.Module):
+    r"""Multi-head self-attention with an optional attention mask."""
+
+    def __init__(self, dim, num_heads):
+        assert dim % num_heads == 0
+        super(SelfAttention, self).__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        # applied to both queries and keys, i.e. head_dim ** -0.5 in total
+        self.scale = math.pow(self.head_dim, -0.25)
+
+        # layers
+        self.to_qkv = nn.Linear(dim, dim * 3)
+        self.proj = nn.Linear(dim, dim)
+
+    def forward(self, x, mask):
+        r"""Attend over dim 1 of ``x``.
+
+        Positions where ``mask`` (cropped to the sequence length on its last
+        two dims) equals 0 are excluded; ``mask`` may be None.
+        """
+        batch, length = x.shape[:2]
+        heads, head_dim = self.num_heads, self.head_dim
+
+        # project once, then split the head axis into query / key / value
+        qkv = self.to_qkv(x).view(batch, length, heads * 3, head_dim)
+        query, key, value = qkv.chunk(3, dim=2)
+
+        # attention weights (softmax computed in float32)
+        scores = torch.einsum('binc,bjnc->bnij', query * self.scale,
+                              key * self.scale)
+        if mask is not None:
+            blocked = mask[:, :, :length, :length] == 0
+            scores = scores.masked_fill(blocked, float('-inf'))
+        weights = F.softmax(scores.float(), dim=-1).type(scores.dtype)
+
+        # weighted sum of values, heads merged back, then output projection
+        context = torch.einsum('bnij,bjnc->binc', weights, value)
+        return self.proj(context.reshape(batch, length, -1))
+
+
+class AttentionBlock(nn.Module):
+    r"""Pre-norm transformer block: self-attention followed by a GELU MLP,
+    each added back to its input.
+    """
+
+    def __init__(self, dim, num_heads):
+        super(AttentionBlock, self).__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+
+        # layers
+        self.norm1 = nn.LayerNorm(dim)
+        self.attn = SelfAttention(dim, num_heads)
+        self.norm2 = nn.LayerNorm(dim)
+        hidden_dim = dim * 4
+        self.ffn = nn.Sequential(
+            nn.Linear(dim, hidden_dim), nn.GELU(), nn.Linear(hidden_dim, dim))
+
+    def forward(self, x, mask=None):
+        r"""Apply the attention and feed-forward residual branches to ``x``."""
+        attended = x + self.attn(self.norm1(x), mask)
+        return attended + self.ffn(self.norm2(attended))
+
+
+class Prior(nn.Module):
+    r"""Causal transformer over a 4-token sequence (text, time, input
+    vector, EOS) whose last position is projected to ``clip_dim``.
+    """
+
+    def __init__(self, dim=2048, clip_dim=768, num_heads=32, num_layers=24):
+        super(Prior, self).__init__()
+        self.dim, self.clip_dim = dim, clip_dim
+        self.num_heads, self.num_layers = num_heads, num_layers
+
+        def mlp(in_dim):
+            # two-layer SiLU projection into the transformer width
+            return nn.Sequential(
+                nn.Linear(in_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
+
+        # embeddings (created in the order text, time, vision)
+        self.text_embedding = mlp(clip_dim)
+        self.time_embedding = mlp(dim)
+        self.vision_embedding = mlp(clip_dim)
+        self.eos_embedding = nn.Parameter(torch.zeros(1, 1, dim))
+        self.pos_embedding = nn.Parameter(torch.zeros(1, 4, dim))
+
+        # transformer
+        self.blocks = nn.ModuleList(
+            [AttentionBlock(dim, num_heads) for _ in range(num_layers)])
+        self.norm = nn.LayerNorm(dim)
+
+        # head
+        self.head = nn.Linear(dim, clip_dim)
+
+        # causal attention mask
+        self.register_buffer('attn_mask', torch.tril(torch.ones(1, 1, 4, 4)))
+
+        # initialize weights
+        self.init_weights()
+
+    def forward(self, x, t, y):
+        r"""x:      [B, C].
+            t:      [B].
+            y:      [B, C].
+
+        Returns the head output taken at the last (EOS) position.
+        """
+        batch = x.size(0)
+
+        # token sequence of shape [B, 4, dim]
+        time_feat = sinusoidal_embedding(t, self.dim)
+        tokens = torch.cat([
+            self.text_embedding(y).unsqueeze(1),
+            self.time_embedding(time_feat).unsqueeze(1),
+            self.vision_embedding(x).unsqueeze(1),
+            self.eos_embedding.repeat(batch, 1, 1)
+        ],
+                           dim=1)
+        hidden = self.pos_embedding + tokens
+
+        # transformer
+        for block in self.blocks:
+            hidden = block(hidden, self.attn_mask)
+
+        # head
+        return self.head(self.norm(hidden)[:, -1])
+
+    def init_weights(self):
+        r"""Re-initialize Linear, Embedding and LayerNorm sub-modules; the
+        ``attn.proj`` and ``ffn.2`` output layers use a smaller std.
+        """
+        out_std = 0.02 / math.sqrt(2.0 * self.num_layers)
+        for name, module in self.named_modules():
+            if name.endswith(('attn.proj', 'ffn.2')):
+                # smaller std for output layers
+                nn.init.normal_(module.weight, std=out_std)
+                nn.init.zeros_(module.bias)
+                continue
+            if isinstance(module, (nn.Linear, nn.Embedding)):
+                nn.init.normal_(module.weight, std=0.02)
+                if isinstance(module, nn.Linear) and module.bias is not None:
+                    nn.init.zeros_(module.bias)
+            elif isinstance(module, nn.LayerNorm):
+                nn.init.ones_(module.weight)
+                nn.init.zeros_(module.bias)
+
+    def param_groups(self):
+        r"""Split parameters into a zero-weight-decay group (names containing
+        ``'norm'`` or ending in ``'bias'``) and a group with everything else.
+        """
+        no_decay, decay = [], []
+        for name, param in self.named_parameters():
+            if 'norm' in name or name.endswith('bias'):
+                no_decay.append(param)
+            else:
+                decay.append(param)
+        return [{'params': no_decay, 'weight_decay': 0.0}, {'params': decay}]
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py b/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py
new file mode 100644
index 00000000..59d6b304
--- /dev/null
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py
@@ -0,0 +1,200 @@
+# Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+import gzip
+import html
+from functools import lru_cache
+
+import ftfy
+import regex as re
+import torch
+from transformers import AutoTokenizer
+
+__all__ = ['CLIPTokenizer', 'XGLMTokenizer']
+
+
+@lru_cache()
+def bytes_to_unicode():
+    """
+    Return a dict mapping each of the 256 byte values to a unicode character.
+
+    Bytes in the ranges '!'..'~', '¡'..'¬' and '®'..'ÿ' map to the character
+    with the same code point. Every other byte is mapped, in increasing
+    order, to the code points 256, 257, ... so that no byte is represented by
+    a whitespace or control character, which the bpe code cannot handle.
+    The mapping is reversible, so bpe can operate on unicode strings.
+    """
+    direct = [
+        *range(ord('!'),
+               ord('~') + 1), *range(ord('¡'),
+                                     ord('¬') + 1),
+        *range(ord('®'),
+               ord('ÿ') + 1)
+    ]
+    direct_set = set(direct)
+    byte_values = list(direct)
+    code_points = list(direct)
+    shifted = 0
+    for byte in range(2**8):
+        if byte in direct_set:
+            continue
+        byte_values.append(byte)
+        code_points.append(2**8 + shifted)
+        shifted += 1
+    return dict(zip(byte_values, map(chr, code_points)))
+
+
+def get_pairs(word):
+    """Return the set of adjacent symbol pairs in a word.
+    The word is a non-empty sequence of symbols (variable-length strings).
+    """
+    leading = word[0]  # an empty word raises IndexError here
+    followers = word[1:]
+    return set(zip((leading, *followers), followers))
+
+
+def basic_clean(text):
+    """Run ``ftfy.fix_text``, unescape HTML entities twice and strip."""
+    fixed = ftfy.fix_text(text)
+    unescaped = html.unescape(html.unescape(fixed))
+    return unescaped.strip()
+
+
+def whitespace_clean(text):
+    """Collapse each whitespace run to one space and strip both ends."""
+    return re.sub(r'\s+', ' ', text).strip()
+
+
+class SimpleTokenizer(object):
+    r"""Byte-level BPE tokenizer driven by a gzip-compressed merges file."""
+
+    def __init__(self, bpe_path):
+        r"""Load the merge table from ``bpe_path`` and build the vocabulary.
+
+        The vocabulary consists of the byte symbols, the same symbols with a
+        ``'</w>'`` suffix, one entry per merge, and the two special tokens.
+        """
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        # read through a context manager so the gzip handle is closed
+        with gzip.open(bpe_path) as f:
+            merges = f.read().decode('utf-8').split('\n')
+        merges = merges[1:49152 - 256 - 2 + 1]
+        merges = [tuple(merge.split()) for merge in merges]
+        vocab = list(bytes_to_unicode().values())
+        vocab = vocab + [v + '</w>' for v in vocab]
+        for merge in merges:
+            vocab.append(''.join(merge))
+        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
+        self.encoder = dict(zip(vocab, range(len(vocab))))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {
+            '<|startoftext|>': '<|startoftext|>',
+            '<|endoftext|>': '<|endoftext|>'
+        }
+        self.pat = re.compile(
+            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
+            re.IGNORECASE)
+
+    def bpe(self, token):
+        r"""Apply the learned merges to ``token``.
+
+        Returns the resulting symbols joined by single spaces; the last
+        symbol carries the ``'</w>'`` end-of-word marker. Results are cached
+        per token.
+        """
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token[:-1]) + (token[-1] + '</w>', )
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token + '</w>'
+
+        while True:
+            # pick the adjacent pair with the lowest merge rank
+            bigram = min(
+                pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except ValueError:
+                    # `first` does not occur again: copy the remainder
+                    new_word.extend(word[i:])
+                    break
+
+                if word[i] == first and i < len(word) - 1 and word[
+                        i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        self.cache[token] = word
+        return word
+
+    def encode(self, text):
+        r"""Clean and lowercase ``text`` and return its list of BPE ids."""
+        bpe_tokens = []
+        text = whitespace_clean(basic_clean(text)).lower()
+        for token in re.findall(self.pat, text):
+            # re-express the token's utf-8 bytes with the byte alphabet
+            token = ''.join(self.byte_encoder[b]
+                            for b in token.encode('utf-8'))
+            bpe_tokens.extend(self.encoder[bpe_token]
+                              for bpe_token in self.bpe(token).split(' '))
+        return bpe_tokens
+
+    def decode(self, tokens):
+        r"""Turn BPE ids back into text; ``'</w>'`` markers become spaces."""
+        text = ''.join([self.decoder[token] for token in tokens])
+        text = bytearray([self.byte_decoder[c] for c in text]).decode(
+            'utf-8', errors='replace').replace('</w>', ' ')
+        return text
+
+
+class CLIPTokenizer(object):
+    r"""CLIP tokenizer, adapted from https://github.com/openai/CLIP.
+
+    Produces fixed-length id sequences wrapped in start/end tokens and
+    padded with zeros.
+    """
+
+    def __init__(self, bpe_path, length=77):
+        self.bpe_path = bpe_path
+        self.length = length
+
+        # init tokenizer
+        self.tokenizer = SimpleTokenizer(bpe_path=bpe_path)
+        vocab = self.tokenizer.encoder
+        self.sos_token = vocab['<|startoftext|>']
+        self.eos_token = vocab['<|endoftext|>']
+        self.vocab_size = len(vocab)
+
+    def __call__(self, sequence):
+        r"""Tokenize a string or a list of strings into a ``LongTensor``.
+
+        Raises:
+            TypeError: if ``sequence`` is neither a ``str`` nor a ``list``.
+        """
+        if isinstance(sequence, str):
+            ids = self._tokenizer(sequence)
+        elif isinstance(sequence, list):
+            ids = [self._tokenizer(u) for u in sequence]
+        else:
+            raise TypeError(
+                f'Expected the "sequence" to be a string or a list, but got {type(sequence)}'
+            )
+        return torch.LongTensor(ids)
+
+    def _tokenizer(self, text):
+        r"""Encode ``text``, truncate to leave room for the start/end tokens
+        and zero-pad up to ``self.length``.
+        """
+        body = self.tokenizer.encode(text)[:self.length - 2]
+        tokens = [self.sos_token, *body, self.eos_token]
+        tokens.extend([0] * (self.length - len(tokens)))
+        return tokens
+
+
+class XGLMTokenizer(object):
+    r"""A wrapper of HuggingFace's XGLM tokenizer.
+    """
+
+    def __init__(self, model_dir, length=77, **kwargs):
+        r"""Load the tokenizer from ``model_dir``; ``kwargs`` are passed to
+        ``AutoTokenizer.from_pretrained``.
+        """
+        self.length = length
+        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, **kwargs)
+        self.vocab_size = self.tokenizer.vocab_size
+
+    def __call__(self, sequence, **kwargs):
+        r"""Tokenize ``sequence`` and return ``(input_ids, attention_mask)``.
+
+        Padding/truncation to ``self.length`` and PyTorch tensors are the
+        defaults; any of them can be overridden through ``kwargs``.
+        """
+        options = dict(
+            return_tensors='pt',
+            padding='max_length',
+            truncation=True,
+            max_length=self.length)
+        options.update(kwargs)
+        encoded = self.tokenizer(sequence, **options)
+        return encoded.input_ids, encoded.attention_mask
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py b/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py
new file mode 100644
index 00000000..a292edae
--- /dev/null
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py
@@ -0,0 +1,466 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+__all__ = ['Upsampler256', 'Upsampler1024']
+
+
+def sinusoidal_embedding(timesteps, dim):
+    # 1-D ``timesteps`` -> [len(timesteps), dim] table: cosines, then sines
+    steps = timesteps.float()
+    n_freq = dim // 2
+
+    # frequencies 10000^(-k / n_freq), k = 0..n_freq-1, on the device of steps
+    exponents = -torch.arange(n_freq).to(steps).div(n_freq)
+    angles = torch.outer(steps, torch.pow(10000, exponents))
+    emb = torch.cat([torch.cos(angles), torch.sin(angles)], dim=1)
+
+    if dim % 2 != 0:  # odd ``dim``: append a single zero column
+        emb = torch.cat([emb, torch.zeros_like(emb[:, :1])], dim=1)
+    return emb
+
+
+class Resample(nn.Module):
+    r"""Rescale a 2-D feature map by 2x (nearest), 0.5x (conv / pool) or 1x.
+    """
+
+    def __init__(self, in_dim, out_dim, scale_factor, use_conv=False):
+        assert scale_factor in [0.5, 1.0, 2.0]
+        super(Resample, self).__init__()
+        self.in_dim, self.out_dim = in_dim, out_dim
+        self.scale_factor, self.use_conv = scale_factor, use_conv
+
+        # pick the operator; ``use_conv`` adds / swaps in a learnable 3x3 conv
+        if scale_factor == 2.0:
+            refine = nn.Conv2d(
+                in_dim, out_dim, 3, padding=1) if use_conv else nn.Identity()
+            op = nn.Sequential(
+                nn.Upsample(scale_factor=scale_factor, mode='nearest'), refine)
+        elif scale_factor == 0.5 and use_conv:
+            op = nn.Conv2d(in_dim, out_dim, 3, stride=2, padding=1)
+        elif scale_factor == 0.5:
+            op = nn.AvgPool2d(kernel_size=2, stride=2)
+        else:
+            op = nn.Identity()
+        self.resample = op
+
+    def forward(self, x):
+        return self.resample(x)
+
+
+class ResidualBlock(nn.Module):
+    r"""Two-convolution residual block modulated by an embedding vector."""
+    def __init__(self,
+                 in_dim,
+                 embed_dim,
+                 out_dim,
+                 use_scale_shift_norm=True,
+                 scale_factor=1.0,
+                 dropout=0.0):
+        super(ResidualBlock, self).__init__()
+        self.in_dim = in_dim
+        self.embed_dim = embed_dim
+        self.out_dim = out_dim
+        self.use_scale_shift_norm = use_scale_shift_norm
+        self.scale_factor = scale_factor
+
+        # layers (GroupNorm with 32 groups needs channels divisible by 32)
+        self.layer1 = nn.Sequential(
+            nn.GroupNorm(32, in_dim), nn.SiLU(),
+            nn.Conv2d(in_dim, out_dim, 3, padding=1))
+        self.resample = Resample(in_dim, in_dim, scale_factor, use_conv=False)
+        self.embedding = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(embed_dim,
+                      out_dim * 2 if use_scale_shift_norm else out_dim))
+        self.layer2 = nn.Sequential(
+            nn.GroupNorm(32, out_dim), nn.SiLU(), nn.Dropout(dropout),
+            nn.Conv2d(out_dim, out_dim, 3, padding=1))
+        self.shortcut = nn.Identity() if in_dim == out_dim else nn.Conv2d(
+            in_dim, out_dim, 1)
+
+        # zero the weight of the last conv (its bias keeps the default init)
+        nn.init.zeros_(self.layer2[-1].weight)
+
+    def forward(self, x, e):
+        identity = self.resample(x)  # the skip path is resampled as well
+        x = self.layer1[-1](self.resample(self.layer1[:-1](x)))
+        e = self.embedding(e).unsqueeze(-1).unsqueeze(-1).type(x.dtype)
+        if self.use_scale_shift_norm:
+            scale, shift = e.chunk(2, dim=1)  # halves of the 2 * out_dim
+            x = self.layer2[0](x) * (1 + scale) + shift
+            x = self.layer2[1:](x)
+        else:
+            x = x + e  # additive conditioning
+            x = self.layer2(x)
+        x = x + self.shortcut(identity)
+        return x
+
+
+class AttentionBlock(nn.Module):
+    r"""Spatial self-attention with optional extra context keys/values."""
+    def __init__(self, dim, context_dim=None, num_heads=None, head_dim=None):
+        # head_dim, when given, takes precedence over num_heads
+        num_heads = dim // head_dim if head_dim else num_heads
+        head_dim = dim // num_heads
+        assert num_heads * head_dim == dim
+        super(AttentionBlock, self).__init__()
+        self.dim = dim
+        self.context_dim = context_dim
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.scale = math.pow(head_dim, -0.25)  # applied to both q and k
+
+        # layers (GroupNorm with 32 groups needs dim divisible by 32)
+        self.norm = nn.GroupNorm(32, dim)
+        self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
+        if context_dim is not None:
+            self.context_kv = nn.Linear(context_dim, dim * 2)
+        self.proj = nn.Conv2d(dim, dim, 1)
+
+        # zero the output projection weight (its bias keeps the default init)
+        nn.init.zeros_(self.proj.weight)
+
+    def forward(self, x, context=None):
+        r"""x:       [B, C, H, W].
+            context: [B, L, context_dim] or None.
+        """
+        identity = x
+        b, c, h, w, n, d = *x.size(), self.num_heads, self.head_dim
+
+        # compute query, key, value as [B, n, d, H*W]; context k/v go in front
+        x = self.norm(x)
+        q, k, v = self.to_qkv(x).view(b, n * 3, d, h * w).chunk(3, dim=1)
+        if context is not None:  # requires context_dim to have been set
+            ck, cv = self.context_kv(context).reshape(b, -1, n * 2,
+                                                      d).permute(0, 2, 3,
+                                                                 1).chunk(
+                                                                     2, dim=1)
+            k = torch.cat([ck, k], dim=-1)
+            v = torch.cat([cv, v], dim=-1)
+
+        # compute attention: [B, n, H*W, keys], softmax over the keys
+        attn = torch.matmul(q.transpose(-1, -2) * self.scale, k * self.scale)
+        attn = F.softmax(attn, dim=-1)
+
+        # weighted sum of the values, folded back to [B, C, H, W]
+        x = torch.matmul(v, attn.transpose(-1, -2))
+        x = x.reshape(b, c, h, w)
+
+        # output projection plus the residual connection
+        x = self.proj(x)
+        return x + identity
+
+
+class Upsampler256(nn.Module):
+    r"""U-Net conditioned on a timestep and a vector ``y``, with attention."""
+    def __init__(self,
+                 in_dim=6,
+                 dim=320,
+                 y_dim=768,
+                 context_dim=512,
+                 out_dim=3,
+                 dim_mult=[1, 2, 3, 4],
+                 num_heads=None,
+                 head_dim=64,
+                 num_res_blocks=3,
+                 attn_scales=[1 / 8],
+                 resblock_resample=True,
+                 use_scale_shift_norm=True,
+                 dropout=0.1):
+        embed_dim = dim * 4
+        super(Upsampler256, self).__init__()
+        self.in_dim = in_dim
+        self.dim = dim
+        self.y_dim = y_dim
+        self.context_dim = context_dim
+        self.embed_dim = embed_dim
+        self.out_dim = out_dim
+        self.dim_mult = dim_mult
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.num_res_blocks = num_res_blocks
+        self.attn_scales = attn_scales
+        self.resblock_resample = resblock_resample
+        self.use_scale_shift_norm = use_scale_shift_norm
+
+        # params: per-stage channel widths and a stack of skip-connection dims
+        enc_dims = [dim * u for u in [1] + dim_mult]
+        dec_dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
+        shortcut_dims = []
+        scale = 1.0  # current resolution relative to the input
+
+        # embeddings; ``y`` additionally yields 4 context tokens for attention
+        self.time_embedding = nn.Sequential(
+            nn.Linear(dim, embed_dim), nn.SiLU(),
+            nn.Linear(embed_dim, embed_dim))
+        self.y_embedding = nn.Sequential(
+            nn.Linear(y_dim, embed_dim), nn.SiLU(),
+            nn.Linear(embed_dim, embed_dim))
+        self.context_embedding = nn.Sequential(
+            nn.Linear(y_dim, embed_dim), nn.SiLU(),
+            nn.Linear(embed_dim, context_dim * 4))
+
+        # encoder (the loops rebind in_dim / out_dim, hence self.in_dim here)
+        self.encoder = nn.ModuleList(
+            [nn.Conv2d(self.in_dim, dim, 3, padding=1)])
+        shortcut_dims.append(dim)
+        for i, (in_dim,
+                out_dim) in enumerate(zip(enc_dims[:-1], enc_dims[1:])):
+            for j in range(num_res_blocks):
+                # residual block, plus attention when scale is in attn_scales
+                block = nn.ModuleList([
+                    ResidualBlock(in_dim, embed_dim, out_dim,
+                                  use_scale_shift_norm, 1.0, dropout)
+                ])
+                if scale in attn_scales:
+                    block.append(
+                        AttentionBlock(out_dim, context_dim, num_heads,
+                                       head_dim))
+                in_dim = out_dim
+                self.encoder.append(block)
+                shortcut_dims.append(out_dim)
+
+                # downsample at the end of every stage but the last
+                if i != len(dim_mult) - 1 and j == num_res_blocks - 1:
+                    if resblock_resample:
+                        downsample = ResidualBlock(out_dim, embed_dim, out_dim,
+                                                   use_scale_shift_norm, 0.5,
+                                                   dropout)
+                    else:
+                        downsample = Resample(
+                            out_dim, out_dim, 0.5, use_conv=True)
+                    shortcut_dims.append(out_dim)
+                    scale /= 2.0
+                    self.encoder.append(downsample)
+
+        # middle: residual, attention, residual at the lowest resolution
+        self.middle = nn.ModuleList([
+            ResidualBlock(out_dim, embed_dim, out_dim, use_scale_shift_norm,
+                          1.0, dropout),
+            AttentionBlock(out_dim, context_dim, num_heads, head_dim),
+            ResidualBlock(out_dim, embed_dim, out_dim, use_scale_shift_norm,
+                          1.0, dropout)
+        ])
+
+        # decoder: the first block of each unit also takes a popped skip width
+        self.decoder = nn.ModuleList()
+        for i, (in_dim,
+                out_dim) in enumerate(zip(dec_dims[:-1], dec_dims[1:])):
+            for j in range(num_res_blocks + 1):
+                # residual block, plus attention when scale is in attn_scales
+                block = nn.ModuleList([
+                    ResidualBlock(in_dim + shortcut_dims.pop(), embed_dim,
+                                  out_dim, use_scale_shift_norm, 1.0, dropout)
+                ])
+                if scale in attn_scales:
+                    block.append(
+                        AttentionBlock(out_dim, context_dim, num_heads,
+                                       head_dim))
+                in_dim = out_dim
+
+                # upsample at the end of every stage but the last
+                if i != len(dim_mult) - 1 and j == num_res_blocks:
+                    if resblock_resample:
+                        upsample = ResidualBlock(out_dim, embed_dim, out_dim,
+                                                 use_scale_shift_norm, 2.0,
+                                                 dropout)
+                    else:
+                        upsample = Resample(
+                            out_dim, out_dim, 2.0, use_conv=True)
+                    scale *= 2.0
+                    block.append(upsample)
+                self.decoder.append(block)
+
+        # head (self.out_dim is the constructor argument, not the loop's)
+        self.head = nn.Sequential(
+            nn.GroupNorm(32, out_dim), nn.SiLU(),
+            nn.Conv2d(out_dim, self.out_dim, 3, padding=1))
+
+        # zero the weight of the final conv (its bias keeps the default init)
+        nn.init.zeros_(self.head[-1].weight)
+
+    def forward(self, x, t, y, concat):
+        # embeddings; x and concat are first joined along the channel axis
+        x = torch.cat([x, concat], dim=1)
+        e = self.time_embedding(sinusoidal_embedding(
+            t, self.dim)) + self.y_embedding(y)
+        context = self.context_embedding(y).view(-1, 4, self.context_dim)
+
+        # encoder: every output is kept for the skip connections
+        xs = []
+        for block in self.encoder:
+            x = self._forward_single(block, x, e, context)
+            xs.append(x)
+
+        # middle
+        for block in self.middle:
+            x = self._forward_single(block, x, e, context)
+
+        # decoder: the most recent unused skip is concatenated before a block
+        for block in self.decoder:
+            x = torch.cat([x, xs.pop()], dim=1)
+            x = self._forward_single(block, x, e, context)
+
+        # head
+        x = self.head(x)
+        return x
+
+    def _forward_single(self, module, x, e, context):
+        if isinstance(module, ResidualBlock):  # takes the embedding
+            x = module(x, e)
+        elif isinstance(module, AttentionBlock):  # takes the context tokens
+            x = module(x, context)
+        elif isinstance(module, nn.ModuleList):  # run members in order
+            for block in module:
+                x = self._forward_single(block, x, e, context)
+        else:  # any other layer is called with x only
+            x = module(x)
+        return x
+
+
+class Upsampler1024(nn.Module):
+    r"""Attention-free U-Net conditioned on a timestep and a vector ``y``."""
+    def __init__(self,
+                 in_dim=6,
+                 dim=192,
+                 y_dim=768,
+                 out_dim=3,
+                 dim_mult=[1, 1, 2, 2, 4, 4],
+                 num_res_blocks=2,
+                 resblock_resample=True,
+                 use_scale_shift_norm=True,
+                 dropout=0.0):
+        embed_dim = dim * 4
+        super(Upsampler1024, self).__init__()
+        self.in_dim = in_dim
+        self.dim = dim
+        self.y_dim = y_dim
+        self.out_dim = out_dim
+        self.dim_mult = dim_mult
+        self.num_res_blocks = num_res_blocks
+        self.resblock_resample = resblock_resample
+        self.use_scale_shift_norm = use_scale_shift_norm
+
+        # params: per-stage channel widths and a stack of skip-connection dims
+        enc_dims = [dim * u for u in [1] + dim_mult]
+        dec_dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
+        shortcut_dims = []
+        scale = 1.0  # updated below but never read in this class
+
+        # embedding: the timestep and ``y`` are each mapped to embed_dim
+        self.time_embedding = nn.Sequential(
+            nn.Linear(dim, embed_dim), nn.SiLU(),
+            nn.Linear(embed_dim, embed_dim))
+        self.y_embedding = nn.Sequential(
+            nn.Linear(y_dim, embed_dim), nn.SiLU(),
+            nn.Linear(embed_dim, embed_dim))
+
+        # encoder (the loops rebind in_dim / out_dim, hence self.in_dim here)
+        self.encoder = nn.ModuleList(
+            [nn.Conv2d(self.in_dim, dim, 3, padding=1)])
+        shortcut_dims.append(dim)
+        for i, (in_dim,
+                out_dim) in enumerate(zip(enc_dims[:-1], enc_dims[1:])):
+            for j in range(num_res_blocks):
+                # residual block
+                block = nn.ModuleList([
+                    ResidualBlock(in_dim, embed_dim, out_dim,
+                                  use_scale_shift_norm, 1.0, dropout)
+                ])
+                shortcut_dims.append(out_dim)
+                in_dim = out_dim
+                self.encoder.append(block)
+
+                # downsample at the end of every stage but the last
+                if i != len(dim_mult) - 1 and j == num_res_blocks - 1:
+                    if resblock_resample:
+                        downsample = ResidualBlock(out_dim, embed_dim, out_dim,
+                                                   use_scale_shift_norm, 0.5,
+                                                   dropout)
+                    else:
+                        downsample = Resample(
+                            out_dim, out_dim, 0.5, use_conv=True)
+                    shortcut_dims.append(out_dim)
+                    scale /= 2.0
+                    self.encoder.append(downsample)
+
+        # middle: two residual blocks at the lowest resolution
+        self.middle = nn.ModuleList([
+            ResidualBlock(out_dim, embed_dim, out_dim, use_scale_shift_norm,
+                          1.0, dropout),
+            ResidualBlock(out_dim, embed_dim, out_dim, use_scale_shift_norm,
+                          1.0, dropout)
+        ])
+
+        # decoder: the first block of each unit also takes a popped skip width
+        self.decoder = nn.ModuleList()
+        for i, (in_dim,
+                out_dim) in enumerate(zip(dec_dims[:-1], dec_dims[1:])):
+            for j in range(num_res_blocks + 1):
+                # residual block
+                block = nn.ModuleList([
+                    ResidualBlock(in_dim + shortcut_dims.pop(), embed_dim,
+                                  out_dim, use_scale_shift_norm, 1.0, dropout)
+                ])
+                in_dim = out_dim
+
+                # upsample at the end of every stage but the last
+                if i != len(dim_mult) - 1 and j == num_res_blocks:
+                    if resblock_resample:
+                        upsample = ResidualBlock(out_dim, embed_dim, out_dim,
+                                                 use_scale_shift_norm, 2.0,
+                                                 dropout)
+                    else:
+                        upsample = Resample(
+                            out_dim, out_dim, 2.0, use_conv=True)
+                    scale *= 2.0
+                    block.append(upsample)
+                self.decoder.append(block)
+
+        # head (self.out_dim is the constructor argument, not the loop's)
+        self.head = nn.Sequential(
+            nn.GroupNorm(32, out_dim), nn.SiLU(),
+            nn.Conv2d(out_dim, self.out_dim, 3, padding=1))
+
+        # zero the weight of the final conv (its bias keeps the default init)
+        nn.init.zeros_(self.head[-1].weight)
+
+    def forward(self, x, t, y, concat):
+        # embedding; x and concat are first joined along the channel axis
+        x = torch.cat([x, concat], dim=1)
+        e = self.time_embedding(sinusoidal_embedding(
+            t, self.dim)) + self.y_embedding(y)
+
+        # encoder: every output is kept for the skip connections
+        xs = []
+        for block in self.encoder:
+            x = self._forward_single(block, x, e)
+            xs.append(x)
+
+        # middle
+        for block in self.middle:
+            x = self._forward_single(block, x, e)
+
+        # decoder: the most recent unused skip is concatenated before a block
+        for block in self.decoder:
+            x = torch.cat([x, xs.pop()], dim=1)
+            x = self._forward_single(block, x, e)
+
+        # head
+        x = self.head(x)
+        return x
+
+    def _forward_single(self, module, x, e):
+        if isinstance(module, ResidualBlock):  # takes the embedding
+            x = module(x, e)
+        elif isinstance(module, nn.ModuleList):  # run members in order
+            for block in module:
+                x = self._forward_single(block, x, e)
+        else:  # any other layer is called with x only
+            x = module(x)
+        return x
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py b/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py
new file mode 100644
index 00000000..133da50b
--- /dev/null
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py
@@ -0,0 +1,206 @@
+# Part of the implementation is borrowed and modified from HuggingFace XGLM,
+# publicly available at https://github.com/huggingface/transformers.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+__all__ = ['XGLM']
+
+
+def sinusoidal_embedding(seq_len, dim, pad_token=None):
+    # [seq_len, dim] table: sines, cosines, plus a zero column for odd dim
+    n_freq = dim // 2
+    positions = torch.arange(seq_len, dtype=torch.float32)
+    exponents = -torch.arange(n_freq, dtype=torch.float32).div(n_freq - 1)
+    angles = torch.outer(positions, torch.pow(10000, exponents))
+    table = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
+    if dim % 2 == 1:
+        table = torch.cat([table, torch.zeros_like(table[:, :1])], dim=1)
+    if pad_token is not None:
+        table[pad_token, :] = 0  # the padding row embeds to all zeros
+    return table
+
+
+class SinusoidalEmbedding(nn.Module):
+    r"""Fixed sinusoidal position table looked up by non-padding position."""
+    def __init__(self, seq_len, dim, pad_token):
+        super(SinusoidalEmbedding, self).__init__()
+        self.seq_len, self.dim, self.pad_token = seq_len, dim, pad_token
+        # two rows more than ``seq_len``; row ``pad_token`` is zeroed
+        table = sinusoidal_embedding(seq_len + 2, dim, pad_token)
+        self.register_buffer('weight', table)
+
+    def forward(self, tokens):
+        # non-pad tokens are numbered 1, 2, ... and shifted by ``pad_token``;
+        # padding positions all select row ``pad_token`` of the table
+        keep = tokens.ne(self.pad_token).long()
+        rows = torch.cumsum(keep, dim=1) * keep + self.pad_token
+        flat = self.weight.index_select(0, rows.view(-1))
+        return flat.view(*tokens.shape, -1)
+
+
+class GELU(nn.Module):
+    r"""Erf-based GELU, i.e. ``0.5 * x * (1 + erf(x / sqrt(2)))``."""
+    def forward(self, x):
+        return x.mul(0.5).mul(torch.erf(x.div(math.sqrt(2.0))).add(1.0))
+
+
+class SelfAttention(nn.Module):
+    r"""Multi-head self-attention over a ``[B, L, C]`` sequence."""
+    def __init__(self, dim, num_heads, dropout=0.1):
+        assert dim % num_heads == 0
+        super(SelfAttention, self).__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = 1.0 / math.sqrt(self.head_dim)
+
+        # projections, registered in the order q, k, v, o
+        for name in ('q', 'k', 'v', 'o'):
+            setattr(self, name, nn.Linear(dim, dim))
+
+        # a single dropout module serves the attention map and the output
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x, mask=None):
+        r"""x:      [B, L, C].
+            mask:   [B, *, L, L] or None; zero entries are masked out.
+        """
+        batch, length = x.shape[:2]
+        heads, width = self.num_heads, self.head_dim
+
+        # project, then split the channels into heads: [B, L, n, c]
+        query = self.q(x).view(batch, length, heads, width)
+        key = self.k(x).view(batch, length, heads, width)
+        value = self.v(x).view(batch, length, heads, width)
+
+        # scaled dot-product scores: [B, n, L, L]
+        scores = self.scale * torch.einsum('binc,bjnc->bnij', query, key)
+        if mask is not None:
+            scores = scores.masked_fill(mask == 0, float('-inf'))
+        probs = F.softmax(scores, dim=-1)
+        probs = self.dropout(probs)
+
+        # weighted sum of the values, heads merged back into the channels
+        merged = torch.einsum('bnij,bjnc->binc', probs, value)
+        merged = merged.reshape(batch, length, -1)
+
+        # output projection followed by dropout
+        out = self.o(merged)
+        return self.dropout(out)
+
+
+class AttentionBlock(nn.Module):
+    r"""Pre-norm transformer block: self-attention, then a two-layer FFN."""
+    def __init__(self, dim, ffn_dim, ffn_act, num_heads, dropout=0.1):
+        assert ffn_act in ['gelu', 'relu']
+        super(AttentionBlock, self).__init__()
+        self.dim, self.ffn_dim = dim, ffn_dim
+        self.ffn_act, self.num_heads = ffn_act, num_heads
+
+        # attention sub-layer
+        self.norm1 = nn.LayerNorm(dim)
+        self.attn = SelfAttention(dim, num_heads, dropout)
+
+        # feed-forward sub-layer
+        self.norm2 = nn.LayerNorm(dim)
+        expand = nn.Linear(dim, ffn_dim)
+        act = GELU() if ffn_act == 'gelu' else nn.ReLU(inplace=True)
+        shrink = nn.Linear(ffn_dim, dim)
+        self.ffn = nn.Sequential(expand, act, shrink, nn.Dropout(dropout))
+
+    def forward(self, x, mask=None):
+        # each sub-layer normalizes its input and adds its output back to x
+        attended = x + self.attn(self.norm1(x), mask)
+        return attended + self.ffn(self.norm2(attended))
+
+
+class XGLM(nn.Module):
+    r"""A multilingual GPT model with an embedding head; the output is read
+    from a learned embedding appended after the input tokens."""
+
+    def __init__(self,
+                 vocab_size=256008,
+                 max_seq_len=2048,
+                 dim=1024,
+                 ffn_dim=4096,
+                 ffn_act='gelu',
+                 embed_dim=768,
+                 num_heads=16,
+                 num_layers=24,
+                 pad_token=1,
+                 dropout=0.1):
+        super(XGLM, self).__init__()
+        self.vocab_size = vocab_size
+        self.max_seq_len = max_seq_len
+        self.dim = dim
+        self.ffn_dim = ffn_dim
+        self.ffn_act = ffn_act
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.num_layers = num_layers
+        self.pad_token = pad_token
+        self.scale = math.sqrt(dim)  # rescale token embeddings
+
+        # layers (pos_embedding is constructed but not applied in forward)
+        self.token_embedding = nn.Embedding(vocab_size, dim, pad_token)
+        self.pos_embedding = SinusoidalEmbedding(max_seq_len, dim, pad_token)
+        self.eos_embedding = nn.Parameter(torch.randn(1, 1, dim))
+        self.dropout = nn.Dropout(dropout)
+        self.blocks = nn.ModuleList([
+            AttentionBlock(dim, ffn_dim, ffn_act, num_heads, dropout)
+            for _ in range(num_layers)
+        ])
+        self.norm = nn.LayerNorm(dim)
+        self.head = nn.Linear(dim, embed_dim, bias=False)
+
+        # causal attention mask, one position longer for the appended token
+        self.register_buffer(
+            'attn_mask',
+            torch.tril(torch.ones(1, 1, 1 + max_seq_len, 1 + max_seq_len)))
+
+        # init weights of every Linear / Embedding sub-module
+        self.apply(self.init_weights)
+
+    def forward(self, tokens, mask=None):
+        r"""tokens: [B, L]. Returns: [B, embed_dim].
+            mask:   [B, L] or None; zero entries are masked out as keys.
+        """
+        b, seq_len = tokens.size(0), 1 + tokens.size(1)
+
+        # embeddings: scaled tokens followed by the learned eos embedding
+        x = self.scale * self.token_embedding(tokens)
+        x = torch.cat([x, self.eos_embedding.repeat(b, 1, 1)], dim=1)
+        # x = x + self.pos_embedding(tokens)  (positional term is disabled)
+        x = self.dropout(x)
+        # attention mask. NOTE(review): a user mask gets a 0 appended for the
+        # extra position, so no query (itself included) attends to it; confirm
+        if mask is None:
+            mask = self.attn_mask[:, :, :seq_len, :seq_len].repeat(b, 1, 1, 1)
+        else:
+            mask = self.attn_mask[:, :, :seq_len, :seq_len] * torch.cat(
+                [mask, torch.zeros_like(mask[:, :1])], dim=1).view(
+                    b, 1, 1, seq_len)
+
+        # transformer: all blocks share the same mask, then a final LayerNorm
+        for block in self.blocks:
+            x = block(x, mask)
+        x = self.norm(x)
+
+        # head: only the appended last position is projected to embed_dim
+        logits = self.head(x[:, -1])
+        return logits
+
+    def init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            nn.init.normal_(m.weight, std=0.02)
+            if m.bias is not None:
+                nn.init.zeros_(m.bias)
+        elif isinstance(m, nn.Embedding):
+            nn.init.normal_(m.weight, std=0.02)
+            if m.padding_idx is not None:
+                nn.init.zeros_(m.weight[m.padding_idx])  # zero the pad row
diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py
index 860b68d3..45bafde9 100644
--- a/modelscope/models/multi_modal/ofa_for_all_tasks.py
+++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py
@@ -112,8 +112,6 @@ class OfaForAllTasks(TorchModel):
                 OutputKeys.CAPTION, OutputKeys.TEXT, OutputKeys.BOXES,
                 OutputKeys.LABELS, OutputKeys.SCORES
         ]:
-            if key in ret and len(ret[key]) == 1:
-                ret[key] = ret[key][0]
             if key not in ret:
                 ret[key] = None
         return ret
@@ -121,8 +119,10 @@ class OfaForAllTasks(TorchModel):
     def postprocess(self, input: Dict[str, Tensor],
                     **kwargs) -> Dict[str, Tensor]:
         if self.cfg.task == Tasks.image_captioning:
-            caption = input[OutputKeys.CAPTION]
-            caption = caption.translate(self.transtab).strip()
+            caption = [
+                cap.translate(self.transtab).strip()
+                for cap in input[OutputKeys.CAPTION]
+            ]
             input[OutputKeys.CAPTION] = caption
         return input
 
@@ -152,8 +152,8 @@ class OfaForAllTasks(TorchModel):
         region_tensor[:, ::2] /= input['w_resize_ratios']
         region_tensor[:, 1::2] /= input['h_resize_ratios']
         return {
-            OutputKeys.BOXES: move_to_device(region_tensor,
-                                             torch.device('cpu')),
+            OutputKeys.BOXES:
+            move_to_device(region_tensor, torch.device('cpu')).tolist(),
             OutputKeys.SCORES: [1.0] * region_tensor.shape[0]
         }
 
diff --git a/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py b/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py
index 5cdc9668..b942e3fa 100644
--- a/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py
+++ b/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py
@@ -6,17 +6,30 @@ import numpy as np
 import torch
 import torch.cuda
 from PIL import Image
+from pkg_resources import packaging
 from taming.models.vqgan import GumbelVQ, VQModel
+from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize,
+                                    ToTensor)
 
 from modelscope.metainfo import Models
 from modelscope.models.base import Model
 from modelscope.models.builder import MODELS
+from modelscope.models.multi_modal.mmr.models.module_clip import CLIP
+from modelscope.models.multi_modal.mmr.models.tokenization_clip import \
+    SimpleTokenizer as ClipTokenizer
 from modelscope.models.multi_modal.ofa import OFAModel, OFATokenizer
 from modelscope.models.multi_modal.ofa.generate import sequence_generator as sg
 from modelscope.models.multi_modal.ofa.generate.search import Sampling
 from modelscope.models.multi_modal.ofa.generate.utils import move_to_device
 from modelscope.utils.constant import Tasks
 
+try:
+    from torchvision.transforms import InterpolationMode
+
+    BICUBIC = InterpolationMode.BICUBIC
+except ImportError:
+    BICUBIC = Image.BICUBIC
+
 __all__ = ['OfaForTextToImageSynthesis']
 
 
@@ -43,6 +56,74 @@ def load_vqgan(config, ckpt_path=None, is_gumbel=False):
     return model.eval()
 
 
+def build_clip_model(model_path):
+    state_dict = torch.load(model_path, map_location='cpu').state_dict()
+    vit = 'visual.proj' in state_dict  # else the layer1..4/attnpool layout
+    if vit:
+        vision_width = state_dict['visual.conv1.weight'].shape[0]
+        vision_layers = len([
+            k for k in state_dict.keys()
+            if k.startswith('visual.') and k.endswith('.attn.in_proj_weight')
+        ])
+        vision_patch_size = state_dict['visual.conv1.weight'].shape[-1]
+        grid_size = round(
+            (state_dict['visual.positional_embedding'].shape[0] - 1)**0.5)
+        image_resolution = vision_patch_size * grid_size
+    else:
+        counts: list = [
+            len(
+                set(
+                    k.split('.')[2] for k in state_dict
+                    if k.startswith(f'visual.layer{b}')))
+            for b in [1, 2, 3, 4]
+        ]
+        vision_layers = tuple(counts)
+        vision_width = state_dict['visual.layer1.0.conv1.weight'].shape[0]
+        output_width = round(
+            (state_dict['visual.attnpool.positional_embedding'].shape[0]
+             - 1)**0.5)
+        vision_patch_size = None
+        assert output_width**2 + 1 == state_dict[
+            'visual.attnpool.positional_embedding'].shape[0]
+        image_resolution = output_width * 32
+    # text-tower sizes are likewise read off tensor shapes and key names
+    embed_dim = state_dict['text_projection'].shape[1]
+    context_length = state_dict['positional_embedding'].shape[0]
+    vocab_size = state_dict['token_embedding.weight'].shape[0]
+    transformer_width = state_dict['ln_final.weight'].shape[0]
+    transformer_heads = transformer_width // 64
+    transformer_layers = len(
+        set(
+            k.split('.')[2] for k in state_dict
+            if k.startswith('transformer.resblocks')))
+    # instantiate the architecture with the inferred sizes
+    model = CLIP(embed_dim, image_resolution, vision_layers, vision_width,
+                 vision_patch_size, context_length, vocab_size,
+                 transformer_width, transformer_heads, transformer_layers)
+    # these keys, when present, are removed before load_state_dict
+    for key in ['input_resolution', 'context_length', 'vocab_size']:
+        if key in state_dict:
+            del state_dict[key]
+    # load the weights (strict by default) and switch to eval mode
+    model.load_state_dict(state_dict)
+    return model.eval()
+
+
+def _convert_image_to_rgb(image):
+    return image.convert('RGB')  # relies on a PIL-style ``convert`` method
+
+
+def build_clip_transform(n_px):
+    # bicubic resize + center crop to n_px, RGB, tensor, per-channel normalize
+    channel_mean = (0.48145466, 0.4578275, 0.40821073)
+    channel_std = (0.26862954, 0.26130258, 0.27577711)
+    steps = [
+        Resize(n_px, interpolation=BICUBIC), CenterCrop(n_px),
+        _convert_image_to_rgb, ToTensor(), Normalize(channel_mean, channel_std)
+    ]
+    return Compose(steps)
+
+
 @MODELS.register_module(Tasks.text_to_image_synthesis, module_name=Models.ofa)
 class OfaForTextToImageSynthesis(Model):
 
@@ -65,11 +146,23 @@ class OfaForTextToImageSynthesis(Model):
             vqgan_config,
             ckpt_path=os.path.join(model_dir, 'vqgan_model.ckpt'),
             is_gumbel=True).to(self._device)
+
+        # Initialize OpenAI clip
+
+        self.clip_tokenizer = ClipTokenizer(model_dir)
+        self.clip_model = build_clip_model(
+            os.path.join(model_dir, 'ViT-B-16.pt'))
+        self.clip_preprocess = build_clip_transform(
+            self.clip_model.visual.input_resolution)
+
+        self.clip_model.to(self._device)
+        self.clip_model.eval()
+
         # Initialize generator
         sampling = Sampling(self.tokenizer, sampling_topp=0.9)
         sg_args = {
             'tokenizer': self.tokenizer,
-            'beam_size': 1,
+            'beam_size': 2,
             'max_len_b': 1024,
             'min_len': 1024,
             'search_strategy': sampling,
@@ -78,13 +171,68 @@ class OfaForTextToImageSynthesis(Model):
         }
         self.generator = sg.SequenceGenerator(**sg_args)
 
+    def clip_tokenize(self, texts, context_length=77, truncate=False):
+        """Encode text(s) as a zero-padded (len(texts), context_length) tensor of CLIP token ids."""
+        if isinstance(texts, str):
+            texts = [texts]
+
+        sot_token = self.clip_tokenizer.encoder['<|startoftext|>']
+        eot_token = self.clip_tokenizer.encoder['<|endoftext|>']
+        all_tokens = [[sot_token] + self.clip_tokenizer.encode(text)
+                      + [eot_token] for text in texts]
+        if packaging.version.parse(
+                torch.__version__) < packaging.version.parse('1.8.0'):  # torch before 1.8.0: int64 ids, otherwise int32
+            result = torch.zeros(
+                len(all_tokens), context_length, dtype=torch.long)
+        else:
+            result = torch.zeros(
+                len(all_tokens), context_length, dtype=torch.int)
+
+        for i, tokens in enumerate(all_tokens):
+            if len(tokens) > context_length:
+                if truncate:
+                    tokens = tokens[:context_length]
+                    tokens[-1] = eot_token  # a truncated sequence still ends with the end-of-text id
+                else:
+                    raise RuntimeError(
+                        f'Input {texts[i]} is too long for context length {context_length}'
+                    )
+            result[i, :len(tokens)] = torch.tensor(tokens)
+
+        return result
+
     def forward(self, input: Dict[str, Any]):
+        """Generate candidate images for the prompt and return the one CLIP scores closest to the text."""
+        text = input['samples'][0]['text']
         input = move_to_device(input, self._device)
+        clip_text_input = self.clip_tokenize([text]).to(self._device)
+
         gen_output = self.generator.generate([self.model], input)
-        gen_tokens = gen_output[0][0]['tokens'][:-1]
-        codes = gen_tokens.view(1, 32, 32) - 50265
+        gen_tokens = torch.stack(
+            [item['tokens'][:-1] for item in gen_output[0]], dim=0)  # one row per generated candidate
+        codes = gen_tokens.view(-1, 32, 32) - 50265  # NOTE(review): 50265 looks like the vocab offset of image codes - confirm
+
         quant_b = self.vqgan_model.quantize.get_codebook_entry(
             codes.view(-1),
             list(codes.size()) + [self.vqgan_model.quantize.embedding_dim])
-        dec = self.vqgan_model.decode(quant_b)[0]
-        return custom_to_pil(dec)
+        imgs = self.vqgan_model.decode(quant_b)
+
+        sample_num = imgs.size()[0]
+        pil_imgs = [custom_to_pil(imgs[i]) for i in range(sample_num)]
+
+        clip_image_input = torch.stack(
+            [self.clip_preprocess(img) for img in pil_imgs],
+            dim=0).to(self._device)
+
+        with torch.no_grad():
+            hyp_image_features = self.clip_model.encode_image(clip_image_input)
+            hyp_image_features /= hyp_image_features.norm(dim=-1, keepdim=True)  # unit-length image features
+            text_features = self.clip_model.encode_text(clip_text_input)
+            text_features /= text_features.norm(dim=-1, keepdim=True)  # unit-length text features
+        ti_similarity = hyp_image_features @ text_features.T  # dot products of unit vectors
+
+        sorted_score, ti_indices = torch.sort(
+            ti_similarity.view(-1), descending=True)
+
+        pil_imgs_orderby_ti = [pil_imgs[index] for index in ti_indices]
+        return pil_imgs_orderby_ti[0]  # candidate with the highest text-image similarity
diff --git a/modelscope/models/multi_modal/team/__init__.py b/modelscope/models/multi_modal/team/__init__.py
new file mode 100644
index 00000000..0597040c
--- /dev/null
+++ b/modelscope/models/multi_modal/team/__init__.py
@@ -0,0 +1 @@
+from .team_model import TEAMForMultiModalSimilarity
diff --git a/modelscope/models/multi_modal/team/team_model.py b/modelscope/models/multi_modal/team/team_model.py
new file mode 100644
index 00000000..8c0e288a
--- /dev/null
+++ b/modelscope/models/multi_modal/team/team_model.py
@@ -0,0 +1,163 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from PIL import Image
+from tokenizers import BertWordPieceTokenizer
+from torchvision.transforms import Compose, Normalize, Resize, ToTensor
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.outputs import OutputKeys
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .utils import TEAM, BertWrapper, CLIPVisionWrapper, CrossLayer
+
+logger = get_logger()
+
+__all__ = ['TEAMForMultiModalSimilarity']
+
+
+@MODELS.register_module(Tasks.multi_modal_similarity, module_name=Models.team)
+class TEAMForMultiModalSimilarity(TorchModel):
+    """TEAM model wrapper that embeds an image and/or a text and, when both
+    are given, also scores how well they match.
+    """
+
+    def __init__(self, model_dir, device_id=0, *args, **kwargs):
+        """Build the TEAM network from `model_dir` and set up preprocessing.
+
+        Args:
+            model_dir: directory containing `text_config.json`, the weights
+                file `ModelFile.TORCH_MODEL_BIN_FILE` and the vocabulary
+                file `ModelFile.VOCAB_FILE`.
+            device_id: CUDA device index. A negative value, or CUDA being
+                unavailable, selects CPU inference.
+        """
+        super().__init__(
+            model_dir=model_dir, device_id=device_id, *args, **kwargs)
+
+        text_model = BertWrapper(
+            config_json='{}/text_config.json'.format(model_dir),
+            feat_dim=768,
+            token_dim=1024)
+        text_model.bert.cls = None
+        image_model = CLIPVisionWrapper()
+
+        self.model = TEAM(
+            text_model,
+            image_model,
+            pretrained='{}/{}'.format(model_dir,
+                                      ModelFile.TORCH_MODEL_BIN_FILE))
+        self.model.eval()
+
+        self.device_id = device_id
+        if self.device_id >= 0 and torch.cuda.is_available():
+            self.model.to('cuda:{}'.format(self.device_id))
+            logger.info('Use GPU: {}'.format(self.device_id))
+        else:
+            self.device_id = -1
+            logger.info('Use CPU for inference')
+
+        self.text_tokenizer = BertWordPieceTokenizer(
+            '{}/{}'.format(model_dir, ModelFile.VOCAB_FILE), lowercase=False)
+        self.text_tokenizer.enable_truncation(max_length=30)
+
+        norm_op = Normalize((0.48145466, 0.4578275, 0.40821073),
+                            (0.26862954, 0.26130258, 0.27577711))
+        self.img_preprocessor = Compose([
+            Resize((224, 224), interpolation=Image.BICUBIC),
+            ToTensor(), norm_op
+        ])
+
+    def tokenize_text(self, text_str):
+        """Encode `text_str` into fixed-size id and attention-mask tensors.
+
+        Returns:
+            `(ids, mask)`, each of shape (1, 30); positions beyond the
+            encoded tokens are left at zero.
+        """
+        tokens = self.text_tokenizer.encode(text_str)
+        max_tokens = 30
+        text_ids_tensor = torch.zeros((1, max_tokens)).long()
+        text_mask_tensor = torch.zeros((1, max_tokens))
+        text_ids, text_mask = tokens.ids, tokens.attention_mask
+        text_ids_tensor[0, 0:len(text_ids)] = torch.tensor(text_ids)
+        text_mask_tensor[0, 0:len(text_mask)] = torch.tensor(text_mask)
+        return text_ids_tensor, text_mask_tensor
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Compute embeddings for the given image and/or text.
+
+        Args:
+            input: may contain `img` and/or `text` (a str); a missing or
+                `None` entry skips that modality.
+
+        Returns:
+            Dict with `OutputKeys.IMG_EMBEDDING` and
+            `OutputKeys.TEXT_EMBEDDING` (numpy arrays, or `None` for a
+            skipped modality) and `OutputKeys.SCORES` (a float when both
+            modalities are present, otherwise `None`).
+
+        Raises:
+            TypeError: if `text` is present but is not a str.
+        """
+        with torch.no_grad():
+            if 'img' in input and input['img'] is not None:
+                input_img = input['img']
+                input_img = LoadImage.convert_to_img(input_img)
+                img_tensor = self.img_preprocessor(input_img)[None, ...]
+
+                if self.device_id >= 0:
+                    img_tensor = img_tensor.to('cuda:{}'.format(
+                        self.device_id))
+                _, _, image_feature, image_tensors = self.model.get_feature(
+                    None, None, img_tensor)
+                image_feature = image_feature.cpu().numpy()
+            else:
+                image_feature, image_tensors = None, None
+
+            if 'text' in input and input['text'] is not None:
+                text_str = input['text']
+                if isinstance(text_str, str):
+                    text_ids_tensor, text_mask_tensor = self.tokenize_text(
+                        text_str)
+                else:
+                    raise TypeError(
+                        f'text should be str, but got {type(text_str)}')
+
+                if self.device_id >= 0:
+                    text_ids_tensor = text_ids_tensor.to('cuda:{}'.format(
+                        self.device_id))
+                    text_mask_tensor = text_mask_tensor.to('cuda:{}'.format(
+                        self.device_id))
+                text_feature, text_tensors, _, _ = self.model.get_feature(
+                    text_ids_tensor, text_mask_tensor, None)
+                text_feature = text_feature.cpu().numpy()
+            else:
+                # No text given: every text-side value, including the
+                # embedding returned below, must still be defined.
+                text_feature, text_tensors, text_mask_tensor = None, None, None
+
+            if text_tensors is not None and text_mask_tensor is not None and image_tensors is not None:
+                score = self.model.get_cross_score(text_tensors,
+                                                   text_mask_tensor,
+                                                   image_tensors)[0].item()
+            else:
+                score = None
+            output = {
+                OutputKeys.IMG_EMBEDDING: image_feature,
+                OutputKeys.TEXT_EMBEDDING: text_feature,
+                OutputKeys.SCORES: score
+            }
+            return output
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """Return `inputs` unchanged."""
+        return inputs
diff --git a/modelscope/models/multi_modal/team/utils.py b/modelscope/models/multi_modal/team/utils.py
new file mode 100644
index 00000000..73919179
--- /dev/null
+++ b/modelscope/models/multi_modal/team/utils.py
@@ -0,0 +1,329 @@
+# Copyright 2021 The OpenAI Team Authors.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+#
+# The implementation here is modified based on OpenAI CLIP,
+# originally MIT License, Copyright (c) 2021 OpenAI,
+# and publicly available at https://github.com/openai/CLIP/.
+
+from collections import OrderedDict
+from typing import Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+from torch import nn
+from transformers import BertConfig, BertForMaskedLM
+
+
+class LayerNorm(nn.LayerNorm):
+    """Subclass torch's LayerNorm to handle fp16."""
+
+    def forward(self, x: torch.Tensor):
+        orig_type = x.dtype
+        ret = super().forward(x.type(torch.float32))
+        return ret.type(orig_type)
+
+
+class QuickGELU(nn.Module):
+
+    def forward(self, x: torch.Tensor):
+        return x * torch.sigmoid(1.702 * x)
+
+
+class ResidualAttentionBlock(nn.Module):  # LayerNorm -> self-attention, then LayerNorm -> MLP, each added back to x
+
+    def __init__(self,
+                 d_model: int,
+                 n_head: int,
+                 attn_mask: torch.Tensor = None):
+        super().__init__()
+
+        self.attn = nn.MultiheadAttention(d_model, n_head)
+        self.ln_1 = LayerNorm(d_model)
+        self.mlp = nn.Sequential(
+            OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
+                         ('gelu', QuickGELU()),
+                         ('c_proj', nn.Linear(d_model * 4, d_model))]))
+        self.ln_2 = LayerNorm(d_model)
+        self.attn_mask = attn_mask
+
+    def attention(self, x: torch.Tensor):
+        self.attn_mask = self.attn_mask.to(
+            dtype=x.dtype,
+            device=x.device) if self.attn_mask is not None else None  # stored mask is re-cast to x's dtype/device
+        return self.attn(
+            x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]  # [0]: attention output only
+
+    def forward(self, x: torch.Tensor):
+        x = x + self.attention(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+
+
+class Transformer(nn.Module):  # `layers` ResidualAttentionBlocks applied one after another
+
+    def __init__(self,
+                 width: int,
+                 layers: int,
+                 heads: int,
+                 attn_mask: torch.Tensor = None,
+                 use_gc=False):
+        super().__init__()
+        self.use_gc = use_gc
+        self.width = width
+        self.layers = layers
+        self.resblocks = nn.Sequential(*[
+            ResidualAttentionBlock(width, heads, attn_mask)
+            for _ in range(layers)
+        ])
+
+    def forward(self, x: torch.Tensor):
+        if self.use_gc:  # use_gc: run every block through torch.utils.checkpoint
+            for each_block in self.resblocks:
+                x = checkpoint.checkpoint(each_block, x)
+            return x
+        else:
+            return self.resblocks(x)
+
+
+class VisionTransformer(nn.Module):
+
+    def __init__(self,
+                 input_resolution: int,
+                 patch_size: int,
+                 width: int,
+                 layers: int,
+                 heads: int,
+                 output_dim: int,
+                 use_gc=False):
+        super().__init__()
+        self.input_resolution = input_resolution
+        self.output_dim = output_dim
+        self.conv1 = nn.Conv2d(
+            in_channels=3,
+            out_channels=width,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias=False)
+
+        scale = width**-0.5
+        self.class_embedding = nn.Parameter(scale * torch.randn(width))
+        self.positional_embedding = nn.Parameter(scale * torch.randn(
+            (input_resolution // patch_size)**2 + 1, width))
+        self.ln_pre = LayerNorm(width)
+
+        self.transformer = Transformer(width, layers, heads, use_gc=use_gc)
+
+        self.ln_post = LayerNorm(width)
+        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
+
+    def forward(self, x: torch.Tensor):
+        x = self.conv1(x)  # shape = [*, width, grid, grid]
+        x = x.reshape(x.shape[0], x.shape[1],
+                      -1)  # shape = [*, width, grid ** 2]
+        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+        class_embedding = self.class_embedding.to(x.dtype) + \
+            torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
+        x = torch.cat([class_embedding, x],
+                      dim=1)  # shape = [*, grid ** 2 + 1, width]
+        x = x + self.positional_embedding.to(x.dtype)
+        x = self.ln_pre(x)
+
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+
+        x = self.ln_post(x[:, 0, :])
+
+        if self.proj is not None:
+            x = x @ self.proj
+
+        return x
+
+
+class CLIPVisionWrapper(nn.Module):  # runs a VisionTransformer step by step and also returns the per-token states
+
+    def __init__(self, ):
+        super().__init__()
+        self.vision_transformer = VisionTransformer(
+            input_resolution=224,
+            patch_size=14,
+            width=1024,
+            layers=24,
+            heads=16,
+            output_dim=768)
+
+    def forward(self, x):
+        x = self.vision_transformer.conv1(x)  # shape = [*, width, grid, grid]
+        x = x.reshape(x.shape[0], x.shape[1],
+                      -1)  # shape = [*, width, grid ** 2]
+        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+        class_embedding = self.vision_transformer.class_embedding.to(x.dtype) + \
+            torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
+        x = torch.cat([class_embedding, x],
+                      dim=1)  # shape = [*, grid ** 2 + 1, width]
+        x = x + self.vision_transformer.positional_embedding.to(x.dtype)
+        x = self.vision_transformer.ln_pre(x)
+
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.vision_transformer.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+
+        x_tensor = x.clone()  # all token states after the transformer, before ln_post
+        x = self.vision_transformer.ln_post(x[:, 0, :])  # position 0 is the class embedding
+
+        if self.vision_transformer.proj is not None:
+            x = x @ self.vision_transformer.proj
+
+        return x, x_tensor  # (projected class-token feature, token states)
+
+
+class BertWrapper(nn.Module):  # BERT encoder followed by linear heads for the CLS feature and the token features
+
+    def __init__(self, config_json, feat_dim, token_dim):
+        super(BertWrapper, self).__init__()
+        bert_config = BertConfig.from_json_file(config_json)
+        self.bert = BertForMaskedLM(bert_config).bert  # keep only the `.bert` submodule of the masked-LM model
+
+        self.projector = nn.Linear(768, feat_dim, bias=False)
+        self.projector_token_embeds = nn.Linear(768, token_dim)
+
+    def forward(self, input_ids, attention_mask):
+        trans_features = {
+            'input_ids': input_ids,
+            'attention_mask': attention_mask
+        }
+        output_states = self.bert(**trans_features, return_dict=False)
+        output_tokens = output_states[0]
+
+        cls_tokens = output_tokens[:, 0, :]  # CLS token is first token
+
+        return self.projector(cls_tokens), self.projector_token_embeds(
+            output_tokens)  # (projected CLS feature, projected per-token features)
+
+
+class Mlp(nn.Module):
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+class CrossLayer(nn.Module):  # self-attention, attention over image tokens, then MLP; each is pre-normed and residual
+
+    def __init__(self, feat_dim, mlp_ratio):
+        super(CrossLayer, self).__init__()
+        self.norm1 = nn.LayerNorm(feat_dim)
+        self.norm2 = nn.LayerNorm(feat_dim)
+        self.norm3 = nn.LayerNorm(feat_dim)
+
+        self.self_attn = nn.MultiheadAttention(
+            embed_dim=feat_dim, num_heads=16)
+        self.cross_attn = nn.MultiheadAttention(
+            embed_dim=feat_dim, num_heads=16)
+        self.ffn = Mlp(
+            in_features=feat_dim,
+            hidden_features=feat_dim * mlp_ratio,
+            drop=0.1)
+
+        self.dropout1 = nn.Dropout(0.1)
+        self.dropout2 = nn.Dropout(0.1)
+        self.dropout3 = nn.Dropout(0.1)
+
+    def forward(self, text_tensors, text_masks, image_tensors,
+                retrieved_tensors):
+        retrieved_tensors_res = self.norm1(retrieved_tensors)
+        retrieved_tensors_res = self.self_attn(  # query/key: text + retrieved, value: retrieved
+            (text_tensors + retrieved_tensors_res).permute(1, 0, 2),
+            (text_tensors + retrieved_tensors_res).permute(1, 0, 2),
+            retrieved_tensors_res.permute(1, 0, 2),
+            key_padding_mask=(text_masks == 0),  # positions where text_masks is 0 are ignored as keys
+        )[0].permute(1, 0, 2)
+        retrieved_tensors = retrieved_tensors + self.dropout1(
+            retrieved_tensors_res)
+
+        retrieved_tensors_res = self.norm2(retrieved_tensors)
+        retrieved_tensors_res = self.cross_attn(  # query: text + retrieved, key/value: image tokens
+            (text_tensors + retrieved_tensors_res).permute(1, 0, 2),
+            image_tensors.permute(1, 0, 2),
+            image_tensors.permute(1, 0, 2))[0].permute(1, 0, 2)
+        retrieved_tensors = retrieved_tensors + self.dropout2(
+            retrieved_tensors_res)
+
+        retrieved_tensors_res = self.norm3(retrieved_tensors)
+        retrieved_tensors = retrieved_tensors + self.dropout3(
+            self.ffn(retrieved_tensors_res))
+
+        return retrieved_tensors
+
+
+class TEAM(nn.Module):  # text encoder + image encoder + cross layer(s) used to score a text/image pair
+
+    def __init__(self, text_model, image_model, pretrained):
+        super(TEAM, self).__init__()
+        self.text_model = text_model
+        self.image_model = image_model
+
+        self.cross_model = nn.ModuleList(
+            [CrossLayer(feat_dim=1024, mlp_ratio=2)])
+
+        self.image_tensor_fc = nn.Linear(1024, 768)
+        self.text_tensor_fc = nn.Linear(1024, 768)
+
+        params = torch.load(pretrained, 'cpu')  # checkpoint is loaded onto the CPU
+        self.load_state_dict(params, strict=True)  # strict: checkpoint keys must match this module exactly
+
+    def get_feature(self, text_data=None, text_mask=None, img_tensor=None):
+        if text_data is not None:
+            text_feature, text_tensors = self.text_model(text_data, text_mask)
+            text_feature = F.normalize(text_feature, p=2.0, dim=1)  # only the pooled feature is L2-normalized
+        else:
+            text_feature, text_tensors = None, None
+
+        if img_tensor is not None:
+            image_feature, image_tensors = self.image_model(img_tensor)
+            image_feature = F.normalize(image_feature, p=2.0, dim=1)  # only the pooled feature is L2-normalized
+        else:
+            image_feature, image_tensors = None, None
+
+        return text_feature, text_tensors, image_feature, image_tensors  # entries are None for an absent modality
+
+    def get_cross_score(self, text_tensors, text_mask, image_tensors):
+        retrieved_tensors = torch.zeros_like(text_tensors)  # starts at zero and is refined by each cross layer
+        pair_score_list = []
+        text_tensors_proj = self.text_tensor_fc(text_tensors)
+        text_mask_float = text_mask.type(text_tensors_proj.dtype)
+        for each_cross_model in self.cross_model:
+            retrieved_tensors = each_cross_model(text_tensors, text_mask,
+                                                 image_tensors,
+                                                 retrieved_tensors)
+            retrieved_tensors_proj = self.image_tensor_fc(retrieved_tensors)
+
+            pair_score = torch.sum(  # per-token cosine similarity
+                F.normalize(retrieved_tensors_proj, p=2.0, dim=2)
+                * F.normalize(text_tensors_proj, p=2.0, dim=2),
+                dim=2)
+            pair_score_reduced = torch.sum(  # mask-weighted mean over tokens; denominator clamped to >= 1
+                pair_score * text_mask_float, dim=1) / torch.clamp(
+                    torch.sum(text_mask_float, dim=1), min=1.0)
+            pair_score_list.append(pair_score_reduced)
+        return pair_score_list  # one score tensor per cross layer
diff --git a/modelscope/models/nlp/T5/__init__.py b/modelscope/models/nlp/T5/__init__.py
new file mode 100644
index 00000000..7c1cea36
--- /dev/null
+++ b/modelscope/models/nlp/T5/__init__.py
@@ -0,0 +1,21 @@
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .t5_for_text_generation import T5ForConditionalGeneration
+
+else:
+    _import_structure = {
+        't5_for_text_generation': ['T5ForConditionalGeneration'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/nlp/T5/configuration_t5.py b/modelscope/models/nlp/T5/configuration_t5.py
new file mode 100644
index 00000000..117a6bc1
--- /dev/null
+++ b/modelscope/models/nlp/T5/configuration_t5.py
@@ -0,0 +1,174 @@
+# Copyright 2020, The T5 Authors and HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" T5 model configuration"""
+from typing import Mapping
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.onnx import OnnxSeq2SeqConfigWithPast
+
+from modelscope.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class T5Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`T5Model`] or a [`TFT5Model`]. It is used to
+    instantiate a T5 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the T5
+    [t5-small](https://huggingface.co/t5-small) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Arguments:
+        vocab_size (`int`, *optional*, defaults to 32128):
+            Vocabulary size of the T5 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`T5Model`] or [`TFT5Model`].
+        d_model (`int`, *optional*, defaults to 512):
+            Size of the encoder layers and the pooler layer.
+        d_kv (`int`, *optional*, defaults to 64):
+            Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model //
+            num_heads`.
+        d_ff (`int`, *optional*, defaults to 2048):
+            Size of the intermediate feed forward layer in each `T5Block`.
+        num_layers (`int`, *optional*, defaults to 6):
+            Number of hidden layers in the Transformer encoder.
+        num_decoder_layers (`int`, *optional*):
+            Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set.
+        num_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
+            The number of buckets to use for each attention layer.
+        relative_attention_max_distance (`int`, *optional*, defaults to 128):
+            The maximum distance of the longer sequences for the bucket separation.
+        dropout_rate (`float`, *optional*, defaults to 0.1):
+            The ratio for all dropout layers.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-6):
+            The epsilon used by the layer normalization layers.
+        initializer_factor (`float`, *optional*, defaults to 1):
+            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+            testing).
+        feed_forward_proj (`string`, *optional*, defaults to `"relu"`):
+            Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`. T5v1.1 uses the
+            `"gated-gelu"` feed forward projection. Original T5 uses `"relu"`.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+    """
+    model_type = 't5'
+    keys_to_ignore_at_inference = ['past_key_values']
+    attribute_map = {
+        'hidden_size': 'd_model',
+        'num_attention_heads': 'num_heads',
+        'num_hidden_layers': 'num_layers'
+    }
+
+    def __init__(self,
+                 vocab_size=32128,
+                 d_model=512,
+                 d_kv=64,
+                 d_ff=2048,
+                 num_layers=6,
+                 num_decoder_layers=None,
+                 num_heads=8,
+                 relative_attention_num_buckets=32,
+                 relative_attention_max_distance=128,
+                 dropout_rate=0.1,
+                 layer_norm_epsilon=1e-6,
+                 initializer_factor=1.0,
+                 feed_forward_proj='relu',
+                 is_encoder_decoder=True,
+                 use_cache=True,
+                 pad_token_id=0,
+                 eos_token_id=1,
+                 **kwargs):
+        self.vocab_size = vocab_size
+        self.d_model = d_model
+        self.d_kv = d_kv
+        self.d_ff = d_ff
+        self.num_layers = num_layers
+        self.num_decoder_layers = (num_decoder_layers if num_decoder_layers
+                                   is not None else self.num_layers
+                                   )  # default: same depth as the encoder
+        self.num_heads = num_heads
+        self.relative_attention_num_buckets = relative_attention_num_buckets
+        self.relative_attention_max_distance = relative_attention_max_distance
+        self.dropout_rate = dropout_rate
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_factor = initializer_factor
+        self.feed_forward_proj = feed_forward_proj
+        self.use_cache = use_cache
+
+        act_info = self.feed_forward_proj.split('-')  # 'gated-gelu' -> ['gated', 'gelu'], 'relu' -> ['relu']
+        self.dense_act_fn = act_info[-1]
+        self.is_gated_act = act_info[0] == 'gated'
+
+        if len(act_info) > 1 and act_info[0] != 'gated' or len(act_info) > 2:
+            raise ValueError(
+                f'`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer. '
+                'Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. '
+                "'gated-gelu' or 'relu'")
+
+        # for backwards compatibility: 'gated-gelu' selects the 'gelu_new' activation
+        if feed_forward_proj == 'gated-gelu':
+            self.dense_act_fn = 'gelu_new'
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            **kwargs,
+        )
+
+
+class T5OnnxConfig(OnnxSeq2SeqConfigWithPast):  # ONNX export description for T5: named input axes and opset
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:  # input name -> {axis index: axis label}
+        common_inputs = {
+            'input_ids': {
+                0: 'batch',
+                1: 'encoder_sequence'
+            },
+            'attention_mask': {
+                0: 'batch',
+                1: 'encoder_sequence'
+            },
+        }
+        if self.use_past:  # with use_past, decoder_input_ids only gets a labelled batch axis
+            common_inputs['attention_mask'][
+                1] = 'past_encoder_sequence + sequence'
+            common_inputs['decoder_input_ids'] = {0: 'batch'}
+            common_inputs['decoder_attention_mask'] = {
+                0: 'batch',
+                1: 'past_decoder_sequence + sequence'
+            }
+        else:
+            common_inputs['decoder_input_ids'] = {
+                0: 'batch',
+                1: 'decoder_sequence'
+            }
+            common_inputs['decoder_attention_mask'] = {
+                0: 'batch',
+                1: 'decoder_sequence'
+            }
+
+        if self.use_past:
+            self.fill_with_past_key_values_(common_inputs, direction='inputs')  # inherited helper; updates the dict in place
+
+        return common_inputs
+
+    @property
+    def default_onnx_opset(self) -> int:
+        return 13
diff --git a/modelscope/models/nlp/T5/modeling_t5.py b/modelscope/models/nlp/T5/modeling_t5.py
new file mode 100644
index 00000000..da50741e
--- /dev/null
+++ b/modelscope/models/nlp/T5/modeling_t5.py
@@ -0,0 +1,2003 @@
+# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch T5 model."""
+
+import copy
+import math
+import os
+import warnings
+from typing import Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from torch.utils.checkpoint import checkpoint
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import (
+    BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions,
+    Seq2SeqLMOutput, Seq2SeqModelOutput)
+from transformers.modeling_utils import (PreTrainedModel,
+                                         find_pruneable_heads_and_indices,
+                                         prune_linear_layer)
+from transformers.utils import (DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings,
+                                add_start_docstrings_to_model_forward,
+                                is_torch_fx_proxy, replace_return_docstrings)
+from transformers.utils.model_parallel_utils import (assert_device_map,
+                                                     get_device_map)
+
+from modelscope.utils.logger import get_logger
+from .configuration_t5 import T5Config
+
+logger = get_logger(__name__)
+
+_CONFIG_FOR_DOC = 'T5Config'
+_TOKENIZER_FOR_DOC = 'T5Tokenizer'
+_CHECKPOINT_FOR_DOC = 't5-small'
+
+####################################################
+# This list contains the ids of the pretrained
+# checkpoints provided for the T5 models
+####################################################
+T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    't5-small',
+    't5-base',
+    't5-large',
+    't5-3b',
+    't5-11b',
+    # See all T5 models at https://huggingface.co/models?filter=t5
+]
+
+
+####################################################
+# This is a conversion method from TF 1.0 to PyTorch
+# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
+####################################################
+def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
+    """Load the weights of a TF T5 checkpoint into `model` and return it.
+
+    Optimizer state and `global_step` are skipped; `config` is not read."""
+    try:
+        import re
+
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            'Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see '
+            'https://www.tensorflow.org/install/ for installation instructions.'
+        )
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info(f'Converting TensorFlow checkpoint from {tf_path}')
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    tf_weights = {}
+    for name, shape in init_vars:
+        logger.info(f'Loading TF weight {name} with shape {shape}')
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        tf_weights[name] = array
+
+    for txt_name in names:
+        name = txt_name.split('/')
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
+        # which are not required for using the pretrained model
+        if any(n in [
+                'adam_v', 'adam_m', 'AdamWeightDecayOptimizer',
+                'AdamWeightDecayOptimizer_1', 'global_step'
+        ] for n in name):
+            logger.info(f"Skipping {'/'.join(name)}")
+            tf_weights.pop(txt_name, None)
+            continue
+        if '_slot_' in name[-1]:
+            logger.info(f"Skipping {'/'.join(name)}")
+            tf_weights.pop(txt_name, None)
+            continue
+        pointer = model
+        array = tf_weights[txt_name]
+
+        for m_name in name:
+            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
+                scope_names = re.split(r'_(\d+)', m_name)
+            else:
+                scope_names = [m_name]
+            if scope_names[0] in ['kernel', 'scale', 'embedding']:
+                pointer = getattr(pointer, 'weight')
+            elif scope_names[0] == 'self_attention':
+                pointer = getattr(pointer, 'layer')
+                pointer = pointer[0]
+            elif scope_names[0] == 'enc_dec_attention':
+                pointer = getattr(pointer, 'layer')
+                pointer = pointer[1]
+            elif scope_names[0] == 'dense_relu_dense':
+                pointer = getattr(pointer, 'layer')
+                pointer = pointer[2]
+            elif scope_names[0] == 'rms_norm':
+                if hasattr(pointer, 'layer_norm'):
+                    pointer = getattr(pointer, 'layer_norm')
+                elif hasattr(pointer, 'final_layer_norm'):
+                    pointer = getattr(pointer, 'final_layer_norm')
+            elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta':
+                pointer = getattr(pointer, 'bias')
+            elif scope_names[0] == 'squad':
+                pointer = getattr(pointer, 'classifier')
+            elif scope_names[0] == 'decoder' and name[1:2] == ['logits']:
+                continue  # slice compare: no IndexError on one-part names
+            elif scope_names[0] == 'logits':
+                pointer = getattr(pointer, 'lm_head')
+            elif scope_names[0] == 'wi' and len(
+                    scope_names) > 1 and scope_names[1].isdigit():
+                pointer = getattr(pointer, f'wi_{scope_names[1]}')
+                continue
+            else:
+                try:
+                    pointer = getattr(pointer, scope_names[0])
+                except AttributeError:
+                    logger.info(f"Skipping {'/'.join(name)}")
+                    continue
+            if len(scope_names) >= 2:
+                num = int(scope_names[1])
+                pointer = pointer[num]
+        if scope_names[0] not in ['kernel', 'scale', 'embedding']:
+            pointer = getattr(pointer, 'weight')
+        if scope_names[0] != 'embedding':
+            logger.info(
+                f'Transposing numpy weight of shape {array.shape} for {name}')
+            array = np.transpose(array)
+        try:
+            assert (
+                pointer.shape == array.shape
+            ), f'Pointer shape {pointer.shape} and array shape {array.shape} mismatched'
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        logger.info(f'Initialize PyTorch weight {name}')
+        pointer.data = torch.from_numpy(array.astype(np.float32))
+        tf_weights.pop(txt_name, None)
+
+    logger.info(
+        f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}."
+    )
+    return model
+
+
+####################################################
+# PyTorch Models are constructed by sub-classing
+# - torch.nn.Module for the layers and
+# - PreTrainedModel for the models (it-self a sub-class of nn.Module)
+####################################################
+PARALLELIZE_DOCSTRING = r"""
+    This is an experimental feature and is a subject to change at a moment's notice.
+
+    Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
+    it will evenly distribute blocks across all devices.
+
+    Args:
+        device_map (`Dict[int, list]`, optional, defaults to None):
+            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
+            automatically mapped to the first device (for esoteric reasons). That means that the first device should
+            have fewer attention modules mapped to it than other devices. For reference, the t5 models have the
+            following number of attention modules:
+
+                - t5-small: 6
+                - t5-base: 12
+                - t5-large: 24
+                - t5-3b: 24
+                - t5-11b: 24
+
+    Example:
+
+    ```python
+    # Here is an example of a device map on a machine with 4 GPUs
+    # using t5-3b, which has a total of 24 attention modules:
+    model = T5ForConditionalGeneration.from_pretrained("t5-3b")
+    device_map = {
+        0: [0, 1, 2],
+        1: [3, 4, 5, 6, 7, 8, 9],
+        2: [10, 11, 12, 13, 14, 15, 16],
+        3: [17, 18, 19, 20, 21, 22, 23],
+    }
+    model.parallelize(device_map)
+    ```
+"""
+DEPARALLELIZE_DOCSTRING = r"""
+    Moves the model to cpu from a model parallel state.
+
+    Example:
+
+    ```python
+    # On a 4 GPU machine with t5-3b:
+    model = T5ForConditionalGeneration.from_pretrained("t5-3b")
+    device_map = {
+        0: [0, 1, 2],
+        1: [3, 4, 5, 6, 7, 8, 9],
+        2: [10, 11, 12, 13, 14, 15, 16],
+        3: [17, 18, 19, 20, 21, 22, 23],
+    }
+    model.parallelize(device_map)  # Splits the model across several devices
+    model.deparallelize()  # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
+    ```
+"""
+
+
+class T5LayerNorm(nn.Module):
+    """Layer norm in the T5 style: scale-only, no bias and no mean subtraction."""
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        """Return `weight * hidden_states * rsqrt(mean(hidden_states ** 2, dim=-1) + eps)`."""
+        # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
+        # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated
+        # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
+        # half-precision inputs is done in fp32
+
+        variance = hidden_states.to(torch.float32).pow(2).mean(
+            -1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance
+                                                    + self.variance_epsilon)
+
+        # convert into half-precision if necessary
+        if self.weight.dtype in [torch.float16, torch.bfloat16]:
+            hidden_states = hidden_states.to(self.weight.dtype)
+
+        return self.weight * hidden_states
+
+
+try:
+    from apex.normalization import FusedRMSNorm
+    # Rebind the module-level name: T5LayerNorm lookups made after this point get FusedRMSNorm
+    T5LayerNorm = FusedRMSNorm  # noqa
+
+    logger.info(
+        'Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm'
+    )
+except ImportError:
+    # apex (or its FusedRMSNorm) cannot be imported: keep the T5LayerNorm class defined above
+    pass
+except Exception:
+    logger.warning(
+        'discovered apex but it failed to load, falling back to T5LayerNorm')
+    pass
+
+
+class T5DenseReluDense(nn.Module):
+    """Feed-forward block computing ``wo(dropout(relu(wi(x))))`` without biases."""
+
+    def __init__(self, config: T5Config):
+        super().__init__()
+        model_dim, inner_dim = config.d_model, config.d_ff
+        self.wi = nn.Linear(model_dim, inner_dim, bias=False)
+        self.wo = nn.Linear(inner_dim, model_dim, bias=False)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(self, hidden_states):
+        """Expand with ``wi``, apply ReLU and dropout, project back with ``wo``."""
+        activated = nn.functional.relu(self.wi(hidden_states))
+        return self.wo(self.dropout(activated))
+
+
+class T5DenseGatedGeluDense(nn.Module):
+    """Gated feed-forward block: ``wo(dropout(gelu_new(wi_0(x)) * wi_1(x)))``."""
+
+    def __init__(self, config: T5Config):
+        super().__init__()
+        model_dim, inner_dim = config.d_model, config.d_ff
+        self.wi_0 = nn.Linear(model_dim, inner_dim, bias=False)
+        self.wi_1 = nn.Linear(model_dim, inner_dim, bias=False)
+        self.wo = nn.Linear(inner_dim, model_dim, bias=False)
+        self.dropout = nn.Dropout(config.dropout_rate)
+        self.gelu_act = ACT2FN['gelu_new']
+
+    def forward(self, hidden_states):
+        """Multiply the activated ``wi_0`` branch by the linear ``wi_1`` branch."""
+        gate = self.gelu_act(self.wi_0(hidden_states))
+        gated = gate * self.wi_1(hidden_states)
+        return self.wo(self.dropout(gated))
+
+
+class T5LayerFF(nn.Module):
+    """Feed-forward sub-layer: layer norm, dense block, dropout, residual add."""
+
+    def __init__(self, config: T5Config):
+        super().__init__()
+        if config.feed_forward_proj == 'relu':
+            self.DenseReluDense = T5DenseReluDense(config)
+        elif config.feed_forward_proj == 'gated-gelu':
+            self.DenseReluDense = T5DenseGatedGeluDense(config)
+        else:
+            # use the `config` argument: this module never sets `self.config`
+            raise ValueError(
+                f'{config.feed_forward_proj} is not supported. Choose between `relu` and `gated-gelu`'
+            )
+
+        self.layer_norm = T5LayerNorm(
+            config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(self, hidden_states):
+        forwarded_states = self.DenseReluDense(self.layer_norm(hidden_states))
+        return hidden_states + self.dropout(forwarded_states)
+
+
+class T5Attention(nn.Module):
+
+    def __init__(self, config: T5Config, has_relative_attention_bias=False):
+        super().__init__()
+        self.is_decoder = config.is_decoder
+        self.has_relative_attention_bias = has_relative_attention_bias
+        self.relative_attention_num_buckets = config.relative_attention_num_buckets
+        self.relative_attention_max_distance = config.relative_attention_max_distance
+        self.d_model = config.d_model
+        self.key_value_proj_dim = config.d_kv  # size of each head's projection
+        self.n_heads = config.num_heads
+        self.dropout = config.dropout_rate  # a probability (not a module), used by forward
+        self.inner_dim = self.n_heads * self.key_value_proj_dim
+
+        # Mesh TensorFlow initialization to avoid scaling before softmax
+        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
+        # Learned bias table indexed by relative-position bucket, one value per head
+        if self.has_relative_attention_bias:
+            self.relative_attention_bias = nn.Embedding(
+                self.relative_attention_num_buckets, self.n_heads)
+        self.pruned_heads = set()  # head indices removed so far by prune_heads
+        self.gradient_checkpointing = False
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return  # nothing to prune
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads)
+        # Prune linear layers: q/k/v with the helper's default dim, o along dim=1
+        self.q = prune_linear_layer(self.q, index)
+        self.k = prune_linear_layer(self.k, index)
+        self.v = prune_linear_layer(self.v, index)
+        self.o = prune_linear_layer(self.o, index, dim=1)
+        # Update hyper params so that forward reshapes with the reduced head count
+        self.n_heads = self.n_heads - len(heads)
+        self.inner_dim = self.key_value_proj_dim * self.n_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @staticmethod
+    def _relative_position_bucket(relative_position,
+                                  bidirectional=True,
+                                  num_buckets=32,
+                                  max_distance=128):
+        """
+        Adapted from Mesh Tensorflow:
+        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
+
+        Convert relative positions (memory_position - query_position, i.e. the offset from the attending
+        position to the attended-to position) into bucket indices for relative attention. If
+        bidirectional=True, half of the buckets are reserved for positive offsets and the other half for the
+        remaining ones; if bidirectional=False, positive offsets are treated like an offset of 0. In both
+        cases small distances get one bucket each, larger distances share logarithmically growing bins, and
+        every distance beyond the last bin (in particular all distances >= max_distance) lands in the final
+        bucket.
+
+        Args:
+            relative_position: an integer Tensor of relative positions
+            bidirectional: a boolean - whether the attention is bidirectional
+            num_buckets: an integer - total number of buckets
+            max_distance: an integer - distance from which the last bucket is used
+
+        Returns:
+            a Tensor with the same shape as relative_position, containing values in the range [0, num_buckets)
+        """
+        bucket_ids = 0
+        if bidirectional:
+            # split the buckets between positive and non-positive offsets
+            num_buckets //= 2
+            is_positive = (relative_position > 0).to(torch.long)
+            bucket_ids += is_positive * num_buckets
+            distance = torch.abs(relative_position)
+        else:
+            # positive offsets collapse to distance 0
+            distance = -torch.min(relative_position,
+                                  torch.zeros_like(relative_position))
+        # now distance is in the range [0, inf)
+
+        # half of the buckets are for exact increments in positions
+        max_exact = num_buckets // 2
+        is_small = distance < max_exact
+
+        # The other half of the buckets are for logarithmically bigger bins in
+        # positions up to max_distance
+        log_fraction = torch.log(distance.float() / max_exact) / math.log(
+            max_distance / max_exact)
+        log_bucket = max_exact + (log_fraction *
+                                  (num_buckets - max_exact)).to(torch.long)
+        last_bucket = torch.full_like(log_bucket, num_buckets - 1)
+        log_bucket = torch.min(log_bucket, last_bucket)
+
+        # exact bucket for small distances, logarithmic bucket otherwise
+        bucket_ids += torch.where(is_small, distance, log_bucket)
+        return bucket_ids
+
+    def compute_bias(self, query_length, key_length):
+        """Compute binned relative position bias.
+
+        Returns the learned bias for every (query, key) offset with shape
+        (1, num_heads, query_length, key_length), on the bias table's device.
+        """
+        bias_device = self.relative_attention_bias.weight.device
+        query_idx = torch.arange(
+            query_length, dtype=torch.long, device=bias_device)
+        key_idx = torch.arange(
+            key_length, dtype=torch.long, device=bias_device)
+        # offsets[i, j] = j - i, shape (query_length, key_length)
+        offsets = key_idx[None, :] - query_idx[:, None]
+        bucket_ids = self._relative_position_bucket(
+            offsets,
+            bidirectional=(not self.is_decoder),
+            num_buckets=self.relative_attention_num_buckets,
+            max_distance=self.relative_attention_max_distance,
+        )
+        # embedding lookup, shape (query_length, key_length, num_heads)
+        per_head_bias = self.relative_attention_bias(bucket_ids)
+        # reorder to shape (1, num_heads, query_length, key_length)
+        return per_head_bias.permute([2, 0, 1]).unsqueeze(0)
+
+    def forward(
+        self,
+        hidden_states,
+        mask=None,
+        key_value_states=None,
+        position_bias=None,
+        past_key_value=None,
+        layer_head_mask=None,
+        query_length=None,
+        use_cache=False,
+        output_attentions=False,
+    ):
+        """Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
+        Returns (attn_output, present_key_value_state, position_bias), followed by attn_weights if output_attentions.
+        """
+        # Input is (batch_size, seq_length, dim)
+        # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)
+        # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head)
+        batch_size, seq_length = hidden_states.shape[:2]
+        # real_seq_length additionally counts the cached positions when past_key_value is given
+        real_seq_length = seq_length
+
+        if past_key_value is not None:
+            assert (
+                len(past_key_value) == 2
+            ), f'past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states'
+            real_seq_length += past_key_value[0].shape[
+                2] if query_length is None else query_length
+
+        key_length = real_seq_length if key_value_states is None else key_value_states.shape[
+            1]
+
+        def shape(states):
+            """projection"""
+            return states.view(batch_size, -1, self.n_heads,
+                               self.key_value_proj_dim).transpose(1, 2)
+
+        def unshape(states):
+            """reshape"""
+            return states.transpose(1, 2).contiguous().view(
+                batch_size, -1, self.inner_dim)
+
+        def project(hidden_states, proj_layer, key_value_states,
+                    past_key_value):
+            """projects hidden states correctly to key/query states"""
+            if key_value_states is None:
+                # self-attn
+                # (batch_size, n_heads, seq_length, dim_per_head)
+                hidden_states = shape(proj_layer(hidden_states))
+            elif past_key_value is None:
+                # cross-attn
+                # (batch_size, n_heads, seq_length, dim_per_head)
+                hidden_states = shape(proj_layer(key_value_states))
+
+            if past_key_value is not None:
+                if key_value_states is None:
+                    # self-attn
+                    # (batch_size, n_heads, key_length, dim_per_head)
+                    hidden_states = torch.cat([past_key_value, hidden_states],
+                                              dim=2)
+                else:
+                    # cross-attn
+                    hidden_states = past_key_value
+            return hidden_states
+
+        # get query states
+        query_states = shape(self.q(
+            hidden_states))  # (batch_size, n_heads, seq_length, dim_per_head)
+
+        # get key/value states
+        key_states = project(
+            hidden_states, self.k, key_value_states,
+            past_key_value[0] if past_key_value is not None else None)
+        value_states = project(
+            hidden_states, self.v, key_value_states,
+            past_key_value[1] if past_key_value is not None else None)
+
+        # compute scores
+        scores = torch.matmul(
+            query_states, key_states.transpose(3, 2)
+        )  # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9
+        # a position_bias passed in by the caller is used as-is; the mask is only added when it is built here
+        if position_bias is None:
+            if not self.has_relative_attention_bias:
+                position_bias = torch.zeros(
+                    (1, self.n_heads, real_seq_length, key_length),
+                    device=scores.device,
+                    dtype=scores.dtype)
+                if self.gradient_checkpointing and self.training:
+                    position_bias.requires_grad = True
+            else:
+                position_bias = self.compute_bias(real_seq_length, key_length)
+
+            # if key and values are already calculated
+            # we want only the last query position bias
+            if past_key_value is not None:
+                position_bias = position_bias[:, :, -hidden_states.size(1):, :]
+
+            if mask is not None:
+                position_bias = position_bias + mask  # (batch_size, n_heads, seq_length, key_length)
+
+        scores += position_bias
+        attn_weights = nn.functional.softmax(
+            scores.float(), dim=-1).type_as(
+                scores)  # (batch_size, n_heads, seq_length, key_length)
+        attn_weights = nn.functional.dropout(
+            attn_weights, p=self.dropout, training=self.training
+        )  # (batch_size, n_heads, seq_length, key_length)
+
+        # Mask heads if we want to
+        if layer_head_mask is not None:
+            attn_weights = attn_weights * layer_head_mask
+
+        attn_output = unshape(torch.matmul(
+            attn_weights, value_states))  # (batch_size, seq_length, dim)
+        attn_output = self.o(attn_output)
+        # the key/value cache is only returned for decoder attention with use_cache set
+        present_key_value_state = (key_states,
+                                   value_states) if (self.is_decoder
+                                                     and use_cache) else None
+        outputs = (attn_output, ) + (present_key_value_state, ) + (
+            position_bias, )
+
+        if output_attentions:
+            outputs = outputs + (attn_weights, )
+        return outputs
+
+
+class T5LayerSelfAttention(nn.Module):
+    """Self-attention sub-layer: layer norm, T5Attention, dropout, residual add."""
+
+    def __init__(self, config, has_relative_attention_bias=False):
+        super().__init__()
+        self.SelfAttention = T5Attention(
+            config, has_relative_attention_bias=has_relative_attention_bias)
+        self.layer_norm = T5LayerNorm(
+            config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_bias=None,
+        layer_head_mask=None,
+        past_key_value=None,
+        use_cache=False,
+        output_attentions=False,
+    ):
+        """Return ``(hidden_states + dropout(attn_output),)`` plus the attention module's other outputs."""
+        attn_result = self.SelfAttention(
+            self.layer_norm(hidden_states),
+            mask=attention_mask,
+            position_bias=position_bias,
+            layer_head_mask=layer_head_mask,
+            past_key_value=past_key_value,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+        )
+        residual_sum = hidden_states + self.dropout(attn_result[0])
+        # everything after the attention output is passed through untouched
+        return (residual_sum, ) + attn_result[1:]
+
+
+class T5LayerCrossAttention(nn.Module):
+    """Cross-attention sub-layer: layer norm, T5Attention over key_value_states, dropout, residual add."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.EncDecAttention = T5Attention(
+            config, has_relative_attention_bias=False)
+        self.layer_norm = T5LayerNorm(
+            config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(
+        self,
+        hidden_states,
+        key_value_states,
+        attention_mask=None,
+        position_bias=None,
+        layer_head_mask=None,
+        past_key_value=None,
+        use_cache=False,
+        query_length=None,
+        output_attentions=False,
+    ):
+        """Return ``(hidden_states + dropout(attn_output),)`` plus the attention module's other outputs."""
+        attn_result = self.EncDecAttention(
+            self.layer_norm(hidden_states),
+            mask=attention_mask,
+            key_value_states=key_value_states,
+            position_bias=position_bias,
+            layer_head_mask=layer_head_mask,
+            past_key_value=past_key_value,
+            use_cache=use_cache,
+            query_length=query_length,
+            output_attentions=output_attentions,
+        )
+        residual_sum = hidden_states + self.dropout(attn_result[0])
+        # everything after the attention output is passed through untouched
+        return (residual_sum, ) + attn_result[1:]
+
+
+class T5Block(nn.Module):
+    """One T5 block: self-attention, cross-attention (decoder configs only), then feed-forward."""
+
+    def __init__(self, config, has_relative_attention_bias=False):
+        super().__init__()
+        self.is_decoder = config.is_decoder
+        self.layer = nn.ModuleList()
+        self.layer.append(
+            T5LayerSelfAttention(
+                config,
+                has_relative_attention_bias=has_relative_attention_bias))
+        if self.is_decoder:
+            self.layer.append(T5LayerCrossAttention(config))
+
+        self.layer.append(T5LayerFF(config))
+
+    @staticmethod
+    def _clamp_inf_fp16(hidden_states):
+        """Clamp an fp16 tensor that contains inf to a finite range; return any other tensor unchanged."""
+        if hidden_states.dtype == torch.float16 and torch.isinf(
+                hidden_states).any():
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(
+                hidden_states, min=-clamp_value, max=clamp_value)
+        return hidden_states
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_bias=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        encoder_decoder_position_bias=None,
+        layer_head_mask=None,
+        cross_attn_layer_head_mask=None,
+        past_key_value=None,
+        use_cache=False,
+        output_attentions=False,
+        return_dict=True,
+    ):
+        """Apply self-attention, cross-attention (decoder with encoder states only) and the feed-forward layer."""
+
+        if past_key_value is not None:
+            if not self.is_decoder:
+                logger.warning(
+                    '`past_key_values` is passed to the encoder. Please make sure this is intended.'
+                )
+            expected_num_past_key_values = 2 if encoder_hidden_states is None else 4
+
+            if len(past_key_value) != expected_num_past_key_values:
+                raise ValueError(
+                    f'There should be {expected_num_past_key_values} past states. '
+                    f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}"
+                    f'Got {len(past_key_value)} past key / value states')
+
+            self_attn_past_key_value = past_key_value[:2]
+            cross_attn_past_key_value = past_key_value[2:]
+        else:
+            self_attn_past_key_value, cross_attn_past_key_value = None, None
+
+        self_attention_outputs = self.layer[0](
+            hidden_states,
+            attention_mask=attention_mask,
+            position_bias=position_bias,
+            layer_head_mask=layer_head_mask,
+            past_key_value=self_attn_past_key_value,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+        )
+        hidden_states, present_key_value_state = self_attention_outputs[:2]
+        attention_outputs = self_attention_outputs[
+            2:]  # Keep self-attention outputs and relative position weights
+
+        # clamp inf values to enable fp16 training
+        hidden_states = self._clamp_inf_fp16(hidden_states)
+
+        do_cross_attention = self.is_decoder and encoder_hidden_states is not None
+        if do_cross_attention:
+            # the actual query length is unknown for cross attention
+            # if using past key value states. Need to inject it here
+            if present_key_value_state is not None:
+                query_length = present_key_value_state[0].shape[2]
+            else:
+                query_length = None
+
+            cross_attention_outputs = self.layer[1](
+                hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                position_bias=encoder_decoder_position_bias,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_value=cross_attn_past_key_value,
+                query_length=query_length,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+            )
+            hidden_states = cross_attention_outputs[0]
+
+            # clamp inf values to enable fp16 training
+            hidden_states = self._clamp_inf_fp16(hidden_states)
+
+            # Combine self attn and cross attn key value states
+            if present_key_value_state is not None:
+                present_key_value_state = present_key_value_state + cross_attention_outputs[
+                    1]
+
+            # Keep cross-attention outputs and relative position weights
+            attention_outputs = attention_outputs + cross_attention_outputs[2:]
+
+        # Apply Feed Forward layer
+        hidden_states = self.layer[-1](hidden_states)
+
+        # clamp inf values to enable fp16 training
+        hidden_states = self._clamp_inf_fp16(hidden_states)
+
+        outputs = (hidden_states, )
+
+        if use_cache:
+            outputs = outputs + (present_key_value_state, ) + attention_outputs
+        else:
+            outputs = outputs + attention_outputs
+
+        # hidden-states, present_key_value_states, (self-attention position
+        # bias), (self-attention weights), (cross-attention position bias),
+        # (cross-attention weights)
+        return outputs
+
+
+class T5PreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface
+    for downloading and loading pretrained models.
+    """
+
+    config_class = T5Config
+    load_tf_weights = load_tf_weights_in_t5
+    base_model_prefix = 'transformer'
+    is_parallelizable = True
+    supports_gradient_checkpointing = True
+
+    @property
+    def dummy_inputs(self):
+        input_ids = torch.tensor(DUMMY_INPUTS)
+        input_mask = torch.tensor(DUMMY_MASK)
+        dummy_inputs = {
+            'decoder_input_ids': input_ids,
+            'input_ids': input_ids,
+            'decoder_attention_mask': input_mask,
+        }
+        return dummy_inputs
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        factor = self.config.initializer_factor  # Used for testing weights initialization
+        if isinstance(module, T5LayerNorm):
+            module.weight.data.fill_(factor * 1.0)
+        elif isinstance(module,
+                        (T5Model, T5ForConditionalGeneration, T5EncoderModel)):
+            # Mesh TensorFlow embeddings initialization See
+            # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
+            module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
+        elif isinstance(module, T5DenseReluDense):
+            # Mesh TensorFlow FF initialization See
+            # https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
+            # and
+            # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
+            module.wi.weight.data.normal_(
+                mean=0.0, std=factor * ((self.config.d_model)**-0.5))
+            if hasattr(module.wi, 'bias') and module.wi.bias is not None:
+                module.wi.bias.data.zero_()
+            module.wo.weight.data.normal_(
+                mean=0.0, std=factor * ((self.config.d_ff)**-0.5))
+            if hasattr(module.wo, 'bias') and module.wo.bias is not None:
+                module.wo.bias.data.zero_()
+        elif isinstance(module, T5DenseGatedGeluDense):
+            module.wi_0.weight.data.normal_(
+                mean=0.0, std=factor * ((self.config.d_model)**-0.5))
+            if hasattr(module.wi_0, 'bias') and module.wi_0.bias is not None:
+                module.wi_0.bias.data.zero_()
+            module.wi_1.weight.data.normal_(
+                mean=0.0, std=factor * ((self.config.d_model)**-0.5))
+            if hasattr(module.wi_1, 'bias') and module.wi_1.bias is not None:
+                module.wi_1.bias.data.zero_()
+            module.wo.weight.data.normal_(
+                mean=0.0, std=factor * ((self.config.d_ff)**-0.5))
+            if hasattr(module.wo, 'bias') and module.wo.bias is not None:
+                module.wo.bias.data.zero_()
+        elif isinstance(module, T5Attention):
+            # Mesh TensorFlow attention initialization to avoid scaling before
+            # softmax See
+            # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
+            d_model = self.config.d_model
+            key_value_proj_dim = self.config.d_kv
+            n_heads = self.config.num_heads
+            module.q.weight.data.normal_(
+                mean=0.0, std=factor * ((d_model * key_value_proj_dim)**-0.5))
+            module.k.weight.data.normal_(
+                mean=0.0, std=factor * (d_model**-0.5))
+            module.v.weight.data.normal_(
+                mean=0.0, std=factor * (d_model**-0.5))
+            module.o.weight.data.normal_(
+                mean=0.0, std=factor * ((n_heads * key_value_proj_dim)**-0.5))
+            if module.has_relative_attention_bias:
+                module.relative_attention_bias.weight.data.normal_(
+                    mean=0.0, std=factor * ((d_model)**-0.5))
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (T5Attention, T5Stack)):
+            module.gradient_checkpointing = value
+
+    def _shift_right(self, input_ids):
+        decoder_start_token_id = self.config.decoder_start_token_id
+        pad_token_id = self.config.pad_token_id
+
+        assert (
+            decoder_start_token_id is not None
+        ), 'self.model.config.decoder_start_token_id has to be defined.'
+
+        # shift inputs to the right
+        if is_torch_fx_proxy(input_ids):
+            # Item assignment is not supported natively for proxies.
+            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1, ),
+                                           decoder_start_token_id)
+            shifted_input_ids = torch.cat(
+                [shifted_input_ids, input_ids[..., :-1]], dim=-1)
+        else:
+            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
+            shifted_input_ids[..., 0] = decoder_start_token_id
+
+        assert pad_token_id is not None, 'self.model.config.pad_token_id has to be defined.'
+        # replace possible -100 values in labels by `pad_token_id`
+        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+        assert torch.all(shifted_input_ids >= 0).item(
+        ), 'Verify that `shifted_input_ids` has only positive values'
+
+        return shifted_input_ids
+
+
+class T5Stack(T5PreTrainedModel):
+
+    def __init__(self, config, embed_tokens=None):
+        super().__init__(config)
+
+        self.embed_tokens = embed_tokens
+        self.is_decoder = config.is_decoder
+
+        self.block = nn.ModuleList([
+            T5Block(config, has_relative_attention_bias=bool(i == 0))
+            for i in range(config.num_layers)
+        ])
+        self.final_layer_norm = T5LayerNorm(
+            config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+        self.gradient_checkpointing = False
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def parallelize(self, device_map=None):
+        # Check validity of device_map
+        self.device_map = (
+            get_device_map(len(self.block), range(torch.cuda.device_count()))
+            if device_map is None else device_map)
+        assert_device_map(self.device_map, len(self.block))
+        self.model_parallel = True
+        self.first_device = 'cpu' if 'cpu' in self.device_map.keys(
+        ) else 'cuda:' + str(min(self.device_map.keys()))
+        self.last_device = 'cuda:' + str(max(self.device_map.keys()))
+        # Load onto devices
+        for k, v in self.device_map.items():
+            for layer in v:
+                cuda_device = 'cuda:' + str(k)
+                self.block[layer] = self.block[layer].to(cuda_device)
+
+        # Set embed_tokens to first layer
+        self.embed_tokens = self.embed_tokens.to(self.first_device)
+        # Set final layer norm to last device
+        self.final_layer_norm = self.final_layer_norm.to(self.last_device)
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def deparallelize(self):
+        self.model_parallel = False
+        self.device_map = None
+        self.first_device = 'cpu'
+        self.last_device = 'cpu'
+        for i in range(len(self.block)):
+            self.block[i] = self.block[i].to('cpu')
+        self.embed_tokens = self.embed_tokens.to('cpu')
+        self.final_layer_norm = self.final_layer_norm.to('cpu')
+        torch.cuda.empty_cache()
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, new_embeddings):
+        self.embed_tokens = new_embeddings
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        inputs_embeds=None,
+        head_mask=None,
+        cross_attn_head_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        # Model parallel
+        if self.model_parallel:
+            torch.cuda.set_device(self.first_device)
+            self.embed_tokens = self.embed_tokens.to(self.first_device)
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else
+            self.config.output_hidden_states)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            err_msg_prefix = 'decoder_' if self.is_decoder else ''
+            raise ValueError(
+                f'You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time'
+            )
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            err_msg_prefix = 'decoder_' if self.is_decoder else ''
+            raise ValueError(
+                f'You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds'
+            )
+
+        if inputs_embeds is None:
+            assert self.embed_tokens is not None, 'You have to initialize the model with valid token embeddings'
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        batch_size, seq_length = input_shape
+
+        # required mask seq length can be calculated via length of past
+        mask_seq_length = past_key_values[0][0].shape[
+            2] + seq_length if past_key_values is not None else seq_length
+
+        if use_cache is True:
+            assert self.is_decoder, f'`use_cache` can only be set to `True` if {self} is used as a decoder'
+
+        if attention_mask is None:
+            attention_mask = torch.ones(batch_size, mask_seq_length).to(
+                inputs_embeds.device)
+        if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None:
+            encoder_seq_length = encoder_hidden_states.shape[1]
+            encoder_attention_mask = torch.ones(
+                batch_size,
+                encoder_seq_length,
+                device=inputs_embeds.device,
+                dtype=torch.long)
+
+        # initialize past_key_values with `None` if past does not exist
+        if past_key_values is None:
+            past_key_values = [None] * len(self.block)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask = self.get_extended_attention_mask(
+            attention_mask, input_shape, inputs_embeds.device)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size(
+            )
+            encoder_hidden_shape = (encoder_batch_size,
+                                    encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(
+                    encoder_hidden_shape, device=inputs_embeds.device)
+            encoder_extended_attention_mask = self.invert_attention_mask(
+                encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
+        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask,
+                                                  self.config.num_layers)
+        present_key_value_states = () if use_cache else None
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions
+                                      and self.is_decoder) else None
+        position_bias = None
+        encoder_decoder_position_bias = None
+
+        hidden_states = self.dropout(inputs_embeds)
+
+        for i, (layer_module,
+                past_key_value) in enumerate(zip(self.block, past_key_values)):
+            layer_head_mask = head_mask[i]
+            cross_attn_layer_head_mask = cross_attn_head_mask[i]
+            # Model parallel
+            if self.model_parallel:
+                torch.cuda.set_device(hidden_states.device)
+                # Ensure that attention_mask is always on the same device as hidden_states
+                if attention_mask is not None:
+                    attention_mask = attention_mask.to(hidden_states.device)
+                if position_bias is not None:
+                    position_bias = position_bias.to(hidden_states.device)
+                if encoder_hidden_states is not None:
+                    encoder_hidden_states = encoder_hidden_states.to(
+                        hidden_states.device)
+                if encoder_extended_attention_mask is not None:
+                    encoder_extended_attention_mask = encoder_extended_attention_mask.to(
+                        hidden_states.device)
+                if encoder_decoder_position_bias is not None:
+                    encoder_decoder_position_bias = encoder_decoder_position_bias.to(
+                        hidden_states.device)
+                if layer_head_mask is not None:
+                    layer_head_mask = layer_head_mask.to(hidden_states.device)
+                if cross_attn_layer_head_mask is not None:
+                    cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(
+                        hidden_states.device)
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states, )
+
+            if self.gradient_checkpointing and self.training:
+                if use_cache:
+                    logger.warning(
+                        '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
+                    )
+                    use_cache = False
+
+                def create_custom_forward(module):
+
+                    def custom_forward(*inputs):
+                        return tuple(
+                            module(*inputs, use_cache, output_attentions))
+
+                    return custom_forward
+
+                layer_outputs = checkpoint(
+                    create_custom_forward(layer_module),
+                    hidden_states,
+                    extended_attention_mask,
+                    position_bias,
+                    encoder_hidden_states,
+                    encoder_extended_attention_mask,
+                    encoder_decoder_position_bias,
+                    layer_head_mask,
+                    cross_attn_layer_head_mask,
+                    None,  # past_key_value is always None with gradient checkpointing
+                )
+            else:
+                layer_outputs = layer_module(
+                    hidden_states,
+                    attention_mask=extended_attention_mask,
+                    position_bias=position_bias,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_extended_attention_mask,
+                    encoder_decoder_position_bias=encoder_decoder_position_bias,
+                    layer_head_mask=layer_head_mask,
+                    cross_attn_layer_head_mask=cross_attn_layer_head_mask,
+                    past_key_value=past_key_value,
+                    use_cache=use_cache,
+                    output_attentions=output_attentions,
+                )
+
+            # layer_outputs is a tuple with: hidden-states, key-value-states,
+            # (self-attention position bias), (self-attention weights),
+            # (cross-attention position bias), (cross-attention weights)
+            if use_cache is False:
+                layer_outputs = layer_outputs[:1] + (
+                    None, ) + layer_outputs[1:]
+
+            hidden_states, present_key_value_state = layer_outputs[:2]
+
+            # We share the position biases between the layers - the first layer
+            # store them layer_outputs = hidden-states, key-value-states
+            # (self-attention position bias), (self-attention weights),
+            # (cross-attention position bias), (cross-attention weights)
+            position_bias = layer_outputs[2]
+            if self.is_decoder and encoder_hidden_states is not None:
+                encoder_decoder_position_bias = layer_outputs[
+                    4 if output_attentions else 3]
+            # append next layer key value states
+            if use_cache:
+                present_key_value_states = present_key_value_states + (
+                    present_key_value_state, )
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[3], )
+                if self.is_decoder:
+                    all_cross_attentions = all_cross_attentions + (
+                        layer_outputs[5], )
+
+            # Model Parallel: If it's the last layer for that device, put things on the next device
+            if self.model_parallel:
+                for k, v in self.device_map.items():
+                    if i == v[-1] and 'cuda:' + str(k) != self.last_device:
+                        hidden_states = hidden_states.to('cuda:' + str(k + 1))
+
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        # Add last layer
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states, )
+
+        if not return_dict:
+            return tuple(v for v in [
+                hidden_states,
+                present_key_value_states,
+                all_hidden_states,
+                all_attentions,
+                all_cross_attentions,
+            ] if v is not None)
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=present_key_value_states,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+T5_START_DOCSTRING = r"""
+
+    The T5 model was proposed in [Exploring the Limits of Transfer Learning with
+    a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by
+    Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang,
+    Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder
+    transformer pre-trained in a text-to-text denoising generative setting.
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass
+    documentation for the generic methods the library implements for all its
+    model (such as downloading or saving, resizing the input embeddings, pruning
+    heads etc.)
+
+    This model is also a PyTorch
+    [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch
+    documentation for all matters related to general usage and behavior.
+
+    Parameters:
+        config ([`T5Config`]): Model configuration class with all the parameters
+        of the model.
+            Initializing with a config file does not load the weights associated
+            with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model
+            weights.
+"""
+
+T5_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. T5 is a model
+            with relative position embeddings so you should be able to pad the
+            inputs on both the right and the left.
+
+            Indices can be obtained using [`T5Tokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
+            for detail.
+
+            [What are input IDs?](../glossary#input-ids)
+
+            To know more on how to prepare `input_ids` for pretraining take a
+            look at [T5 Training](./t5#training).
+        attention_mask (`torch.FloatTensor` of shape `(batch_size,
+        sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask
+            values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size,
+        target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`T5Tokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
+            for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            T5 uses the `pad_token_id` as the starting token for
+            `decoder_input_ids` generation. If `past_key_values` is used,
+            optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            To know more on how to prepare `decoder_input_ids` for pretraining
+            take a look at [T5 Training](./t5#training).
+        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,
+        target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in
+            `decoder_input_ids`. Causal mask will also be used by default.
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers,
+        num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules in the
+            encoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or
+        `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules in the
+            decoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or
+        `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules in
+            the decoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+            Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*,
+            `optional`: *attentions*) `last_hidden_state` of shape `(batch_size,
+            sequence_length, hidden_size)` is a sequence of hidden states at the
+            output of the last layer of the encoder. Used in the cross-attention
+            of the decoder.
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length
+        `config.n_layers` with each tuple having 4 tensors of shape
+        `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention
+            blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only
+            the last `decoder_input_ids` (those that don't have their past key
+            value states given to this model) of shape `(batch_size, 1)` instead
+            of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size,
+        sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to
+            directly pass an embedded representation. This is useful if you want
+            more control over how to convert `input_ids` indices into associated
+            vectors than the model's internal embedding lookup matrix.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size,
+        target_sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `decoder_input_ids` you can choose to
+            directly pass an embedded representation. If `past_key_values` is
+            used, optionally only the last `decoder_inputs_embeds` have to be
+            input (see `past_key_values`). This is useful if you want more
+            control over how to convert `decoder_input_ids` indices into
+            associated vectors than the model's internal embedding lookup
+            matrix.
+
+            If `decoder_input_ids` and `decoder_inputs_embeds` are both unset,
+            `decoder_inputs_embeds` takes the value of `inputs_embeds`.
+
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned
+            and can be used to speed up decoding (see `past_key_values`).
+
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention
+            layers. See `attentions` under returned tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See
+            `hidden_states` under returned tensors for more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain
+            tuple.
+"""
+
+T5_ENCODER_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. T5 is a model
+            with relative position embeddings so you should be able to pad the
+            inputs on both the right and the left.
+
+            Indices can be obtained using [`T5Tokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
+            for detail.
+
+            To know more on how to prepare `input_ids` for pretraining take a
+            look at [T5 Training](./t5#training).
+        attention_mask (`torch.FloatTensor` of shape `(batch_size,
+        sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask
+            values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers,
+        num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask
+            values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size,
+        sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to
+            directly pass an embedded representation. This is useful if you want
+            more control over how to convert `input_ids` indices into associated
+            vectors than the model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention
+            layers. See `attentions` under returned tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See
+            `hidden_states` under returned tensors for more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain
+            tuple.
+"""
+
+# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
+__HEAD_MASK_WARNING_MSG = """
+The input argument `head_mask` was split into two arguments `head_mask` and
+`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`,
+but this feature is deprecated and will be removed in future versions. If you do
+not want to use any `decoder_head_mask` now, please set `decoder_head_mask =
+torch.ones(num_layers, num_heads)`.
+"""
+
+
+@add_start_docstrings(
+    'The bare T5 Model transformer outputting raw hidden-states without any specific head on top.',
+    T5_START_DOCSTRING,
+)
+class T5Model(T5PreTrainedModel):
+    _keys_to_ignore_on_load_missing = [
+        r'encoder\.embed_tokens\.weight',
+        r'decoder\.embed_tokens\.weight',
+    ]
+    _keys_to_ignore_on_load_unexpected = [
+        r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight',
+    ]
+
+    def __init__(self, config: T5Config):
+        super().__init__(config)
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)
+
+        encoder_config = copy.deepcopy(config)
+        encoder_config.is_decoder = False
+        encoder_config.use_cache = False
+        encoder_config.is_encoder_decoder = False
+        self.encoder = T5Stack(encoder_config, self.shared)
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        decoder_config.is_encoder_decoder = False
+        decoder_config.num_layers = config.num_decoder_layers
+        self.decoder = T5Stack(decoder_config, self.shared)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def parallelize(self, device_map=None):
+        self.device_map = (
+            get_device_map(
+                len(self.encoder.block), range(torch.cuda.device_count()))
+            if device_map is None else device_map)
+        assert_device_map(self.device_map, len(self.encoder.block))
+        self.encoder.parallelize(self.device_map)
+        self.decoder.parallelize(self.device_map)
+        self.model_parallel = True
+
+    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
+    def deparallelize(self):
+        self.encoder.deparallelize()
+        self.decoder.deparallelize()
+        self.encoder = self.encoder.to('cpu')
+        self.decoder = self.decoder.to('cpu')
+        self.model_parallel = False
+        self.device_map = None
+        torch.cuda.empty_cache()
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, new_embeddings):
+        self.shared = new_embeddings
+        self.encoder.set_input_embeddings(new_embeddings)
+        self.decoder.set_input_embeddings(new_embeddings)
+
+    def get_encoder(self):
+        return self.encoder
+
+    def get_decoder(self):
+        return self.decoder
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes encoder self-attention heads. heads_to_prune: dict of {layer_num:
+        list of heads to prune in this layer} See base class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
+    @replace_return_docstrings(
+        output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.BoolTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        decoder_head_mask: Optional[torch.FloatTensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        decoder_inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
+        r"""
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import T5Tokenizer, T5Model
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = T5Model.from_pretrained("t5-small")
+
+        >>> input_ids = tokenizer(
+        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+        ... ).input_ids  # Batch size 1
+        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
+
+        >>> # forward pass
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
+        # The message is looked up through globals() because a bare `__NAME`
+        # inside a class body is name-mangled and would raise a NameError.
+        if head_mask is not None and decoder_head_mask is None:
+            if self.config.num_layers == self.config.num_decoder_layers:
+                warnings.warn(globals()['__HEAD_MASK_WARNING_MSG'], FutureWarning)
+                decoder_head_mask = head_mask
+
+        # Encode if needed (training, first prediction pass)
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                head_mask=head_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1]
+                if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2]
+                if len(encoder_outputs) > 2 else None,
+            )
+
+        hidden_states = encoder_outputs[0]
+        # Set device for model parallelism
+        if self.model_parallel:
+            torch.cuda.set_device(self.decoder.first_device)
+            hidden_states = hidden_states.to(self.decoder.first_device)
+            if decoder_input_ids is not None:
+                decoder_input_ids = decoder_input_ids.to(
+                    self.decoder.first_device)
+            if attention_mask is not None:
+                attention_mask = attention_mask.to(self.decoder.first_device)
+            if decoder_attention_mask is not None:
+                decoder_attention_mask = decoder_attention_mask.to(
+                    self.decoder.first_device)
+
+        # Decode
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            inputs_embeds=decoder_inputs_embeds,
+            past_key_values=past_key_values,
+            encoder_hidden_states=hidden_states,
+            encoder_attention_mask=attention_mask,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        return Seq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@add_start_docstrings("""T5 Model with a `language modeling` head on top.""",
+                      T5_START_DOCSTRING)
+class T5ForConditionalGeneration(T5PreTrainedModel):
+    _keys_to_ignore_on_load_missing = [
+        r'encoder\.embed_tokens\.weight',
+        r'decoder\.embed_tokens\.weight',
+        r'lm_head\.weight',
+    ]
+    _keys_to_ignore_on_load_unexpected = [
+        r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight',
+    ]
+
+    def __init__(self, config: T5Config):
+        super().__init__(config)
+        self.model_dim = config.d_model
+
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)
+
+        encoder_config = copy.deepcopy(config)
+        encoder_config.is_decoder = False
+        encoder_config.use_cache = False
+        encoder_config.is_encoder_decoder = False
+        self.encoder = T5Stack(encoder_config, self.shared)
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        decoder_config.is_encoder_decoder = False
+        decoder_config.num_layers = config.num_decoder_layers
+        self.decoder = T5Stack(decoder_config, self.shared)
+
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def parallelize(self, device_map=None):
+        self.device_map = (
+            get_device_map(
+                len(self.encoder.block), range(torch.cuda.device_count()))
+            if device_map is None else device_map)
+        assert_device_map(self.device_map, len(self.encoder.block))
+        self.encoder.parallelize(self.device_map)
+        self.decoder.parallelize(self.device_map)
+        self.lm_head = self.lm_head.to(self.decoder.first_device)
+        self.model_parallel = True
+
+    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
+    def deparallelize(self):
+        self.encoder.deparallelize()
+        self.decoder.deparallelize()
+        self.encoder = self.encoder.to('cpu')
+        self.decoder = self.decoder.to('cpu')
+        self.lm_head = self.lm_head.to('cpu')
+        self.model_parallel = False
+        self.device_map = None
+        torch.cuda.empty_cache()
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, new_embeddings):
+        self.shared = new_embeddings
+        self.encoder.set_input_embeddings(new_embeddings)
+        self.decoder.set_input_embeddings(new_embeddings)
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def get_encoder(self):
+        return self.encoder
+
+    def get_decoder(self):
+        return self.decoder
+
+    @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
+    @replace_return_docstrings(
+        output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.BoolTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        decoder_head_mask: Optional[torch.FloatTensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the language modeling (cross-entropy) loss.
+            Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. All
+            labels set to `-100` are ignored (masked), the loss is only computed
+            for labels in `[0, ..., config.vocab_size - 1]`
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+        >>> # training
+        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
+        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
+        >>> outputs = model(input_ids=input_ids, labels=labels)
+        >>> loss = outputs.loss
+        >>> logits = outputs.logits
+
+        >>> # inference
+        >>> input_ids = tokenizer(
+        ...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
+        ... ).input_ids  # Batch size 1
+        >>> outputs = model.generate(input_ids)
+        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+        >>> # studies have shown that owning a dog is good for you.
+        ```"""
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
+        # (message read via globals(): a bare `__NAME` is name-mangled inside a class body)
+        if head_mask is not None and decoder_head_mask is None:
+            if self.config.num_layers == self.config.num_decoder_layers:
+                warnings.warn(globals()['__HEAD_MASK_WARNING_MSG'], FutureWarning)
+                decoder_head_mask = head_mask
+
+        # Encode if needed (training, first prediction pass)
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                head_mask=head_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1]
+                if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2]
+                if len(encoder_outputs) > 2 else None,
+            )
+
+        hidden_states = encoder_outputs[0]
+
+        if self.model_parallel:
+            torch.cuda.set_device(self.decoder.first_device)
+
+        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
+            # get decoder inputs from shifting lm labels to the right
+            decoder_input_ids = self._shift_right(labels)
+
+        # Set device for model parallelism
+        if self.model_parallel:
+            torch.cuda.set_device(self.decoder.first_device)
+            hidden_states = hidden_states.to(self.decoder.first_device)
+            if decoder_input_ids is not None:
+                decoder_input_ids = decoder_input_ids.to(
+                    self.decoder.first_device)
+            if attention_mask is not None:
+                attention_mask = attention_mask.to(self.decoder.first_device)
+            if decoder_attention_mask is not None:
+                decoder_attention_mask = decoder_attention_mask.to(
+                    self.decoder.first_device)
+
+        # Decode
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            inputs_embeds=decoder_inputs_embeds,
+            past_key_values=past_key_values,
+            encoder_hidden_states=hidden_states,
+            encoder_attention_mask=attention_mask,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = decoder_outputs[0]
+
+        # Set device for model parallelism
+        if self.model_parallel:
+            torch.cuda.set_device(self.encoder.first_device)
+            self.lm_head = self.lm_head.to(self.encoder.first_device)
+            sequence_output = sequence_output.to(self.lm_head.weight.device)
+
+        if self.config.tie_word_embeddings:
+            # Rescale output before projecting on vocab See
+            # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
+            sequence_output = sequence_output * (self.model_dim**-0.5)
+
+        lm_logits = self.lm_head(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss(ignore_index=-100)
+            loss = loss_fct(
+                lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
+            # TODO(thom): Add z_loss
+            # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
+
+        if not return_dict:
+            output = (lm_logits, ) + decoder_outputs[1:] + encoder_outputs
+            return ((loss, ) + output) if loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(self,
+                                      input_ids,
+                                      past=None,
+                                      attention_mask=None,
+                                      head_mask=None,
+                                      decoder_head_mask=None,
+                                      cross_attn_head_mask=None,
+                                      use_cache=None,
+                                      encoder_outputs=None,
+                                      **kwargs):
+
+        # cut decoder_input_ids if past is used
+        if past is not None:
+            input_ids = input_ids[:, -1:]
+
+        return {
+            'decoder_input_ids': input_ids,
+            'past_key_values': past,
+            'encoder_outputs': encoder_outputs,
+            'attention_mask': attention_mask,
+            'head_mask': head_mask,
+            'decoder_head_mask': decoder_head_mask,
+            'cross_attn_head_mask': cross_attn_head_mask,
+            'use_cache': use_cache,
+        }
+
+    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+        return self._shift_right(labels)
+
+    def _reorder_cache(self, past, beam_idx):
+        # if decoder past is not included in output
+        # speedy decoding is disabled and no need to reorder
+        if past is None:
+            logger.warning(
+                'You might want to consider setting `use_cache=True` to speed up decoding'
+            )
+            return past
+
+        reordered_decoder_past = ()
+        for layer_past_states in past:
+            # get the correct batch idx from layer past batch dim
+            # batch dim of `past` is at 2nd position
+            reordered_layer_past_states = ()
+            for layer_past_state in layer_past_states:
+                # need to set correct `past` for each of the four key / value states
+                reordered_layer_past_states = reordered_layer_past_states + (
+                    layer_past_state.index_select(
+                        0, beam_idx.to(layer_past_state.device)), )
+
+            assert reordered_layer_past_states[0].shape == layer_past_states[
+                0].shape
+            assert len(reordered_layer_past_states) == len(layer_past_states)
+
+            reordered_decoder_past = reordered_decoder_past + (
+                reordered_layer_past_states, )
+        return reordered_decoder_past
+
+
+@add_start_docstrings(
+    "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.",
+    T5_START_DOCSTRING,
+)
+class T5EncoderModel(T5PreTrainedModel):
+    _keys_to_ignore_on_load_missing = authorized_missing_keys = [
+        r'encoder\.embed_tokens\.weight',
+    ]
+
+    def __init__(self, config: T5Config):
+        super().__init__(config)
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)
+
+        encoder_config = copy.deepcopy(config)
+        encoder_config.use_cache = False
+        encoder_config.is_encoder_decoder = False
+        self.encoder = T5Stack(encoder_config, self.shared)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def parallelize(self, device_map=None):
+        self.device_map = (
+            get_device_map(
+                len(self.encoder.block), range(torch.cuda.device_count()))
+            if device_map is None else device_map)
+        assert_device_map(self.device_map, len(self.encoder.block))
+        self.encoder.parallelize(self.device_map)
+        self.model_parallel = True
+
+    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
+    def deparallelize(self):
+        self.encoder.deparallelize()
+        self.encoder = self.encoder.to('cpu')
+        self.model_parallel = False
+        self.device_map = None
+        torch.cuda.empty_cache()
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, new_embeddings):
+        self.shared = new_embeddings
+        self.encoder.set_input_embeddings(new_embeddings)
+
+    def get_encoder(self):
+        return self.encoder
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes encoder self-attention heads. heads_to_prune: dict of {layer_num:
+        list of heads to prune in this layer} See base class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(T5_ENCODER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(
+        output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
+        r"""
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import T5Tokenizer, T5EncoderModel
+
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = T5EncoderModel.from_pretrained("t5-small")
+        >>> input_ids = tokenizer(
+        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+        >>> ).input_ids  # Batch size 1
+        >>> outputs = model(input_ids=input_ids)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_outputs = self.encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        return encoder_outputs
diff --git a/modelscope/models/nlp/T5/t5_for_text_generation.py b/modelscope/models/nlp/T5/t5_for_text_generation.py
new file mode 100644
index 00000000..27f077d8
--- /dev/null
+++ b/modelscope/models/nlp/T5/t5_for_text_generation.py
@@ -0,0 +1,56 @@
+from typing import Optional, Tuple
+
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+from .modeling_t5 import T5Config
+from .modeling_t5 import T5ForConditionalGeneration as T5ForGeneration
+
+
+@MODELS.register_module(
+    group_key=Tasks.text2text_generation,
+    module_name=Models.T5,
+)
+class T5ForConditionalGeneration(TorchModel):
+
+    def __init__(self, model_dir=None, *args, **kwargs):
+        """Load the T5 generation model stored under `model_dir`.
+
+        Args:
+            model_dir (str): the model path, handed to `from_pretrained`.
+            *args, **kwargs: passed through to the parent constructor.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        self.model = T5ForGeneration.from_pretrained(model_dir)
+        self.generate = self.model.generate
+        self.config = self.model.config
+
+    def forward(self,
+                input_ids: Optional[torch.LongTensor] = None,
+                attention_mask: Optional[torch.FloatTensor] = None,
+                decoder_input_ids: Optional[torch.LongTensor] = None,
+                decoder_attention_mask: Optional[torch.BoolTensor] = None,
+                head_mask: Optional[torch.FloatTensor] = None,
+                decoder_head_mask: Optional[torch.FloatTensor] = None,
+                cross_attn_head_mask: Optional[torch.Tensor] = None,
+                encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+                past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+                inputs_embeds: Optional[torch.FloatTensor] = None,
+                decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+                labels: Optional[torch.LongTensor] = None,
+                use_cache: Optional[bool] = None,
+                output_attentions: Optional[bool] = None,
+                output_hidden_states: Optional[bool] = None,
+                return_dict: Optional[bool] = None,
+                **kwargs):
+        """Run the wrapped model; arguments go to its bound `forward` as-is."""
+        return self.model.forward(
+            input_ids, attention_mask, decoder_input_ids,
+            decoder_attention_mask, head_mask, decoder_head_mask,
+            cross_attn_head_mask, encoder_outputs, past_key_values,
+            inputs_embeds, decoder_inputs_embeds, labels, use_cache,
+            output_attentions, output_hidden_states, return_dict, **kwargs)
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index 3fd76f98..8ef96365 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -5,45 +5,76 @@ from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
     from .backbones import SbertModel
-    from .heads import SequenceClassificationHead
-    from .bert_for_sequence_classification import BertForSequenceClassification
+    from .bart_for_text_error_correction import BartForTextErrorCorrection
+    from .bert_for_document_segmentation import BertForDocumentSegmentation
     from .csanmt_for_translation import CsanmtForTranslation
+    from .heads import SequenceClassificationHead
+    from .gpt3 import GPT3ForTextGeneration
     from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM,
-                                  BertForMaskedLM)
-    from .nncrf_for_named_entity_recognition import TransformerCRFForNamedEntityRecognition
+                                  BertForMaskedLM, DebertaV2ForMaskedLM)
+    from .ponet_for_masked_language import PoNetForMaskedLM
+    from .nncrf_for_named_entity_recognition import (
+        TransformerCRFForNamedEntityRecognition,
+        LSTMCRFForNamedEntityRecognition)
     from .palm_v2 import PalmForTextGeneration
-    from .token_classification import SbertForTokenClassification
-    from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification
+    from .sbert_for_faq_question_answering import SbertForFaqQuestionAnswering
+    from .star_text_to_sql import StarForTextToSql
+    from .sequence_classification import (VecoForSequenceClassification,
+                                          SbertForSequenceClassification,
+                                          BertForSequenceClassification)
     from .space import SpaceForDialogIntent
     from .space import SpaceForDialogModeling
     from .space import SpaceForDialogStateTracking
-    from .star_text_to_sql import StarForTextToSql
-    from .task_models.task_model import SingleBackboneTaskModelBase
-    from .bart_for_text_error_correction import BartForTextErrorCorrection
-    from .gpt3 import GPT3ForTextGeneration
-
+    from .table_question_answering import TableQuestionAnswering
+    from .task_models import (FeatureExtractionModel,
+                              InformationExtractionModel,
+                              SequenceClassificationModel,
+                              SingleBackboneTaskModelBase,
+                              TokenClassificationModel)
+    from .token_classification import SbertForTokenClassification
+    from .sentence_embedding import SentenceEmbedding
+    from .passage_ranking import PassageRanking
+    from .T5 import T5ForConditionalGeneration
 else:
     _import_structure = {
-        'star_text_to_sql': ['StarForTextToSql'],
         'backbones': ['SbertModel'],
-        'heads': ['SequenceClassificationHead'],
+        'bart_for_text_error_correction': ['BartForTextErrorCorrection'],
+        'bert_for_document_segmentation': ['BertForDocumentSegmentation'],
         'csanmt_for_translation': ['CsanmtForTranslation'],
-        'bert_for_sequence_classification': ['BertForSequenceClassification'],
-        'masked_language':
-        ['StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM'],
-        'nncrf_for_named_entity_recognition':
-        ['TransformerCRFForNamedEntityRecognition'],
+        'heads': ['SequenceClassificationHead'],
+        'gpt3': ['GPT3ForTextGeneration'],
+        'masked_language': [
+            'StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM',
+            'DebertaV2ForMaskedLM'
+        ],
+        'nncrf_for_named_entity_recognition': [
+            'TransformerCRFForNamedEntityRecognition',
+            'LSTMCRFForNamedEntityRecognition'
+        ],
+        'ponet_for_masked_language': ['PoNetForMaskedLM'],
         'palm_v2': ['PalmForTextGeneration'],
-        'token_classification': ['SbertForTokenClassification'],
-        'sequence_classification':
-        ['VecoForSequenceClassification', 'SbertForSequenceClassification'],
+        'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering'],
+        'star_text_to_sql': ['StarForTextToSql'],
+        'sequence_classification': [
+            'VecoForSequenceClassification', 'SbertForSequenceClassification',
+            'BertForSequenceClassification'
+        ],
         'space': [
             'SpaceForDialogIntent', 'SpaceForDialogModeling',
             'SpaceForDialogStateTracking'
         ],
-        'task_model': ['SingleBackboneTaskModelBase'],
-        'bart_for_text_error_correction': ['BartForTextErrorCorrection'],
-        'gpt3': ['GPT3ForTextGeneration'],
+        'task_models': [
+            'FeatureExtractionModel',
+            'InformationExtractionModel',
+            'SequenceClassificationModel',
+            'SingleBackboneTaskModelBase',
+            'TokenClassificationModel',
+        ],
+        'token_classification': ['SbertForTokenClassification'],
+        'table_question_answering': ['TableQuestionAnswering'],
+        'sentence_embedding': ['SentenceEmbedding'],
+        'passage_ranking': ['PassageRanking'],
+        'T5': ['T5ForConditionalGeneration'],
     }
 
     import sys
diff --git a/modelscope/models/nlp/backbones/bert.py b/modelscope/models/nlp/backbones/bert.py
new file mode 100644
index 00000000..aa513944
--- /dev/null
+++ b/modelscope/models/nlp/backbones/bert.py
@@ -0,0 +1,10 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from modelscope.metainfo import Models
+from modelscope.models.builder import BACKBONES
+from modelscope.models.nlp.bert import BertModel
+from modelscope.utils.constant import Fields
+
+# BertModel is imported from modelscope.models.nlp.bert, so it is added to
+# the BACKBONES registry (NLP group, name `Models.bert`) by an explicit call.
+BACKBONES.register_module(
+    group_key=Fields.nlp, module_name=Models.bert, module_cls=BertModel)
diff --git a/modelscope/models/nlp/backbones/structbert.py b/modelscope/models/nlp/backbones/structbert.py
index f47900c3..74735520 100644
--- a/modelscope/models/nlp/backbones/structbert.py
+++ b/modelscope/models/nlp/backbones/structbert.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from modelscope.metainfo import Models
 from modelscope.models.base import TorchModel
 from modelscope.models.builder import BACKBONES
diff --git a/modelscope/models/nlp/bart_for_text_error_correction.py b/modelscope/models/nlp/bart_for_text_error_correction.py
index 2339f221..27abedb5 100644
--- a/modelscope/models/nlp/bart_for_text_error_correction.py
+++ b/modelscope/models/nlp/bart_for_text_error_correction.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/models/nlp/bert/__init__.py b/modelscope/models/nlp/bert/__init__.py
new file mode 100644
index 00000000..705d9519
--- /dev/null
+++ b/modelscope/models/nlp/bert/__init__.py
@@ -0,0 +1,60 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .modeling_bert import (
+        BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+        BertForMaskedLM,
+        BertForMultipleChoice,
+        BertForNextSentencePrediction,
+        BertForPreTraining,
+        BertForQuestionAnswering,
+        BertForSequenceClassification,
+        BertForTokenClassification,
+        BertLayer,
+        BertLMHeadModel,
+        BertModel,
+        BertPreTrainedModel,
+        load_tf_weights_in_bert,
+    )
+
+    from .configuration_bert import BertConfig, BertOnnxConfig
+    from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer
+    from .tokenization_bert_fast import BertTokenizerFast
+
+else:
+    # Maps each submodule to the public names it provides; the mapping is
+    # handed to LazyImportModule below. Only names a submodule defines go here.
+    _import_structure = {
+        'configuration_bert': ['BertConfig', 'BertOnnxConfig'],
+        'tokenization_bert':
+        ['BasicTokenizer', 'BertTokenizer', 'WordpieceTokenizer'],
+        'tokenization_bert_fast': ['BertTokenizerFast'],
+    }
+    _import_structure['modeling_bert'] = [
+        'BERT_PRETRAINED_MODEL_ARCHIVE_LIST',
+        'BertForMaskedLM',
+        'BertForMultipleChoice',
+        'BertForNextSentencePrediction',
+        'BertForPreTraining',
+        'BertForQuestionAnswering',
+        'BertForSequenceClassification',
+        'BertForTokenClassification',
+        'BertLayer',
+        'BertLMHeadModel',
+        'BertModel',
+        'BertPreTrainedModel',
+        'load_tf_weights_in_bert',
+    ]
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/nlp/bert/configuration_bert.py b/modelscope/models/nlp/bert/configuration_bert.py
new file mode 100644
index 00000000..2c9293ec
--- /dev/null
+++ b/modelscope/models/nlp/bert/configuration_bert.py
@@ -0,0 +1,162 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" BERT model configuration """
+from collections import OrderedDict
+from typing import Mapping
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.onnx import OnnxConfig
+
+from modelscope.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class BertConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a
+    [`BertModel`] or a [`TFBertModel`]. It is used to instantiate a BERT model
+    according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar
+    configuration to that of the BERT
+    [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to
+    control the model outputs. Read the documentation from [`PretrainedConfig`]
+    for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the BERT model. Defines the number of different
+            tokens that can be represented by the `input_ids` passed when
+            calling [`BertModel`] or [`TFBertModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the
+            Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward)
+            layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the
+            encoder and pooler. If string, `"gelu"`, `"relu"`, `"silu"` and
+            `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the
+            embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or
+            1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling
+            [`BertModel`] or [`TFBertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`,
+            `"relative_key"`, `"relative_key_query"`. For positional embeddings
+            use `"absolute"`. For more information on `"relative_key"`, please
+            refer to [Self-Attention with Relative Position Representations
+            (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more
+            information on `"relative_key_query"`, please refer to *Method 4* in
+            [Improve Transformer Models with Better Relative Position Embeddings
+            (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values
+            attentions (not used by all models). Only relevant if
+            `config.is_decoder=True`.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+
+    Examples:
+
+    ```python
+    >>> from modelscope.models.nlp.bert import BertConfig, BertModel
+    >>> # Initializing a BERT bert-base-uncased style configuration
+    >>> configuration = BertConfig()
+
+    >>> # Initializing a model from the bert-base-uncased style configuration
+    >>> model = BertModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = 'bert'
+
+    def __init__(self,
+                 vocab_size=30522,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.02,
+                 layer_norm_eps=1e-12,
+                 pad_token_id=0,
+                 position_embedding_type='absolute',
+                 use_cache=True,
+                 classifier_dropout=None,
+                 **kwargs):
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+
+
+class BertOnnxConfig(OnnxConfig):
+    """Describes BERT's model inputs for `transformers.onnx.OnnxConfig`."""
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        """Map every input tensor name to its named axes.
+
+        Returns:
+            An `OrderedDict` keyed, in order, by `input_ids`,
+            `attention_mask` and `token_type_ids`; each value labels axis 0
+            as `batch` and axis 1 as `sequence`.
+        """
+        axis_names = OrderedDict()
+        for tensor_name in ('input_ids', 'attention_mask', 'token_type_ids'):
+            # Build a separate dict per input so that the entries are not
+            # aliases of one shared object, exactly as in a literal.
+            axis_names[tensor_name] = {0: 'batch', 1: 'sequence'}
+        return axis_names
diff --git a/modelscope/models/nlp/bert/modeling_bert.py b/modelscope/models/nlp/bert/modeling_bert.py
new file mode 100755
index 00000000..f8fd5994
--- /dev/null
+++ b/modelscope/models/nlp/bert/modeling_bert.py
@@ -0,0 +1,2040 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch BERT model. """
+
+import math
+import os
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import torch
+import torch.utils.checkpoint
+from packaging import version
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.activations import ACT2FN
+from transformers.file_utils import (ModelOutput, add_start_docstrings,
+                                     add_start_docstrings_to_model_forward,
+                                     replace_return_docstrings)
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions, MaskedLMOutput,
+    MultipleChoiceModelOutput, NextSentencePredictorOutput,
+    QuestionAnsweringModelOutput, SequenceClassifierOutput,
+    TokenClassifierOutput)
+from transformers.modeling_utils import (PreTrainedModel,
+                                         apply_chunking_to_forward,
+                                         find_pruneable_heads_and_indices,
+                                         prune_linear_layer)
+
+from modelscope.models.base import TorchModel
+from modelscope.utils.logger import get_logger
+from .configuration_bert import BertConfig
+
+logger = get_logger(__name__)
+
+_CONFIG_FOR_DOC = 'BertConfig'
+
+
+def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
+    """Load tf checkpoints in a pytorch model.
+
+    Copies each variable of the checkpoint at `tf_checkpoint_path` into the
+    matching parameter of `model`; raises ValueError on a shape mismatch.
+    """
+    try:
+        import re
+
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            'Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see '
+            'https://www.tensorflow.org/install/ for installation instructions.'
+        )
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info(f'Converting TensorFlow checkpoint from {tf_path}')
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        logger.info(f'Loading TF weight {name} with shape {shape}')
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+
+    for name, array in zip(names, arrays):
+        name = name.split('/')
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
+        # which are not required for using pretrained model
+        if any(n in [
+                'adam_v', 'adam_m', 'AdamWeightDecayOptimizer',
+                'AdamWeightDecayOptimizer_1', 'global_step'
+        ] for n in name):
+            logger.info(f"Skipping {'/'.join(name)}")
+            continue
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
+                scope_names = re.split(r'_(\d+)', m_name)
+            else:
+                scope_names = [m_name]
+            if scope_names[0] == 'kernel' or scope_names[0] == 'gamma':
+                pointer = getattr(pointer, 'weight')
+            elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta':
+                pointer = getattr(pointer, 'bias')
+            elif scope_names[0] == 'output_weights':
+                pointer = getattr(pointer, 'weight')
+            elif scope_names[0] == 'squad':
+                pointer = getattr(pointer, 'classifier')
+            else:
+                try:
+                    pointer = getattr(pointer, scope_names[0])
+                except AttributeError:
+                    logger.info(f"Skipping {'/'.join(name)}")
+                    continue
+            if len(scope_names) >= 2:
+                num = int(scope_names[1])
+                pointer = pointer[num]
+        if m_name[-11:] == '_embeddings':
+            pointer = getattr(pointer, 'weight')
+        elif m_name == 'kernel':
+            array = np.transpose(array)
+        if pointer.shape != array.shape:
+            raise ValueError(
+                f'Pointer shape {pointer.shape} and array shape {array.shape} mismatched'
+            )
+        logger.info(f'Initialize PyTorch weight {name}')
+        pointer.data = torch.from_numpy(array)
+    return model
+
+
+class BertEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings.
+
+    The summed embeddings are layer-normalised and then passed through
+    dropout.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        dim = config.hidden_size
+        self.word_embeddings = nn.Embedding(
+            config.vocab_size, dim, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, dim)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, dim)
+
+        # The attribute is called `LayerNorm` rather than `layer_norm` so the
+        # parameter names match the variable names of TensorFlow checkpoints.
+        self.LayerNorm = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.position_embedding_type = getattr(
+            config, 'position_embedding_type', 'absolute')
+
+        # position_ids has shape (1, max_position_embeddings); as a
+        # persistent buffer it is exported when the module is serialized.
+        max_positions = config.max_position_embeddings
+        self.register_buffer('position_ids',
+                             torch.arange(max_positions).expand((1, -1)))
+        if version.parse(torch.__version__) > version.parse('1.6.0'):
+            # All-zero default token types; non-persistent (not serialized).
+            default_token_types = torch.zeros(
+                self.position_ids.size(), dtype=torch.long)
+            self.register_buffer(
+                'token_type_ids', default_token_types, persistent=False)
+
+    def forward(self,
+                input_ids=None,
+                token_type_ids=None,
+                position_ids=None,
+                inputs_embeds=None,
+                past_key_values_length=0):
+        """Return the normalised sum of all embeddings.
+
+        Either `input_ids` or `inputs_embeds` must be given. Missing
+        `position_ids` are sliced from the buffer, offset by
+        `past_key_values_length`; missing `token_type_ids` are all zeros.
+
+        The result has shape `(batch, seq_length, hidden_size)`.
+        """
+        if input_ids is None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            input_shape = input_ids.size()
+        batch_size, seq_length = input_shape[0], input_shape[1]
+
+        if position_ids is None:
+            start = past_key_values_length
+            position_ids = self.position_ids[:, start:start + seq_length]
+
+        if token_type_ids is None and hasattr(self, 'token_type_ids'):
+            # Fall back to the zero buffer registered in the constructor, so
+            # the model can be traced without token_type_ids being passed
+            # (solves issue #5664).
+            token_type_ids = self.token_type_ids[:, :seq_length].expand(
+                batch_size, seq_length)
+        elif token_type_ids is None:
+            # No buffer was registered (older torch): build the zeros here.
+            token_type_ids = torch.zeros(
+                input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        embeddings = inputs_embeds + self.token_type_embeddings(token_type_ids)
+
+        # Relative schemes inject position information inside the attention
+        # layers instead, so only the absolute scheme adds it here.
+        if self.position_embedding_type == 'absolute':
+            embeddings += self.position_embeddings(position_ids)
+        return self.dropout(self.LayerNorm(embeddings))
+
+
+class BertSelfAttention(nn.Module):
+    """Multi-head scaled dot-product attention (self- or cross-attention).
+
+    Besides absolute positions, the `relative_key` and `relative_key_query`
+    position embedding types add learned distance terms to the scores. When
+    `config.is_decoder` is set, key/value states are returned for caching.
+    """
+
+    def __init__(self, config, position_embedding_type=None):
+        super().__init__()
+        hidden_size = config.hidden_size
+        num_heads = config.num_attention_heads
+        if hidden_size % num_heads != 0 and not hasattr(
+                config, 'embedding_size'):
+            raise ValueError(
+                f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention '
+                f'heads ({config.num_attention_heads})')
+
+        self.num_attention_heads = num_heads
+        self.attention_head_size = int(hidden_size / num_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(hidden_size, self.all_head_size)
+        self.key = nn.Linear(hidden_size, self.all_head_size)
+        self.value = nn.Linear(hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = position_embedding_type or getattr(
+            config, 'position_embedding_type', 'absolute')
+        if self.position_embedding_type in ('relative_key',
+                                            'relative_key_query'):
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(
+                2 * config.max_position_embeddings - 1,
+                self.attention_head_size)
+
+        self.is_decoder = config.is_decoder
+
+    def transpose_for_scores(self, x):
+        """Split the last axis into heads: (b, s, all) -> (b, heads, s, d)."""
+        split_shape = x.size()[:-1] + (self.num_attention_heads,
+                                       self.attention_head_size)
+        return x.view(*split_shape).permute(0, 2, 1, 3)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        """Attend from `hidden_states` to itself or to the encoder output.
+
+        Returns:
+            A tuple `(context_layer,)`, followed by the attention
+            probabilities if `output_attentions` is true, followed by the
+            `(key_layer, value_layer)` pair if the module is a decoder.
+        """
+        mixed_query_layer = self.query(hidden_states)
+
+        # With encoder states this is cross-attention: keys and values come
+        # from the encoder, and the encoder mask is applied so that the
+        # encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+        if is_cross_attention:
+            attention_mask = encoder_attention_mask
+
+        if is_cross_attention and past_key_value is not None:
+            # Cached cross-attention keys/values are reused unchanged.
+            key_layer, value_layer = past_key_value[0], past_key_value[1]
+        else:
+            kv_input = (
+                encoder_hidden_states if is_cross_attention else hidden_states)
+            key_layer = self.transpose_for_scores(self.key(kv_input))
+            value_layer = self.transpose_for_scores(self.value(kv_input))
+            if past_key_value is not None:
+                # Only self-attention gets here with a cache: prepend the
+                # cached states along the sequence axis (dim=2).
+                key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
+                value_layer = torch.cat([past_key_value[1], value_layer],
+                                        dim=2)
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        if self.is_decoder:
+            # Hand the states back for reuse in later calls: the encoder
+            # keys/values for cross-attention (first branch above), or the
+            # cached-plus-current keys/values for uni-directional
+            # self-attention. An encoder never receives `past_key_value`.
+            past_key_value = (key_layer, value_layer)
+
+        # Raw attention scores: dot product between "query" and "key".
+        attention_scores = torch.matmul(query_layer,
+                                        key_layer.transpose(-1, -2))
+
+        if self.position_embedding_type in ('relative_key',
+                                            'relative_key_query'):
+            seq_length = hidden_states.size()[1]
+            positions = torch.arange(
+                seq_length, dtype=torch.long, device=hidden_states.device)
+            # distance[l, r] = l - r; the shift below maps it to an index.
+            distance = positions.view(-1, 1) - positions.view(1, -1)
+            positional_embedding = self.distance_embedding(
+                distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(
+                dtype=query_layer.dtype)  # fp16 compatibility
+
+            # The query-side term is shared by both relative variants.
+            attention_scores = attention_scores + torch.einsum(
+                'bhld,lrd->bhlr', query_layer, positional_embedding)
+            if self.position_embedding_type == 'relative_key_query':
+                attention_scores = attention_scores + torch.einsum(
+                    'bhrd,lrd->bhlr', key_layer, positional_embedding)
+
+        attention_scores = attention_scores / math.sqrt(
+            self.attention_head_size)
+        if attention_mask is not None:
+            # Additive mask, precomputed for all layers in BertModel.forward().
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        # Merge the heads back: (b, heads, s, d) -> (b, s, all_head_size).
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        merged_shape = context_layer.size()[:-2] + (self.all_head_size, )
+        context_layer = context_layer.view(*merged_shape)
+
+        outputs = (context_layer, )
+        if output_attentions:
+            outputs = outputs + (attention_probs, )
+        if self.is_decoder:
+            outputs = outputs + (past_key_value, )
+        return outputs
+
+
+class BertSelfOutput(nn.Module):
+    """Dense projection and dropout, then residual add and LayerNorm."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        self.dense = nn.Linear(width, width)
+        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        projected = self.dropout(self.dense(hidden_states))
+        # Residual connection: the block input is added before LayerNorm.
+        return self.LayerNorm(projected + input_tensor)
+
+
+class BertAttention(nn.Module):
+    """Attention sub-layer: `BertSelfAttention` plus `BertSelfOutput`."""
+
+    def __init__(self, config, position_embedding_type=None):
+        super().__init__()
+        self.self = BertSelfAttention(
+            config, position_embedding_type=position_embedding_type)
+        self.output = BertSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        """Prune `heads` from the projections and record them as pruned."""
+        if not len(heads):
+            return
+        att = self.self
+        heads, index = find_pruneable_heads_and_indices(
+            heads, att.num_attention_heads, att.attention_head_size,
+            self.pruned_heads)
+
+        # Prune linear layers
+        for proj_name in ('query', 'key', 'value'):
+            pruned = prune_linear_layer(getattr(att, proj_name), index)
+            setattr(att, proj_name, pruned)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        att.num_attention_heads -= len(heads)
+        att.all_head_size = att.attention_head_size * att.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        """Run attention, then project with a residual on `hidden_states`.
+
+        Returns the projected output followed by whatever extra items
+        (attention probabilities, cached states) `self.self` returned.
+        """
+        self_outputs = self.self(
+            hidden_states, attention_mask, head_mask, encoder_hidden_states,
+            encoder_attention_mask, past_key_value, output_attentions)
+        attention_output = self.output(self_outputs[0], hidden_states)
+        # add attentions if we output them
+        return (attention_output, ) + self_outputs[1:]
+
+
+class BertIntermediate(nn.Module):
+    """Feed-forward expansion: dense layer followed by an activation."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        # A string names an entry of ACT2FN; anything else is used directly.
+        act = config.hidden_act
+        is_name = isinstance(act, str)
+        self.intermediate_act_fn = ACT2FN[act] if is_name else act
+
+    def forward(self, hidden_states):
+        """Return `intermediate_act_fn(dense(hidden_states))`."""
+        return self.intermediate_act_fn(self.dense(hidden_states))
+
+
+class BertOutput(nn.Module):
+    """Projects intermediate states back to hidden size with a residual."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        norm_eps = config.layer_norm_eps
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        """Return LayerNorm(dropout(dense(hidden_states)) + input_tensor)."""
+        residual_sum = self.dropout(self.dense(hidden_states)) + input_tensor
+        return self.LayerNorm(residual_sum)
+
+
+class BertLayer(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = BertAttention(config)
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            if not self.is_decoder:
+                raise ValueError(
+                    f'{self} should be used as a decoder model if cross attention is added'
+                )
+            self.crossattention = BertAttention(
+                config, position_embedding_type='absolute')
+        self.intermediate = BertIntermediate(config)
+        self.output = BertOutput(config)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:
+                                                  2] if past_key_value is not None else None
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            past_key_value=self_attn_past_key_value,
+        )
+        attention_output = self_attention_outputs[0]
+
+        # if decoder, the last output is tuple of self-attn cache
+        if self.is_decoder:
+            outputs = self_attention_outputs[1:-1]
+            present_key_value = self_attention_outputs[-1]
+        else:
+            outputs = self_attention_outputs[
+                1:]  # add self attentions if we output attention weights
+
+        cross_attn_present_key_value = None
+        if self.is_decoder and encoder_hidden_states is not None:
+            if not hasattr(self, 'crossattention'):
+                raise ValueError(
+                    f'If `encoder_hidden_states` are passed, {self} has to be instantiated '
+                    f'with cross-attention layers by setting `config.add_cross_attention=True`'
+                )
+
+            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
+            cross_attn_past_key_value = past_key_value[
+                -2:] if past_key_value is not None else None
+            cross_attention_outputs = self.crossattention(
+                attention_output,
+                attention_mask,
+                head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                cross_attn_past_key_value,
+                output_attentions,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[
+                1:-1]  # add cross attentions if we output attention weights
+
+            # add cross-attn cache to positions 3,4 of present_key_value tuple
+            cross_attn_present_key_value = cross_attention_outputs[-1]
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        layer_output = apply_chunking_to_forward(self.feed_forward_chunk,
+                                                 self.chunk_size_feed_forward,
+                                                 self.seq_len_dim,
+                                                 attention_output)
+        outputs = (layer_output, ) + outputs
+
+        # if decoder, return the attn key/values as the last output
+        if self.is_decoder:
+            outputs = outputs + (present_key_value, )
+
+        return outputs
+
+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+
+class BertEncoder(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList(
+            [BertLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = (
+        ) if output_attentions and self.config.add_cross_attention else None
+
+        next_decoder_cache = () if use_cache else None
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states, )
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+            past_key_value = past_key_values[
+                i] if past_key_values is not None else None
+
+            if self.gradient_checkpointing and self.training:
+
+                if use_cache:
+                    logger.warning(
+                        '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
+                    )
+                    use_cache = False
+
+                def create_custom_forward(module):
+
+                    def custom_forward(*inputs):
+                        return module(*inputs, past_key_value,
+                                      output_attentions)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(layer_module),
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                )
+            else:
+                layer_outputs = layer_module(
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    past_key_value,
+                    output_attentions,
+                )
+
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1], )
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (
+                    layer_outputs[1], )
+                if self.config.add_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (
+                        layer_outputs[2], )
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states, )
+
+        if not return_dict:
+            return tuple(v for v in [
+                hidden_states,
+                next_decoder_cache,
+                all_hidden_states,
+                all_self_attentions,
+                all_cross_attentions,
+            ] if v is not None)
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+class BertPooler(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states):
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+class BertPredictionHeadTransform(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        if isinstance(config.hidden_act, str):
+            self.transform_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.transform_act_fn = config.hidden_act
+        self.LayerNorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.transform_act_fn(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+
+
+class BertLMPredictionHead(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.transform = BertPredictionHeadTransform(config)
+
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder = nn.Linear(
+            config.hidden_size, config.vocab_size, bias=False)
+
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        hidden_states = self.transform(hidden_states)
+        hidden_states = self.decoder(hidden_states)
+        return hidden_states
+
+
+class BertOnlyMLMHead(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = BertLMPredictionHead(config)
+
+    def forward(self, sequence_output):
+        prediction_scores = self.predictions(sequence_output)
+        return prediction_scores
+
+
+class BertOnlyNSPHead(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, pooled_output):
+        seq_relationship_score = self.seq_relationship(pooled_output)
+        return seq_relationship_score
+
+
+class BertPreTrainingHeads(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = BertLMPredictionHead(config)
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, sequence_output, pooled_output):
+        prediction_scores = self.predictions(sequence_output)
+        seq_relationship_score = self.seq_relationship(pooled_output)
+        return prediction_scores, seq_relationship_score
+
+
+class BertPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface
+    for downloading and loading pretrained models.
+    """
+
+    config_class = BertConfig
+    load_tf_weights = load_tf_weights_in_bert
+    base_model_prefix = 'bert'
+    supports_gradient_checkpointing = True
+    _keys_to_ignore_on_load_missing = [r'position_ids']
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(
+                mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(
+                mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, BertEncoder):
+            module.gradient_checkpointing = value
+
+
+@dataclass
+class BertForPreTrainingOutput(ModelOutput):
+    """
+    Output type of [`BertForPreTraining`].
+
+    Args:
+        loss (*optional*, returned when `labels` is provided,
+        `torch.FloatTensor` of shape `(1,)`):
+            Total loss as the sum of the masked language modeling loss and the
+            next sequence prediction (classification) loss.
+        prediction_logits (`torch.FloatTensor` of shape `(batch_size,
+        sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each
+            vocabulary token before SoftMax).
+        seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size,
+        2)`):
+            Prediction scores of the next sequence prediction (classification)
+            head (scores of True/False continuation before SoftMax).
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when
+        `output_hidden_states=True` is passed or when
+        `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings +
+            one for the output of each layer) of shape `(batch_size,
+            sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the
+            initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when
+        `output_attentions=True` is passed or when
+        `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape
+            `(batch_size, num_heads, sequence_length, sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the
+            weighted average in the self-attention heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    prediction_logits: torch.FloatTensor = None
+    seq_relationship_logits: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+BERT_START_DOCSTRING = r"""
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass
+    documentation for the generic methods the library implements for all its
+    models (such as downloading or saving, resizing the input embeddings,
+    pruning heads, etc.).
+
+    This model is also a PyTorch
+    [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch
+    documentation for all matters related to general usage and behavior.
+
+    Parameters:
+        config ([`BertConfig`]):
+            Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated
+            with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model
+            weights.
+"""
+
+BERT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`BertTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
+            for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask
+            values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the
+            inputs. Indices are selected in `[0, 1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence token in the position
+            embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers,
+        num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask
+            values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`,
+        *optional*):
+            Optionally, instead of passing `input_ids` you can choose to
+            directly pass an embedded representation. This is useful if you want
+            more control over how to convert `input_ids` indices into associated
+            vectors than the model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention
+            layers. See `attentions` under returned tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See
+            `hidden_states` under returned tensors for more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a
+            plain tuple.
+"""
+
+
+@add_start_docstrings(
+    'The bare Bert Model transformer outputting raw hidden-states without any specific head on top.',
+    BERT_START_DOCSTRING,
+)
+class BertModel(BertPreTrainedModel):
+    """
+
+    The model can behave as an encoder (with only self-attention) as well as a
+    decoder, in which case a layer of cross-attention is added between the
+    self-attention layers, following the architecture described in [Attention is
+    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam
+    Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
+    Kaiser and Illia Polosukhin.
+
+    To behave as a decoder the model needs to be initialized with the
+    `is_decoder` argument of the configuration set to `True`. To be used in a
+    Seq2Seq model, the model needs to be initialized with both `is_decoder`
+    argument and `add_cross_attention` set to `True`; an `encoder_hidden_states`
+    is then expected as an input to the forward pass.
+    """
+
+    def __init__(self, config, add_pooling_layer=True):
+        super().__init__(config)
+        self.embeddings = BertEmbeddings(config)
+        self.encoder = BertEncoder(config)
+
+        self.pooler = BertPooler(config) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @classmethod
+    def _instantiate(cls, model_dir=None, add_pooling_layer=True, **config):
+        config = BertConfig(**config)
+        model = cls(config, add_pooling_layer)
+        return model
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See base
+        class PreTrainedModel.
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(
+        BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,
+                inputs_embeds=None,
+                encoder_hidden_states=None,
+                encoder_attention_mask=None,
+                past_key_values=None,
+                use_cache=None,
+                output_attentions=None,
+                output_hidden_states=None,
+                return_dict=None,
+                **kwargs):
+        r"""
+        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size,
+        sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the
+            encoder. Used in the cross-attention if the model is configured as a
+            decoder.
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size,
+        sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of
+            the encoder input. This mask is used in the cross-attention if the
+            model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length
+        `config.num_hidden_layers` with each tuple having 4 tensors of shape
+        `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention
+            blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only
+            the last `input_ids` (those that don't have their past key
+            value states given to this model) of shape `(batch_size, 1)` instead
+            of all `input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned
+            and can be used to speed up decoding (see `past_key_values`).
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else
+            self.config.output_hidden_states)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if self.config.is_decoder:
+            use_cache = use_cache if use_cache is not None else self.config.use_cache
+        else:
+            use_cache = False
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                'You cannot specify both input_ids and inputs_embeds at the same time'
+            )
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError(
+                'You have to specify either input_ids or inputs_embeds')
+
+        batch_size, seq_length = input_shape
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        # past_key_values_length
+        past_key_values_length = past_key_values[0][0].shape[
+            2] if past_key_values is not None else 0
+
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                ((batch_size, seq_length + past_key_values_length)),
+                device=device)
+
+        if token_type_ids is None:
+            if hasattr(self.embeddings, 'token_type_ids'):
+                buffered_token_type_ids = self.embeddings.token_type_ids[:, :
+                                                                         seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(
+                    batch_size, seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(
+                    input_shape, dtype=torch.long, device=device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
+            attention_mask, input_shape, device)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size(
+            )
+            encoder_hidden_shape = (encoder_batch_size,
+                                    encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(
+                    encoder_hidden_shape, device=device)
+            encoder_extended_attention_mask = self.invert_attention_mask(
+                encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicates we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask,
+                                       self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(
+            sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+    def extract_sequence_outputs(self, outputs):
+        return outputs['last_hidden_state']
+
+    def extract_pooled_outputs(self, outputs):
+        return outputs['pooler_output']
+
+
+@add_start_docstrings(
+    """
+    Bert Model with two heads on top as done during the pretraining: a `masked
+    language modeling` head and a `next sentence prediction (classification)`
+    head.
+    """,
+    BERT_START_DOCSTRING,
+)
+class BertForPreTraining(BertPreTrainedModel):
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.bert = BertModel(config)
+        self.cls = BertPreTrainingHeads(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+
+    @add_start_docstrings_to_model_forward(
+        BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @replace_return_docstrings(
+        output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        next_sentence_label=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`,
+            *optional*):
+                Labels for computing the masked language modeling loss. Indices
+                should be in `[-100, 0, ..., config.vocab_size]` (see
+                `input_ids` docstring). Tokens with indices set to `-100` are
+                ignored (masked), the loss is only computed for the tokens with
+                labels in `[0, ..., config.vocab_size]`
+            next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`,
+            *optional*):
+                Labels for computing the next sequence prediction
+                (classification) loss. Input should be a sequence pair (see
+                `input_ids` docstring). Indices should be in `[0, 1]`:
+
+                - 0 indicates sequence B is a continuation of sequence A,
+                - 1 indicates sequence B is a random sequence.
+            kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+                Used to hide legacy arguments that have been deprecated.
+
+        Returns:
+
+        Example:
+        ```python
+        >>> from transformers import BertTokenizer, BertForPreTraining
+        >>> import torch
+
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> prediction_logits = outputs.prediction_logits
+        >>> seq_relationship_logits = outputs.seq_relationship_logits
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output, pooled_output = outputs[:2]
+        prediction_scores, seq_relationship_score = self.cls(
+            sequence_output, pooled_output)
+
+        total_loss = None
+        if labels is not None and next_sentence_label is not None:
+            loss_fct = CrossEntropyLoss()
+            masked_lm_loss = loss_fct(
+                prediction_scores.view(-1, self.config.vocab_size),
+                labels.view(-1))
+            next_sentence_loss = loss_fct(
+                seq_relationship_score.view(-1, 2),
+                next_sentence_label.view(-1))
+            total_loss = masked_lm_loss + next_sentence_loss
+
+        if not return_dict:
+            output = (prediction_scores, seq_relationship_score) + outputs[2:]
+            return ((total_loss, )
+                    + output) if total_loss is not None else output
+
+        return BertForPreTrainingOutput(
+            loss=total_loss,
+            prediction_logits=prediction_scores,
+            seq_relationship_logits=seq_relationship_score,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """Bert Model with a `language modeling` head on top for CLM fine-tuning. """,
+    BERT_START_DOCSTRING)
+class BertLMHeadModel(BertPreTrainedModel):
+    # BERT backbone without pooler plus an MLM head, used as a causal LM.
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+    _keys_to_ignore_on_load_missing = [
+        r'position_ids', r'predictions.decoder.bias'
+    ]
+
+    def __init__(self, config):
+        super().__init__(config)
+        # a non-decoder config only triggers a warning, construction goes on
+        if not config.is_decoder:
+            logger.warning(
+                'If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`'
+            )
+
+        self.bert = BertModel(config, add_pooling_layer=False)
+        self.cls = BertOnlyMLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+
+    @add_start_docstrings_to_model_forward(
+        BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @replace_return_docstrings(
+        output_type=CausalLMOutputWithCrossAttentions,
+        config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        labels=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+            encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size,
+            sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the
+                encoder. Used in the cross-attention if the model is configured
+                as a decoder.
+            encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size,
+            sequence_length)`, *optional*):
+                Mask to avoid performing attention on the padding token indices
+                of the encoder input. This mask is used in the cross-attention
+                if the model is configured as a decoder. Mask values selected in
+                `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`,
+            *optional*):
+                Labels for computing the left-to-right language modeling loss
+                (next word prediction). Indices should be in `[-100, 0, ...,
+                config.vocab_size]` (see `input_ids` docstring). Tokens with
+                indices set to `-100` are ignored (masked), the loss is only
+                computed for the tokens with labels in `[0, ...,
+                config.vocab_size]`.
+            past_key_values (`tuple(tuple(torch.FloatTensor))` of length
+            `config.n_layers` with each tuple having 4 tensors of shape
+            `(batch_size, num_heads, sequence_length - 1,
+            embed_size_per_head)`):
+                Contains precomputed key and value hidden states of the
+                attention blocks. Can be used to speed up decoding.
+
+                If `past_key_values` are used, the user can optionally input
+                only the last `decoder_input_ids` (those that don't have their
+                past key value states given to this model) of shape
+                `(batch_size, 1)` instead of all `decoder_input_ids` of shape
+                `(batch_size, sequence_length)`.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are
+                returned and can be used to speed up decoding (see
+                `past_key_values`).
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
+        >>> import torch
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+        >>> config = BertConfig.from_pretrained("bert-base-cased")
+        >>> config.is_decoder = True
+        >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> prediction_logits = outputs.logits
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if labels is not None:
+            use_cache = False  # caching is forced off whenever labels are given
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        lm_loss = None
+        if labels is not None:
+            # we are doing next-token prediction; shift prediction scores and input ids by one
+            shifted_prediction_scores = prediction_scores[:, :
+                                                          -1, :].contiguous()
+            labels = labels[:, 1:].contiguous()
+            loss_fct = CrossEntropyLoss()
+            lm_loss = loss_fct(
+                shifted_prediction_scores.view(-1, self.config.vocab_size),
+                labels.view(-1))
+        # tuple form: logits + outputs[2:], with the loss prepended if computed
+        if not return_dict:
+            output = (prediction_scores, ) + outputs[2:]
+            return ((lm_loss, ) + output) if lm_loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=lm_loss,
+            logits=prediction_scores,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+    def prepare_inputs_for_generation(self,
+                                      input_ids,
+                                      past=None,
+                                      attention_mask=None,
+                                      **model_kwargs):
+        input_shape = input_ids.shape
+        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+        if attention_mask is None:
+            attention_mask = input_ids.new_ones(input_shape)
+
+        # cut decoder_input_ids if past is used
+        if past is not None:
+            input_ids = input_ids[:, -1:]
+
+        return {
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'past_key_values': past
+        }
+
+    def _reorder_cache(self, past, beam_idx):
+        reordered_past = ()
+        for layer_past in past:
+            reordered_past += (tuple(
+                past_state.index_select(0, beam_idx)
+                for past_state in layer_past), )
+        return reordered_past
+
+
+@add_start_docstrings(
+    """Bert Model with a `language modeling` head on top. """,
+    BERT_START_DOCSTRING)
+class BertForMaskedLM(BertPreTrainedModel):
+    # BERT backbone without pooler plus an MLM head (masked-token prediction).
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+    _keys_to_ignore_on_load_missing = [
+        r'position_ids', r'predictions.decoder.bias'
+    ]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        if config.is_decoder:
+            logger.warning(
+                'If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for '
+                'bi-directional self-attention.')
+
+        self.bert = BertModel(config, add_pooling_layer=False)
+        self.cls = BertOnlyMLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+
+    @add_start_docstrings_to_model_forward(
+        BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`,
+        *optional*):
+            Labels for computing the masked language modeling loss. Indices
+            should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids`
+            docstring). Tokens with indices set to `-100` are ignored (masked),
+            the loss is only computed for the tokens with labels in `[0, ...,
+            config.vocab_size]`.
+
+        Yields a `MaskedLMOutput`, or a tuple if `return_dict` resolves to false.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()  # -100 index = padding token
+            masked_lm_loss = loss_fct(
+                prediction_scores.view(-1, self.config.vocab_size),
+                labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores, ) + outputs[2:]
+            return ((masked_lm_loss, )
+                    + output) if masked_lm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(self,
+                                      input_ids,
+                                      attention_mask=None,
+                                      **model_kwargs):
+        """Append one masked-out PAD token to `input_ids` for generation."""
+        if self.config.pad_token_id is None:
+            raise ValueError('The PAD token should be defined for generation')
+        # `attention_mask` defaults to None: attend to every input token then,
+        # rather than failing with AttributeError on `None.new_zeros`.
+        if attention_mask is None:
+            attention_mask = input_ids.new_ones(input_ids.shape)
+
+        padding_mask = attention_mask.new_zeros((attention_mask.shape[0], 1))
+        attention_mask = torch.cat([attention_mask, padding_mask], dim=-1)
+        dummy_token = torch.full((input_ids.shape[0], 1),
+                                 self.config.pad_token_id,
+                                 dtype=torch.long,
+                                 device=input_ids.device)
+        input_ids = torch.cat([input_ids, dummy_token], dim=1)
+        return {'input_ids': input_ids, 'attention_mask': attention_mask}
+
+
+@add_start_docstrings(
+    """Bert Model with a `next sentence prediction (classification)` head on top. """,
+    BERT_START_DOCSTRING,
+)
+class BertForNextSentencePrediction(BertPreTrainedModel):
+    # BERT backbone plus an NSP head scoring two classes from outputs[1].
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.bert = BertModel(config)
+        self.cls = BertOnlyNSPHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(
+        BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @replace_return_docstrings(
+        output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        **kwargs,
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the next sequence prediction (classification)
+            loss. Input should be a sequence pair (see `input_ids` docstring).
+            Indices should be in `[0, 1]`:
+
+            - 0 indicates sequence B is a continuation of sequence A,
+            - 1 indicates sequence B is a random sequence.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import BertTokenizer, BertForNextSentencePrediction
+        >>> import torch
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
+
+        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
+        >>> logits = outputs.logits
+        >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+        ```
+        """
+        # backward compatibility: the deprecated kwarg overrides `labels`
+        if 'next_sentence_label' in kwargs:
+            warnings.warn(
+                'The `next_sentence_label` argument is deprecated, use `labels` instead.',
+                FutureWarning,
+            )
+            labels = kwargs.pop('next_sentence_label')
+        # NOTE: any other unexpected keyword in `kwargs` is silently ignored
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        seq_relationship_scores = self.cls(pooled_output)
+
+        next_sentence_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            next_sentence_loss = loss_fct(
+                seq_relationship_scores.view(-1, 2), labels.view(-1))
+
+        if not return_dict:
+            output = (seq_relationship_scores, ) + outputs[2:]
+            return ((next_sentence_loss, )
+                    + output) if next_sentence_loss is not None else output
+
+        return NextSentencePredictorOutput(
+            loss=next_sentence_loss,
+            logits=seq_relationship_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    Bert Model transformer with a sequence classification/regression head on top
+    (a linear layer on top of the pooled output) e.g. for GLUE tasks.
+    """,
+    BERT_START_DOCSTRING,
+)
+class BertForSequenceClassification(BertPreTrainedModel):
+    # BERT backbone plus dropout and a linear classifier applied to outputs[1].
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+
+        self.bert = BertModel(config)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None
+            else config.hidden_dropout_prob)
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(
+        BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in `[0, ..., config.num_labels - 1]`. If
+            `config.num_labels == 1` a regression loss is computed (Mean-Square
+            loss). If `config.num_labels > 1` a classification loss is computed
+            (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:  # inferred once, kept on config
+                if self.num_labels == 1:
+                    self.config.problem_type = 'regression'
+                elif self.num_labels > 1 and (labels.dtype == torch.long
+                                              or labels.dtype == torch.int):
+                    self.config.problem_type = 'single_label_classification'
+                else:
+                    self.config.problem_type = 'multi_label_classification'
+
+            if self.config.problem_type == 'regression':
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == 'single_label_classification':
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(
+                    logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == 'multi_label_classification':
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits, ) + outputs[2:]
+            return ((loss, ) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    Bert Model with a multiple choice classification head on top (a linear layer
+    on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    BERT_START_DOCSTRING,
+)
+class BertForMultipleChoice(BertPreTrainedModel):
+    # BERT backbone plus dropout and a one-logit scorer applied to each choice.
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.bert = BertModel(config)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None
+            else config.hidden_dropout_prob)
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, 1)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(
+        BERT_INPUTS_DOCSTRING.format(
+            'batch_size, num_choices, sequence_length'))
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss.
+            Indices should be in `[0, ..., num_choices-1]` where `num_choices`
+            is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[
+            1] if input_ids is not None else inputs_embeds.shape[1]
+        # merge the leading axes so every choice becomes its own batch row
+        input_ids = input_ids.view(
+            -1, input_ids.size(-1)) if input_ids is not None else None
+        attention_mask = attention_mask.view(
+            -1,
+            attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(
+            -1,
+            token_type_ids.size(-1)) if token_type_ids is not None else None
+        position_ids = position_ids.view(
+            -1, position_ids.size(-1)) if position_ids is not None else None
+        inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2),
+                               inputs_embeds.size(-1))
+            if inputs_embeds is not None else None)
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = logits.view(-1, num_choices)  # rows of num_choices scores
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+
+        if not return_dict:
+            output = (reshaped_logits, ) + outputs[2:]
+            return ((loss, ) + output) if loss is not None else output
+
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    Bert Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.
+    """,
+    BERT_START_DOCSTRING,
+)
+class BertForTokenClassification(BertPreTrainedModel):
+    # BERT backbone without pooler plus dropout and a per-token classifier.
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.bert = BertModel(config, add_pooling_layer=False)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None
+            else config.hidden_dropout_prob)
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(
+        BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`,
+        *optional*):
+            Labels for computing the token classification loss. Indices should
+            be in `[0, ..., config.num_labels - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+        # labels where attention_mask != 1 are swapped for the loss ignore_index
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            # Only keep active parts of the loss
+            if attention_mask is not None:
+                active_loss = attention_mask.view(-1) == 1
+                active_logits = logits.view(-1, self.num_labels)
+                active_labels = torch.where(
+                    active_loss, labels.view(-1),
+                    torch.tensor(loss_fct.ignore_index).type_as(labels))
+                loss = loss_fct(active_logits, active_labels)
+            else:
+                loss = loss_fct(
+                    logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits, ) + outputs[2:]
+            return ((loss, ) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    Bert Model with a span classification head on top for extractive
+    question-answering tasks like SQuAD (a linear layers on top of the
+    hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    BERT_START_DOCSTRING,
+)
+class BertForQuestionAnswering(BertPreTrainedModel):
+    # BERT backbone without pooler plus a linear layer giving span logits.
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.bert = BertModel(config, add_pooling_layer=False)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(
+        BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        start_positions=None,
+        end_positions=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        start_positions (`torch.LongTensor` of shape `(batch_size,)`,
+        *optional*):
+            Labels for position (index) of the start of the labelled span for
+            computing the token classification loss. Positions are clamped to
+            the length of the sequence (`sequence_length`). Positions outside of
+            the sequence are not taken into account for computing the loss.
+        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the end of the labelled span for
+            computing the token classification loss. Positions are clamped to
+            the length of the sequence (`sequence_length`). Positions outside of
+            the sequence are not taken into account for computing the loss.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        # assumes config.num_labels == 2: the split must yield two tensors
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # On multi-GPU the split may add a trailing dimension; squeeze it
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+            # targets clamped up to ignored_index are skipped by the loss below
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss, )
+                    + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
diff --git a/modelscope/models/nlp/bert_for_document_segmentation.py b/modelscope/models/nlp/bert_for_document_segmentation.py
new file mode 100644
index 00000000..dfa57597
--- /dev/null
+++ b/modelscope/models/nlp/bert_for_document_segmentation.py
@@ -0,0 +1,108 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict
+
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers.modeling_outputs import TokenClassifierOutput
+from transformers.models.bert.modeling_bert import (BertModel,
+                                                    BertPreTrainedModel)
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Model
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import Tasks
+
+__all__ = ['BertForDocumentSegmentation']
+
+
+@MODELS.register_module(
+    Tasks.document_segmentation, module_name=Models.bert_for_ds)
+class BertForDocumentSegmentation(Model):
+    """Document-segmentation model registered under `Models.bert_for_ds`."""
+    def __init__(self, model_dir: str, *args, **kwargs):
+        super().__init__(model_dir, *args, **kwargs)
+
+    def build_with_config(self, config):
+        self.bert_model = BertForDocumentSegmentationBase.from_pretrained(
+            self.model_dir, from_tf=False, config=config)  # load from model_dir
+        return self.bert_model
+
+    def forward(self, input: Dict[str, Dict]) -> Dict[str, Any]:
+        pass  # placeholder: performs no computation and returns None
+
+
+class BertForDocumentSegmentationBase(BertPreTrainedModel):
+    """BERT encoder followed by dropout and a per-position linear classifier."""
+
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        # Only `None` is supported; `forward` raises for any other value.
+        self.sentence_pooler_type = None
+        self.bert = BertModel(config, add_pooling_layer=False)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        # Passed as `weight` to CrossEntropyLoss; `None` means unweighted.
+        self.class_weights = None
+        self.init_weights()
+
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,
+                sentence_attention_mask=None,
+                inputs_embeds=None,
+                labels=None,
+                output_attentions=None,
+                output_hidden_states=None,
+                return_dict=None):
+        """Classify every position of the BERT sequence output.
+
+        When `labels` is given, a cross-entropy loss over the flattened logits
+        is computed; positions whose `sentence_attention_mask` is not 1 are
+        ignored. All other arguments are forwarded to `self.bert`. Returns a
+        `TokenClassifierOutput`, or a tuple when `return_dict` is falsy.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        if self.sentence_pooler_type is not None:
+            raise NotImplementedError
+        logits = self.classifier(self.dropout(outputs[0]))
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss(weight=self.class_weights)
+            flat_labels = labels.view(-1)
+            if sentence_attention_mask is not None:
+                # Inactive positions get `ignore_index` so the loss skips them.
+                # Tensor methods are used: this module does not import `torch`.
+                active = sentence_attention_mask.view(-1) == 1
+                flat_labels = flat_labels.masked_fill(
+                    ~active, loss_fct.ignore_index)
+            loss = loss_fct(logits.view(-1, self.num_labels), flat_labels)
+
+        if not return_dict:
+            output = (logits, ) + outputs[2:]
+            return ((loss, ) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
diff --git a/modelscope/models/nlp/bert_for_sequence_classification.py b/modelscope/models/nlp/bert_for_sequence_classification.py
deleted file mode 100644
index 75105f36..00000000
--- a/modelscope/models/nlp/bert_for_sequence_classification.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import os
-from typing import Any, Dict
-
-import json
-import numpy as np
-
-from modelscope.metainfo import Models
-from modelscope.models import TorchModel
-from modelscope.models.builder import MODELS
-from modelscope.utils.constant import Tasks
-
-__all__ = ['BertForSequenceClassification']
-
-
-@MODELS.register_module(Tasks.text_classification, module_name=Models.bert)
-class BertForSequenceClassification(TorchModel):
-
-    def __init__(self, model_dir: str, *args, **kwargs):
-        # Model.__init__(self, model_dir, model_cls, first_sequence, *args, **kwargs)
-        # Predictor.__init__(self, *args, **kwargs)
-        """initialize the sequence classification model from the `model_dir` path.
-
-        Args:
-            model_dir (str): the model path.
-        """
-
-        super().__init__(model_dir, *args, **kwargs)
-        import torch
-        from easynlp.appzoo import SequenceClassification
-        from easynlp.core.predictor import get_model_predictor
-        self.model = get_model_predictor(
-            model_dir=self.model_dir,
-            model_cls=SequenceClassification,
-            input_keys=[('input_ids', torch.LongTensor),
-                        ('attention_mask', torch.LongTensor),
-                        ('token_type_ids', torch.LongTensor)],
-            output_keys=['predictions', 'probabilities', 'logits'])
-
-        self.label_path = os.path.join(self.model_dir, 'label_mapping.json')
-        with open(self.label_path) as f:
-            self.label_mapping = json.load(f)
-        self.id2label = {idx: name for name, idx in self.label_mapping.items()}
-
-    def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
-        """return the result by the model
-
-        Args:
-            input (Dict[str, Any]): the preprocessed data
-
-        Returns:
-            Dict[str, np.ndarray]: results
-                Example:
-                    {
-                        'predictions': array([1]), # lable 0-negative 1-positive
-                        'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32),
-                        'logits': array([[-0.53860897,  1.5029076 ]], dtype=float32) # true value
-                    }
-        """
-        return self.model.predict(input)
-
-    def postprocess(self, inputs: Dict[str, np.ndarray],
-                    **kwargs) -> Dict[str, np.ndarray]:
-        # N x num_classes
-        probs = inputs['probabilities']
-        result = {
-            'probs': probs,
-        }
-
-        return result
diff --git a/modelscope/models/nlp/csanmt_for_translation.py b/modelscope/models/nlp/csanmt_for_translation.py
index 83b58060..4bac8e6d 100644
--- a/modelscope/models/nlp/csanmt_for_translation.py
+++ b/modelscope/models/nlp/csanmt_for_translation.py
@@ -1,3 +1,6 @@
+# Part of the implementation is borrowed and modified from THUMT,
+# publicly available at https://github.com/THUNLP-MT/THUMT
+# Copyright 2017-2022 The Alibaba MT Team Authors. All rights reserved.
 import math
 from collections import namedtuple
 from typing import Dict
diff --git a/modelscope/models/nlp/deberta_v2/__init__.py b/modelscope/models/nlp/deberta_v2/__init__.py
new file mode 100644
index 00000000..830210ed
--- /dev/null
+++ b/modelscope/models/nlp/deberta_v2/__init__.py
@@ -0,0 +1,63 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .configuration_deberta_v2 import DebertaV2Config
+    from .tokenization_deberta_v2 import DebertaV2Tokenizer
+    from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast
+
+    from .modeling_deberta_v2 import (
+        DebertaV2ForMaskedLM,
+        DebertaV2ForMultipleChoice,
+        DebertaV2ForQuestionAnswering,
+        DebertaV2ForSequenceClassification,
+        DebertaV2ForTokenClassification,
+        DebertaV2Model,
+        DebertaV2PreTrainedModel,
+    )
+
+else:
+    # Each key is a submodule; each value lists names that submodule defines.
+    _import_structure = {
+        'configuration_deberta_v2': ['DebertaV2Config'],
+        'tokenization_deberta_v2': ['DebertaV2Tokenizer']
+    }
+    _import_structure['tokenization_deberta_v2_fast'] = [
+        'DebertaV2TokenizerFast'
+    ]
+    _import_structure['modeling_deberta_v2'] = [
+        'DebertaV2ForMaskedLM',
+        'DebertaV2ForMultipleChoice',
+        'DebertaV2ForQuestionAnswering',
+        'DebertaV2ForSequenceClassification',
+        'DebertaV2ForTokenClassification',
+        'DebertaV2Model',
+        'DebertaV2PreTrainedModel',
+    ]
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__)
diff --git a/modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py b/modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py
new file mode 100644
index 00000000..65e8f0b7
--- /dev/null
+++ b/modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py
@@ -0,0 +1,130 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2020, Microsoft and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" DeBERTa-v2 model configuration, mainly copied from :class:`~transformers.DeBERTaV2Config"""
+from collections import OrderedDict
+from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
+
+from transformers import PretrainedConfig
+
+from modelscope.utils import logger as logging
+
+logger = logging.get_logger(__name__)
+
+
+class DebertaV2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DebertaV2Model`]. It is used to instantiate a
+    DeBERTa-v2 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the DeBERTa
+    [microsoft/deberta-v2-xlarge](https://huggingface.co/microsoft/deberta-v2-xlarge) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Arguments:
+        vocab_size (`int`, *optional*, defaults to 128100):
+            Vocabulary size of the DeBERTa-v2 model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`DebertaV2Model`].
+        hidden_size (`int`, *optional*, defaults to 1536):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 24):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 6144):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"`, `"tanh"`, `"gelu_fast"`, `"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"`
+            are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 0):
+            The vocabulary size of the `token_type_ids` passed when calling [`DebertaV2Model`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-7):
+            The epsilon used by the layer normalization layers.
+        relative_attention (`bool`, *optional*, defaults to `False`):
+            Whether use relative position encoding.
+        max_relative_positions (`int`, *optional*, defaults to -1):
+            The range of relative positions. Values below 1 make the encoder fall back to
+            `max_position_embeddings`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The value used to pad input_ids.
+        position_biased_input (`bool`, *optional*, defaults to `True`):
+            Whether add absolute position embedding to content embedding.
+        pos_att_type (`List[str]`, *optional*):
+            The type of relative position attention, it can be a combination of `["p2c", "c2p"]`, e.g. `["p2c"]`,
+            `["c2p"]`, `["p2c", "c2p"]`. A `"|"`-separated string is also accepted and split into a list.
+        pooler_dropout (`float`, *optional*, defaults to 0): Dropout probability used by the context pooler.
+        pooler_hidden_act (`str`, *optional*, defaults to `"gelu"`): Activation name used by the context pooler.
+    """
+    model_type = 'deberta_v2'
+
+    def __init__(self,
+                 vocab_size=128100,
+                 hidden_size=1536,
+                 num_hidden_layers=24,
+                 num_attention_heads=24,
+                 intermediate_size=6144,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=0,
+                 initializer_range=0.02,
+                 layer_norm_eps=1e-7,
+                 relative_attention=False,
+                 max_relative_positions=-1,
+                 pad_token_id=0,
+                 position_biased_input=True,
+                 pos_att_type=None,
+                 pooler_dropout=0,
+                 pooler_hidden_act='gelu',
+                 **kwargs):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.relative_attention = relative_attention
+        self.max_relative_positions = max_relative_positions
+        self.pad_token_id = pad_token_id
+        self.position_biased_input = position_biased_input
+
+        # Backwards compatibility: accept a '|'-separated string such as 'p2c|c2p'.
+        if isinstance(pos_att_type, str):
+            pos_att_type = [x.strip() for x in pos_att_type.lower().split('|')]
+
+        self.pos_att_type = pos_att_type
+        self.vocab_size = vocab_size
+        self.layer_norm_eps = layer_norm_eps
+
+        # Falls back to `hidden_size` when `pooler_hidden_size` is not supplied.
+        self.pooler_hidden_size = kwargs.get('pooler_hidden_size', hidden_size)
+        self.pooler_dropout = pooler_dropout
+        self.pooler_hidden_act = pooler_hidden_act
diff --git a/modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py b/modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py
new file mode 100644
index 00000000..1c6b9071
--- /dev/null
+++ b/modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py
@@ -0,0 +1,1789 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2020 Microsoft and the Hugging Face Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch DeBERTa-v2 model."""
+
+from collections.abc import Sequence
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
+from transformers.activations import ACT2FN
+from transformers.file_utils import (add_code_sample_docstrings,
+                                     add_start_docstrings,
+                                     add_start_docstrings_to_model_forward)
+from transformers.modeling_outputs import (BaseModelOutput, MaskedLMOutput,
+                                           MultipleChoiceModelOutput,
+                                           QuestionAnsweringModelOutput,
+                                           SequenceClassifierOutput,
+                                           TokenClassifierOutput)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.pytorch_utils import softmax_backward_data
+
+from modelscope.utils import logger as logging
+from .configuration_deberta_v2 import DebertaV2Config
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = 'DebertaV2Config'
+_TOKENIZER_FOR_DOC = 'DebertaV2Tokenizer'
+_CHECKPOINT_FOR_DOC = 'nlp_debertav2_fill-mask_chinese-lite'
+
+
+# Copied from transformers.models.deberta.modeling_deberta.ContextPooler
+class ContextPooler(nn.Module):
+    """Pools a sequence into one vector taken from its first position."""
+
+    def __init__(self, config):
+        super().__init__()
+        pooler_dim = config.pooler_hidden_size
+        self.dense = nn.Linear(pooler_dim, pooler_dim)
+        self.dropout = StableDropout(config.pooler_dropout)
+        self.config = config
+
+    def forward(self, hidden_states):
+        """Return activation(dense(dropout(hidden_states[:, 0])))."""
+        # The "pooled" representation is derived solely from the hidden state
+        # at index 0 of dimension 1.
+        first_token = self.dropout(hidden_states[:, 0])
+        pooled = self.dense(first_token)
+        return ACT2FN[self.config.pooler_hidden_act](pooled)
+
+    @property
+    def output_dim(self):
+        """Size reported to callers; read from `config.hidden_size`."""
+        return self.config.hidden_size
+
+
+# Copied from transformers.models.deberta.modeling_deberta.XSoftmax with deberta->deberta_v2
+class XSoftmax(torch.autograd.Function):
+    """
+    Masked softmax implemented as a custom autograd function; only the output is saved for backward.
+
+    Args:
+        input (`torch.tensor`): The input tensor that will apply softmax.
+        mask (`torch.IntTensor`):
+            The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
+        dim (int): The dimension that will apply softmax
+
+    Example:
+
+    ```python
+    >>> import torch
+    >>> from modelscope.models.nlp.deberta_v2.modeling_deberta_v2 import XSoftmax
+
+    >>> # Make a tensor
+    >>> x = torch.randn([4, 20, 100])
+
+    >>> # Create a mask
+    >>> mask = (x > 0).int()
+
+    >>> # Specify the dimension to apply softmax
+    >>> dim = -1
+
+    >>> y = XSoftmax.apply(x, mask, dim)
+    ```"""
+
+    @staticmethod
+    def forward(self, input, mask, dim):  # `self` is the autograd context
+        self.dim = dim
+        rmask = ~(mask.to(torch.bool))  # True where `mask` is 0
+        # Masked entries get the dtype minimum, then are zeroed after softmax.
+        output = input.masked_fill(rmask,
+                                   torch.tensor(torch.finfo(input.dtype).min))
+        output = torch.softmax(output, self.dim)
+        output.masked_fill_(rmask, 0)
+        self.save_for_backward(output)  # the only tensor backward reads
+        return output
+
+    @staticmethod
+    def backward(self, grad_output):
+        (output, ) = self.saved_tensors
+        inputGrad = softmax_backward_data(self, grad_output, output, self.dim,
+                                          output)
+        return inputGrad, None, None  # no gradients for `mask` and `dim`
+
+    @staticmethod
+    def symbolic(g, self, mask, dim):
+        import torch.onnx.symbolic_helper as sym_help
+        from torch.onnx.symbolic_opset9 import masked_fill, softmax
+        # ONNX graph of the same steps: r_mask is Cast(1 - Cast(mask)).
+        mask_cast_value = g.op(
+            'Cast', mask, to_i=sym_help.cast_pytorch_to_onnx['Long'])
+        r_mask = g.op(
+            'Cast',
+            g.op('Sub',
+                 g.op('Constant', value_t=torch.tensor(1, dtype=torch.int64)),
+                 mask_cast_value),
+            to_i=sym_help.cast_pytorch_to_onnx['Byte'],
+        )
+        output = masked_fill(
+            g, self, r_mask,
+            g.op(
+                'Constant',
+                value_t=torch.tensor(torch.finfo(self.type().dtype()).min)))
+        output = softmax(g, output, dim)
+        return masked_fill(
+            g, output, r_mask,
+            g.op('Constant', value_t=torch.tensor(0, dtype=torch.uint8)))
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DropoutContext
+class DropoutContext(object):
+    """Mutable dropout state that `get_mask` reads and updates."""
+    def __init__(self):
+        self.dropout = 0  # dropout probability; multiplied by `scale` on use
+        self.mask = None  # cached mask; `get_mask` stores one here if unset
+        self.scale = 1  # multiplier applied to `dropout` in `get_mask`
+        self.reuse_mask = True  # if falsy, `get_mask` ignores the cached mask
+
+
+# Copied from transformers.models.deberta.modeling_deberta.get_mask
+def get_mask(input, local_context):
+    """Return `(mask, dropout)` for a dropout value or a `DropoutContext`.
+
+    A context's cached mask may be reused, and is filled in when still unset.
+    """
+    has_context = isinstance(local_context, DropoutContext)
+    if has_context:
+        dropout = local_context.dropout
+        dropout *= local_context.scale
+        mask = local_context.mask if local_context.reuse_mask else None
+    else:
+        dropout, mask = local_context, None
+    if dropout > 0 and mask is None:
+        keep = torch.empty_like(input).bernoulli_(1 - dropout)
+        mask = (1 - keep).to(torch.bool)
+    if has_context and local_context.mask is None:
+        local_context.mask = mask
+    return mask, dropout
+
+
+# Copied from transformers.models.deberta.modeling_deberta.XDropout
+class XDropout(torch.autograd.Function):
+    """Dropout autograd function that zero-fills masked entries and rescales the rest."""
+
+    @staticmethod
+    def forward(ctx, input, local_ctx):
+        """Apply the mask from `get_mask`; `input` is returned as-is unless dropout > 0."""
+        mask, dropout = get_mask(input, local_ctx)
+        ctx.scale = 1.0 / (1 - dropout)
+        if not dropout > 0:
+            return input
+        ctx.save_for_backward(mask)
+        return input.masked_fill(mask, 0) * ctx.scale
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # scale > 1 implies forward ran with dropout > 0 and saved its mask.
+        if not ctx.scale > 1:
+            return grad_output, None
+        (drop_mask, ) = ctx.saved_tensors
+        return grad_output.masked_fill(drop_mask, 0) * ctx.scale, None
+
+    @staticmethod
+    def symbolic(g: torch._C.Graph, input: torch._C.Value,
+                 local_ctx: Union[float, DropoutContext]) -> torch._C.Value:
+        from torch.onnx import symbolic_opset12
+        if isinstance(local_ctx, DropoutContext):
+            dropout_p = local_ctx.dropout
+        else:
+            dropout_p = local_ctx
+        # StableDropout only calls this function when training.
+        train = True
+        # TODO: We should check if the opset_version being used to export
+        # is > 12 here, but there's no good way to do that. As-is, if the
+        # opset_version < 12, export will fail with a CheckerError.
+        # Once https://github.com/pytorch/pytorch/issues/78391 is fixed, do something like:
+        # if opset_version < 12:
+        #   return torch.onnx.symbolic_opset9.dropout(g, input, dropout_p, train)
+        return symbolic_opset12.dropout(g, input, dropout_p, train)
+
+
+# Copied from transformers.models.deberta.modeling_deberta.StableDropout
+class StableDropout(nn.Module):
+    """
+    Dropout module that applies `XDropout` during training only.
+
+    Args:
+        drop_prob (float): probability handed to `XDropout`
+    """
+
+    def __init__(self, drop_prob):
+        super().__init__()
+        self.drop_prob = drop_prob
+        self.count = 0  # index of the next context to hand out
+        self.context_stack = None  # list of DropoutContext once initialised
+
+    def forward(self, x):
+        """Return `x` unchanged unless training with `drop_prob > 0`.
+
+        Args:
+            x (`torch.tensor`): The input tensor to apply dropout
+        """
+        if not (self.training and self.drop_prob > 0):
+            return x
+        return XDropout.apply(x, self.get_context())
+
+    def clear_context(self):
+        """Drop all contexts; `get_context` then returns the bare probability."""
+        self.context_stack = None
+        self.count = 0
+
+    def init_context(self, reuse_mask=True, scale=1):
+        """Rewind the counter and apply both settings to existing contexts."""
+        self.count = 0
+        if self.context_stack is None:
+            self.context_stack = []
+        for context in self.context_stack:
+            context.reuse_mask, context.scale = reuse_mask, scale
+
+    def get_context(self):
+        """Return the next `DropoutContext`, or `drop_prob` without a stack."""
+        if self.context_stack is None:
+            return self.drop_prob
+        if self.count >= len(self.context_stack):
+            self.context_stack.append(DropoutContext())
+        context = self.context_stack[self.count]
+        context.dropout = self.drop_prob
+        self.count += 1
+        return context
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaSelfOutput with DebertaLayerNorm->LayerNorm
+class DebertaV2SelfOutput(nn.Module):
+    """Dense projection, dropout, then LayerNorm over the residual sum."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+        self.dropout = StableDropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        """Return LayerNorm(dropout(dense(hidden_states)) + input_tensor)."""
+        projected = self.dropout(self.dense(hidden_states))
+        return self.LayerNorm(projected + input_tensor)
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaAttention with Deberta->DebertaV2
+class DebertaV2Attention(nn.Module):
+    """Disentangled self-attention followed by its output projection block."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.self = DisentangledSelfAttention(config)
+        self.output = DebertaV2SelfOutput(config)
+        self.config = config
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        output_attentions=False,
+        query_states=None,
+        relative_pos=None,
+        rel_embeddings=None,
+    ):
+        """Return the attention output, plus the attention matrix if requested."""
+        attn_result = self.self(
+            hidden_states,
+            attention_mask,
+            output_attentions,
+            query_states=query_states,
+            relative_pos=relative_pos,
+            rel_embeddings=rel_embeddings,
+        )
+
+        # The residual input defaults to `hidden_states`.
+        residual = hidden_states if query_states is None else query_states
+        if not output_attentions:
+            return self.output(attn_result, residual)
+        # With `output_attentions`, the sub-module result is a 2-item pair.
+        context, att_matrix = attn_result
+        return (self.output(context, residual), att_matrix)
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->DebertaV2
+class DebertaV2Intermediate(nn.Module):
+    """Feed-forward expansion: dense to `intermediate_size`, then activation."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        # `hidden_act` is either a key of ACT2FN or the callable itself.
+        act = config.hidden_act
+        self.intermediate_act_fn = ACT2FN[act] if isinstance(act, str) else act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply the dense layer followed by the activation function."""
+        expanded = self.dense(hidden_states)
+        return self.intermediate_act_fn(expanded)
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaOutput with DebertaLayerNorm->LayerNorm
+class DebertaV2Output(nn.Module):
+    """Dense projection to `hidden_size`, dropout, then residual LayerNorm."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+        self.dropout = StableDropout(config.hidden_dropout_prob)
+        self.config = config
+
+    def forward(self, hidden_states, input_tensor):
+        """Return LayerNorm(dropout(dense(hidden_states)) + input_tensor)."""
+        projected = self.dropout(self.dense(hidden_states))
+        return self.LayerNorm(projected + input_tensor)
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaLayer with Deberta->DebertaV2
+class DebertaV2Layer(nn.Module):
+    """One encoder layer: attention, intermediate dense, output block."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.attention = DebertaV2Attention(config)
+        self.intermediate = DebertaV2Intermediate(config)
+        self.output = DebertaV2Output(config)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        query_states=None,
+        relative_pos=None,
+        rel_embeddings=None,
+        output_attentions=False,
+    ):
+        """Return the layer output, plus the attention matrix if requested."""
+        attn_result = self.attention(
+            hidden_states,
+            attention_mask,
+            output_attentions=output_attentions,
+            query_states=query_states,
+            relative_pos=relative_pos,
+            rel_embeddings=rel_embeddings,
+        )
+        if output_attentions:
+            attn_result, att_matrix = attn_result
+        layer_output = self.output(self.intermediate(attn_result), attn_result)
+        if output_attentions:
+            return (layer_output, att_matrix)
+        return layer_output
+
+
+class ConvLayer(nn.Module):
+    """1-D convolution over the sequence with a residual LayerNorm."""
+
+    def __init__(self, config):
+        super().__init__()
+        kernel_size = getattr(config, 'conv_kernel_size', 3)
+        groups = getattr(config, 'conv_groups', 1)
+        self.conv_act = getattr(config, 'conv_act', 'tanh')
+        self.conv = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            kernel_size,
+            padding=(kernel_size - 1) // 2,
+            groups=groups)
+        self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+        self.dropout = StableDropout(config.hidden_dropout_prob)
+        self.config = config
+
+    def forward(self, hidden_states, residual_states, input_mask):
+        """Convolve `hidden_states`, add `residual_states`, apply LayerNorm.
+
+        `input_mask` may be `None`; then no position is zeroed or masked.
+        """
+        out = self.conv(hidden_states.permute(0, 2, 1).contiguous()).permute(
+            0, 2, 1).contiguous()
+        if input_mask is not None:
+            # Zero the conv output wherever `1 - input_mask` is non-zero.
+            rmask = (1 - input_mask).bool()
+            out.masked_fill_(rmask.unsqueeze(-1).expand(out.size()), 0)
+        out = ACT2FN[self.conv_act](self.dropout(out))
+        layer_norm_input = residual_states + out
+        output = self.LayerNorm(layer_norm_input).to(layer_norm_input)
+        if input_mask is None:
+            return output
+        if input_mask.dim() != layer_norm_input.dim():
+            if input_mask.dim() == 4:
+                input_mask = input_mask.squeeze(1).squeeze(1)
+            input_mask = input_mask.unsqueeze(2)
+        return output * input_mask.to(output.dtype)
+
+
+class DebertaV2Encoder(nn.Module):
+    """Modified BertEncoder with relative position bias support
+
+    Holds `config.num_hidden_layers` `DebertaV2Layer` modules and, depending on the config, a relative
+    position embedding table, a LayerNorm for that table, and a `ConvLayer` applied after the first layer.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.layer = nn.ModuleList(
+            [DebertaV2Layer(config) for _ in range(config.num_hidden_layers)])
+        self.relative_attention = getattr(config, 'relative_attention', False)
+
+        if self.relative_attention:
+            # A value below 1 means "not set": fall back to the maximum
+            # absolute position count.
+            self.max_relative_positions = getattr(config,
+                                                  'max_relative_positions', -1)
+            if self.max_relative_positions < 1:
+                self.max_relative_positions = config.max_position_embeddings
+
+            # The table has two entries per distance/bucket (both directions);
+            # position_buckets, when positive, overrides the size.
+            self.position_buckets = getattr(config, 'position_buckets', -1)
+            pos_ebd_size = self.max_relative_positions * 2
+
+            if self.position_buckets > 0:
+                pos_ebd_size = self.position_buckets * 2
+
+            self.rel_embeddings = nn.Embedding(pos_ebd_size,
+                                               config.hidden_size)
+
+        # `norm_rel_ebd` is a '|'-separated, case-insensitive list of options;
+        # only 'layer_norm' is checked in this class.
+        self.norm_rel_ebd = [
+            x.strip()
+            for x in getattr(config, 'norm_rel_ebd', 'none').lower().split('|')
+        ]
+
+        if 'layer_norm' in self.norm_rel_ebd:
+            self.LayerNorm = LayerNorm(
+                config.hidden_size,
+                config.layer_norm_eps,
+                elementwise_affine=True)
+
+        # The conv branch exists only when conv_kernel_size is set and positive.
+        self.conv = ConvLayer(config) if getattr(config, 'conv_kernel_size',
+                                                 0) > 0 else None
+        self.gradient_checkpointing = False
+
+    def get_rel_embedding(self):
+        """Return the relative position embedding weights (layer-normed if configured), or None."""
+        rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None
+        if rel_embeddings is not None and ('layer_norm' in self.norm_rel_ebd):
+            rel_embeddings = self.LayerNorm(rel_embeddings)
+        return rel_embeddings
+
+    def get_attention_mask(self, attention_mask):
+        """Expand a mask with at most 2 dims into a pairwise 4-D byte mask; add a head dim to a 3-D mask.
+
+        Masks with any other number of dims are returned unchanged.
+        """
+        if attention_mask.dim() <= 2:
+            # Product of the mask with its own transpose: entry (i, j) is
+            # non-zero only when both token i and token j are unmasked.
+            extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+            attention_mask = extended_attention_mask * extended_attention_mask.squeeze(
+                -2).unsqueeze(-1)
+            attention_mask = attention_mask.byte()
+        elif attention_mask.dim() == 3:
+            attention_mask = attention_mask.unsqueeze(1)
+
+        return attention_mask
+
+    def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
+        """Return `relative_pos`, building it from the sequence lengths when relative attention is on
+        and none was supplied. The query length comes from `query_states` if given, otherwise from
+        `hidden_states`."""
+        if self.relative_attention and relative_pos is None:
+            q = query_states.size(
+                -2) if query_states is not None else hidden_states.size(-2)
+            relative_pos = build_relative_position(
+                q,
+                hidden_states.size(-2),
+                bucket_size=self.position_buckets,
+                max_position=self.max_relative_positions)
+        return relative_pos
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        output_hidden_states=True,
+        output_attentions=False,
+        query_states=None,
+        relative_pos=None,
+        return_dict=True,
+    ):
+        """
+        Run all layers in order.
+
+        Args:
+            hidden_states: a tensor, or a `Sequence` whose element `i` is used as the key/value input of
+                layer `i` (only consulted per layer when `query_states` is given).
+            attention_mask: mask tensor; see `get_attention_mask` for the accepted ranks.
+            output_hidden_states: collect the input of every layer plus the final output.
+            output_attentions: collect each layer's attention matrix.
+            query_states: optional query input; when given it is replaced by each layer's output.
+            relative_pos: optional precomputed relative positions.
+            return_dict: return a `BaseModelOutput` instead of a tuple.
+
+        Returns:
+            `BaseModelOutput(last_hidden_state, hidden_states, attentions)`, or a tuple of the non-None
+            values among those three when `return_dict` is falsy.
+        """
+        # Per-token mask handed to the conv layer: the mask itself when it has
+        # at most 2 dims, otherwise whether any entry along dim -2 is set.
+        if attention_mask.dim() <= 2:
+            input_mask = attention_mask
+        else:
+            input_mask = (attention_mask.sum(-2) > 0).byte()
+        attention_mask = self.get_attention_mask(attention_mask)
+        relative_pos = self.get_rel_pos(hidden_states, query_states,
+                                        relative_pos)
+
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        if isinstance(hidden_states, Sequence):
+            next_kv = hidden_states[0]
+        else:
+            next_kv = hidden_states
+        rel_embeddings = self.get_rel_embedding()
+        output_states = next_kv
+        for i, layer_module in enumerate(self.layer):
+
+            # Record the state entering this layer; the final output is
+            # appended after the loop.
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (output_states, )
+
+            if self.gradient_checkpointing and self.training:
+
+                # The wrapper appends `output_attentions` as the last
+                # positional argument, matching DebertaV2Layer.forward's
+                # parameter order.
+                def create_custom_forward(module):
+
+                    def custom_forward(*inputs):
+                        return module(*inputs, output_attentions)
+
+                    return custom_forward
+
+                output_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(layer_module),
+                    next_kv,
+                    attention_mask,
+                    query_states,
+                    relative_pos,
+                    rel_embeddings,
+                )
+            else:
+                output_states = layer_module(
+                    next_kv,
+                    attention_mask,
+                    query_states=query_states,
+                    relative_pos=relative_pos,
+                    rel_embeddings=rel_embeddings,
+                    output_attentions=output_attentions,
+                )
+
+            if output_attentions:
+                output_states, att_m = output_states
+
+            # The conv branch is applied once, after the first layer, using the
+            # encoder's original input as the convolution input.
+            if i == 0 and self.conv is not None:
+                output_states = self.conv(hidden_states, output_states,
+                                          input_mask)
+
+            # With explicit query states, the layer output becomes the next
+            # query and the key/value input comes from the `hidden_states`
+            # sequence (if it is one); otherwise the output feeds the next
+            # layer directly.
+            if query_states is not None:
+                query_states = output_states
+                if isinstance(hidden_states, Sequence):
+                    next_kv = hidden_states[i + 1] if i + 1 < len(
+                        self.layer) else None
+            else:
+                next_kv = output_states
+
+            if output_attentions:
+                all_attentions = all_attentions + (att_m, )
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (output_states, )
+
+        if not return_dict:
+            return tuple(
+                v for v in [output_states, all_hidden_states, all_attentions]
+                if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=output_states,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions)
+
+
+def make_log_bucket_position(relative_pos, bucket_size, max_position):
+    """
+    Map relative positions to bucket ids.
+
+    Positions whose magnitude is at most `bucket_size // 2` are kept as they are; larger magnitudes
+    are compressed logarithmically and keep the sign of the original position.
+    """
+    half_bucket = bucket_size // 2
+    direction = torch.sign(relative_pos)
+
+    # Inside the open interval (-half_bucket, half_bucket) the magnitude is
+    # replaced by a placeholder so the logarithm below stays finite.
+    within_half = (relative_pos < half_bucket) & (relative_pos > -half_bucket)
+    magnitude = torch.where(
+        within_half,
+        torch.tensor(half_bucket - 1).type_as(relative_pos),
+        torch.abs(relative_pos),
+    )
+
+    log_numerator = torch.log(magnitude / half_bucket)
+    log_denominator = torch.log(
+        torch.tensor((max_position - 1) / half_bucket))
+    log_bucket = torch.ceil(
+        log_numerator / log_denominator * (half_bucket - 1)) + half_bucket
+
+    return torch.where(magnitude <= half_bucket,
+                       relative_pos.type_as(log_bucket),
+                       log_bucket * direction)
+
+
+def build_relative_position(query_size,
+                            key_size,
+                            bucket_size=-1,
+                            max_position=-1):
+    """
+    Build the matrix of relative positions between every query and key index.
+
+    Query positions are 0..query_size-1 and key positions are 0..key_size-1; entry [q, k] is
+    q - k. When both `bucket_size` and `max_position` are positive the offsets are additionally
+    passed through `make_log_bucket_position`.
+
+    Args:
+        query_size (int): the length of query
+        key_size (int): the length of key
+        bucket_size (int): the size of position bucket
+        max_position (int): the maximum allowed absolute position
+
+    Return:
+        `torch.LongTensor`: A tensor with shape [1, query_size, key_size]
+
+    """
+    query_positions = torch.arange(0, query_size)
+    key_positions = torch.arange(0, key_size)
+    offsets = query_positions[:, None] - key_positions[None, :]
+
+    use_log_buckets = bucket_size > 0 and max_position > 0
+    if use_log_buckets:
+        offsets = make_log_bucket_position(offsets, bucket_size, max_position)
+
+    offsets = offsets.to(torch.long)
+    return offsets[:query_size, :].unsqueeze(0)
+
+
+@torch.jit.script
+# Copied from transformers.models.deberta.modeling_deberta.c2p_dynamic_expand
+def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
+    # Expand `c2p_pos` to a 4-D shape whose first three sizes are taken from
+    # `query_layer` and whose last size is the last dimension of
+    # `relative_pos`. Compiled with TorchScript via the decorator above.
+    return c2p_pos.expand([
+        query_layer.size(0),
+        query_layer.size(1),
+        query_layer.size(2),
+        relative_pos.size(-1)
+    ])
+
+
+@torch.jit.script
+# Copied from transformers.models.deberta.modeling_deberta.p2c_dynamic_expand
+def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
+    # Expand the index tensor (the parameter is named `c2p_pos` here too) to
+    # [query_layer.size(0), query_layer.size(1), K, K], where K is
+    # `key_layer.size(-2)`. Compiled with TorchScript via the decorator above.
+    return c2p_pos.expand([
+        query_layer.size(0),
+        query_layer.size(1),
+        key_layer.size(-2),
+        key_layer.size(-2)
+    ])
+
+
+@torch.jit.script
+# Copied from transformers.models.deberta.modeling_deberta.pos_dynamic_expand
+def pos_dynamic_expand(pos_index, p2c_att, key_layer):
+    # Expand `pos_index` to the first two sizes of `p2c_att` followed by
+    # (pos_index.size(-2), key_layer.size(-2)). Compiled with TorchScript via
+    # the decorator above.
+    return pos_index.expand(p2c_att.size()[:2]
+                            + (pos_index.size(-2), key_layer.size(-2)))
+
+
+class DisentangledSelfAttention(nn.Module):
+    """
+    Disentangled self-attention module
+
+    Content-to-content scores are combined with optional content-to-position ('c2p') and
+    position-to-content ('p2c') scores, selected by `config.pos_att_type`.
+
+    Parameters:
+        config (`DebertaV2Config`):
+            A model config class instance with the configuration to build a new model. The schema is similar to
+            *BertConfig*, for more details, please refer [`DebertaV2Config`]
+
+    Raises:
+        ValueError: if `config.hidden_size` is not a multiple of `config.num_attention_heads`.
+
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention '
+                f'heads ({config.num_attention_heads})')
+        self.num_attention_heads = config.num_attention_heads
+        # The per-head size defaults to hidden_size / num_heads but may be
+        # overridden through config.attention_head_size.
+        _attention_head_size = config.hidden_size // config.num_attention_heads
+        self.attention_head_size = getattr(config, 'attention_head_size',
+                                           _attention_head_size)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.query_proj = nn.Linear(
+            config.hidden_size, self.all_head_size, bias=True)
+        self.key_proj = nn.Linear(
+            config.hidden_size, self.all_head_size, bias=True)
+        self.value_proj = nn.Linear(
+            config.hidden_size, self.all_head_size, bias=True)
+
+        self.share_att_key = getattr(config, 'share_att_key', False)
+        self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else []
+        self.relative_attention = getattr(config, 'relative_attention', False)
+
+        if self.relative_attention:
+            self.position_buckets = getattr(config, 'position_buckets', -1)
+            # A value below 1 means "not set": fall back to the maximum
+            # absolute position count.
+            self.max_relative_positions = getattr(config,
+                                                  'max_relative_positions', -1)
+            if self.max_relative_positions < 1:
+                self.max_relative_positions = config.max_position_embeddings
+            # Half-width of the relative position table; position_buckets,
+            # when positive, overrides it.
+            self.pos_ebd_size = self.max_relative_positions
+            if self.position_buckets > 0:
+                self.pos_ebd_size = self.position_buckets
+
+            self.pos_dropout = StableDropout(config.hidden_dropout_prob)
+
+            # Dedicated position projections are only needed when the content
+            # projections are not shared with the position embeddings.
+            if not self.share_att_key:
+                if 'c2p' in self.pos_att_type:
+                    self.pos_key_proj = nn.Linear(
+                        config.hidden_size, self.all_head_size, bias=True)
+                if 'p2c' in self.pos_att_type:
+                    self.pos_query_proj = nn.Linear(config.hidden_size,
+                                                    self.all_head_size)
+
+        self.dropout = StableDropout(config.attention_probs_dropout_prob)
+
+    def transpose_for_scores(self, x, attention_heads):
+        """Split the last dim of `x` into `attention_heads` heads and fold the heads into dim 0.
+
+        For a 3-D input the result has shape [x.size(0) * attention_heads, x.size(1), head_size].
+        """
+        new_x_shape = x.size()[:-1] + (attention_heads, -1)
+        x = x.view(new_x_shape)
+        return x.permute(0, 2, 1, 3).contiguous().view(-1, x.size(1),
+                                                       x.size(-1))
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        output_attentions=False,
+        query_states=None,
+        relative_pos=None,
+        rel_embeddings=None,
+    ):
+        """
+        Call the module
+
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                Input states to the module usually the output from previous layer, it will be the Q,K and V in
+                *Attention(Q,K,V)*
+
+            attention_mask (`torch.ByteTensor`):
+                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
+                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
+                th token.
+
+            output_attentions (`bool`, optional):
+                Whether return the attention matrix.
+
+            query_states (`torch.FloatTensor`, optional):
+                The *Q* state in *Attention(Q,K,V)*.
+
+            relative_pos (`torch.LongTensor`):
+                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
+                values ranging in [*-max_relative_positions*, *max_relative_positions*].
+
+            rel_embeddings (`torch.FloatTensor`):
+                The embedding of relative distances. It's a tensor of shape [\\(2 \\times
+                \\text{max_relative_positions}\\), *hidden_size*].
+
+        Returns:
+            `context_layer`, or the tuple `(context_layer, attention_probs)` when `output_attentions` is truthy.
+
+        """
+        if query_states is None:
+            query_states = hidden_states
+        query_layer = self.transpose_for_scores(
+            self.query_proj(query_states), self.num_attention_heads)
+        key_layer = self.transpose_for_scores(
+            self.key_proj(hidden_states), self.num_attention_heads)
+        value_layer = self.transpose_for_scores(
+            self.value_proj(hidden_states), self.num_attention_heads)
+
+        rel_att = None
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        # Each enabled position term adds one to the factor under the square root.
+        scale_factor = 1
+        if 'c2p' in self.pos_att_type:
+            scale_factor += 1
+        if 'p2c' in self.pos_att_type:
+            scale_factor += 1
+        scale = torch.sqrt(
+            torch.tensor(query_layer.size(-1), dtype=torch.float)
+            * scale_factor)
+        # `scale` is already a tensor: cast it with `.to()` rather than
+        # re-wrapping it in `torch.tensor(...)`, which warns about
+        # copy-constructing from a tensor on every call.
+        attention_scores = torch.bmm(query_layer, key_layer.transpose(
+            -1, -2)) / scale.to(dtype=query_layer.dtype)
+        if self.relative_attention:
+            rel_embeddings = self.pos_dropout(rel_embeddings)
+            rel_att = self.disentangled_attention_bias(query_layer, key_layer,
+                                                       relative_pos,
+                                                       rel_embeddings,
+                                                       scale_factor)
+
+        if rel_att is not None:
+            attention_scores = attention_scores + rel_att
+        # Unfold the heads out of dim 0 again so the mask can be applied.
+        attention_scores = attention_scores.view(-1, self.num_attention_heads,
+                                                 attention_scores.size(-2),
+                                                 attention_scores.size(-1))
+
+        # bsz x height x length x dimension
+        attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
+        attention_probs = self.dropout(attention_probs)
+        context_layer = torch.bmm(
+            attention_probs.view(-1, attention_probs.size(-2),
+                                 attention_probs.size(-1)), value_layer)
+        # Move the head dim next to the feature dim, then merge the two.
+        context_layer = (
+            context_layer.view(-1, self.num_attention_heads,
+                               context_layer.size(-2),
+                               context_layer.size(-1)).permute(0, 2, 1,
+                                                               3).contiguous())
+        new_context_layer_shape = context_layer.size()[:-2] + (-1, )
+        context_layer = context_layer.view(new_context_layer_shape)
+        if output_attentions:
+            return (context_layer, attention_probs)
+        else:
+            return context_layer
+
+    def disentangled_attention_bias(self, query_layer, key_layer, relative_pos,
+                                    rel_embeddings, scale_factor):
+        """
+        Compute the position-dependent part of the attention scores.
+
+        Args:
+            query_layer: query tensor with the heads folded into dim 0 (see `transpose_for_scores`).
+            key_layer: key tensor in the same layout.
+            relative_pos: relative position ids with 2, 3 or 4 dims, or `None` to build them here.
+            rel_embeddings: relative position embedding table; its first `2 * pos_ebd_size` rows are used.
+            scale_factor: the factor multiplied with the head size under the square root when scaling.
+
+        Returns:
+            The sum of the enabled 'c2p' and 'p2c' score tensors, or the integer 0 when neither is enabled.
+
+        Raises:
+            ValueError: if `relative_pos` does not have 2, 3 or 4 dims.
+        """
+        if relative_pos is None:
+            q = query_layer.size(-2)
+            relative_pos = build_relative_position(
+                q,
+                key_layer.size(-2),
+                bucket_size=self.position_buckets,
+                max_position=self.max_relative_positions)
+        if relative_pos.dim() == 2:
+            relative_pos = relative_pos.unsqueeze(0).unsqueeze(0)
+        elif relative_pos.dim() == 3:
+            relative_pos = relative_pos.unsqueeze(1)
+        # bsz x height x query x key
+        elif relative_pos.dim() != 4:
+            raise ValueError(
+                f'Relative position ids must be of dim 2 or 3 or 4. {relative_pos.dim()}'
+            )
+
+        att_span = self.pos_ebd_size
+        relative_pos = relative_pos.long().to(query_layer.device)
+
+        rel_embeddings = rel_embeddings[0:att_span * 2, :].unsqueeze(0)
+        # Project the position embeddings per head and tile them so dim 0
+        # matches the folded (batch * heads) dim of the query.
+        if self.share_att_key:
+            pos_query_layer = self.transpose_for_scores(
+                self.query_proj(rel_embeddings),
+                self.num_attention_heads).repeat(
+                    query_layer.size(0) // self.num_attention_heads, 1, 1)
+            pos_key_layer = self.transpose_for_scores(
+                self.key_proj(rel_embeddings),
+                self.num_attention_heads).repeat(
+                    query_layer.size(0) // self.num_attention_heads, 1, 1)
+        else:
+            if 'c2p' in self.pos_att_type:
+                pos_key_layer = self.transpose_for_scores(
+                    self.pos_key_proj(rel_embeddings),
+                    self.num_attention_heads).repeat(
+                        query_layer.size(0) // self.num_attention_heads, 1,
+                        1)  # .split(self.all_head_size, dim=-1)
+            if 'p2c' in self.pos_att_type:
+                pos_query_layer = self.transpose_for_scores(
+                    self.pos_query_proj(rel_embeddings),
+                    self.num_attention_heads).repeat(
+                        query_layer.size(0) // self.num_attention_heads, 1,
+                        1)  # .split(self.all_head_size, dim=-1)
+
+        score = 0
+        # content->position
+        if 'c2p' in self.pos_att_type:
+            scale = torch.sqrt(
+                torch.tensor(pos_key_layer.size(-1), dtype=torch.float)
+                * scale_factor)
+            c2p_att = torch.bmm(query_layer, pos_key_layer.transpose(-1, -2))
+            # Shift relative positions into valid table indices
+            # [0, 2 * att_span - 1].
+            c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1)
+            c2p_att = torch.gather(
+                c2p_att,
+                dim=-1,
+                index=c2p_pos.squeeze(0).expand([
+                    query_layer.size(0),
+                    query_layer.size(1),
+                    relative_pos.size(-1)
+                ]),
+            )
+            # `scale` is a tensor already; `.to()` avoids the copy-construct
+            # warning of `torch.tensor(tensor)`.
+            score += c2p_att / scale.to(dtype=c2p_att.dtype)
+
+        # position->content
+        if 'p2c' in self.pos_att_type:
+            scale = torch.sqrt(
+                torch.tensor(pos_query_layer.size(-1), dtype=torch.float)
+                * scale_factor)
+            # When query and key lengths differ, rebuild key-to-key relative
+            # positions for this term.
+            if key_layer.size(-2) != query_layer.size(-2):
+                r_pos = build_relative_position(
+                    key_layer.size(-2),
+                    key_layer.size(-2),
+                    bucket_size=self.position_buckets,
+                    max_position=self.max_relative_positions,
+                ).to(query_layer.device)
+                r_pos = r_pos.unsqueeze(0)
+            else:
+                r_pos = relative_pos
+
+            # The sign is flipped relative to the c2p term before shifting
+            # into table indices.
+            p2c_pos = torch.clamp(-r_pos + att_span, 0, att_span * 2 - 1)
+            p2c_att = torch.bmm(key_layer, pos_query_layer.transpose(-1, -2))
+            p2c_att = torch.gather(
+                p2c_att,
+                dim=-1,
+                index=p2c_pos.squeeze(0).expand([
+                    query_layer.size(0),
+                    key_layer.size(-2),
+                    key_layer.size(-2)
+                ]),
+            ).transpose(-1, -2)
+            score += p2c_att / scale.to(dtype=p2c_att.dtype)
+
+        return score
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaEmbeddings with DebertaLayerNorm->LayerNorm
+class DebertaV2Embeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings.
+
+    Config attributes read: vocab_size, hidden_size, max_position_embeddings, type_vocab_size,
+    layer_norm_eps, hidden_dropout_prob, plus the optional pad_token_id (default 0),
+    embedding_size (default hidden_size) and position_biased_input (default True).
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        pad_token_id = getattr(config, 'pad_token_id', 0)
+        self.embedding_size = getattr(config, 'embedding_size',
+                                      config.hidden_size)
+        self.word_embeddings = nn.Embedding(
+            config.vocab_size, self.embedding_size, padding_idx=pad_token_id)
+
+        # Absolute position embeddings exist only when position_biased_input
+        # is enabled.
+        self.position_biased_input = getattr(config, 'position_biased_input',
+                                             True)
+        if not self.position_biased_input:
+            self.position_embeddings = None
+        else:
+            self.position_embeddings = nn.Embedding(
+                config.max_position_embeddings, self.embedding_size)
+
+        if config.type_vocab_size > 0:
+            self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
+                                                      self.embedding_size)
+
+        # Project up to hidden_size when the embedding width differs from it.
+        if self.embedding_size != config.hidden_size:
+            self.embed_proj = nn.Linear(
+                self.embedding_size, config.hidden_size, bias=False)
+        self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
+        self.dropout = StableDropout(config.hidden_dropout_prob)
+        self.config = config
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            'position_ids',
+            torch.arange(config.max_position_embeddings).expand((1, -1)))
+
+    def forward(self,
+                input_ids=None,
+                token_type_ids=None,
+                position_ids=None,
+                mask=None,
+                inputs_embeds=None):
+        """
+        Build the input embeddings.
+
+        Args:
+            input_ids: token ids; used for the word embedding lookup when `inputs_embeds` is None.
+            token_type_ids: segment ids; default to zeros of the input shape.
+            position_ids: position ids; default to the first `seq_length` entries of the registered buffer.
+            mask: optional mask multiplied into the normalized embeddings (a 4-D mask is squeezed first).
+            inputs_embeds: precomputed embeddings used instead of looking up `input_ids`. The tensor
+                passed in is not modified.
+
+        Returns:
+            The embeddings after summation, optional projection, layer norm, masking and dropout.
+        """
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, :seq_length]
+
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(
+                input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        # All additions below are out-of-place. The previous `+=` wrote into
+        # `inputs_embeds` itself, which silently changed a tensor supplied by
+        # the caller and fails on a leaf tensor that requires grad.
+        embeddings = inputs_embeds
+        if self.position_biased_input:
+            if self.position_embeddings is not None:
+                position_embeddings = self.position_embeddings(
+                    position_ids.long())
+            else:
+                position_embeddings = torch.zeros_like(inputs_embeds)
+            embeddings = embeddings + position_embeddings
+        if self.config.type_vocab_size > 0:
+            token_type_embeddings = self.token_type_embeddings(token_type_ids)
+            embeddings = embeddings + token_type_embeddings
+
+        if self.embedding_size != self.config.hidden_size:
+            embeddings = self.embed_proj(embeddings)
+
+        embeddings = self.LayerNorm(embeddings)
+
+        if mask is not None:
+            # Bring the mask to the same rank as the embeddings so it
+            # broadcasts over the feature dimension.
+            if mask.dim() != embeddings.dim():
+                if mask.dim() == 4:
+                    mask = mask.squeeze(1).squeeze(1)
+                mask = mask.unsqueeze(2)
+            mask = mask.to(embeddings.dtype)
+
+            embeddings = embeddings * mask
+
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaPreTrainedModel with Deberta->DebertaV2
+class DebertaV2PreTrainedModel(PreTrainedModel):
+    """
+    Abstract base class that provides weight initialization and hooks into the pretrained-model
+    loading interface of `PreTrainedModel`.
+    """
+
+    config_class = DebertaV2Config
+    base_model_prefix = 'deberta'
+    _keys_to_ignore_on_load_missing = ['position_ids']
+    _keys_to_ignore_on_load_unexpected = ['position_embeddings']
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights of `nn.Linear` and `nn.Embedding` modules; other modules are left alone."""
+        if not isinstance(module, (nn.Linear, nn.Embedding)):
+            return
+
+        # Slightly different from the TF version which uses truncated_normal for initialization
+        # cf https://github.com/pytorch/pytorch/pull/5617
+        module.weight.data.normal_(
+            mean=0.0, std=self.config.initializer_range)
+
+        if isinstance(module, nn.Linear):
+            # Linear layers additionally get a zero bias, when they have one.
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif module.padding_idx is not None:
+            # Embeddings keep the padding row at zero.
+            module.weight.data[module.padding_idx].zero_()
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        """Toggle gradient checkpointing on `DebertaV2Encoder` modules only."""
+        if not isinstance(module, DebertaV2Encoder):
+            return
+        module.gradient_checkpointing = value
+
+
+# Shared model description passed to the `add_start_docstrings` decorator on
+# the model class below. This is a runtime string: its text ends up in the
+# generated class documentation.
+DEBERTA_START_DOCSTRING = r"""
+    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
+    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build
+    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
+    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+
+    Parameters:
+        config ([`DebertaV2Config`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+# Template for the `forward` argument documentation. The `{0}` placeholder is
+# filled with a shape description through `str.format` (e.g.
+# 'batch_size, sequence_length') before being handed to
+# `add_start_docstrings_to_model_forward`.
+DEBERTA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`DebertaV2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    'The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.',
+    DEBERTA_START_DOCSTRING,
+)
+# Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2
+class DebertaV2Model(DebertaV2PreTrainedModel):
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.embeddings = DebertaV2Embeddings(config)
+        self.encoder = DebertaV2Encoder(config)
+        self.z_steps = 0
+        self.config = config
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, new_embeddings):
+        self.embeddings.word_embeddings = new_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        raise NotImplementedError(
+            'The prune function is not implemented in DeBERTa model.')
+
+    @add_start_docstrings_to_model_forward(
+        DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=BaseModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        # Runs the embeddings followed by the encoder and returns the final
+        # hidden states, either as a plain tuple (`return_dict` falsy) or as
+        # a `BaseModelOutput`.
+        # Raises ValueError when both, or neither, of `input_ids` and
+        # `inputs_embeds` are supplied.
+
+        # Flags passed as None fall back to the values stored on the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else
+            self.config.output_hidden_states)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Exactly one of `input_ids` / `inputs_embeds` must be given; the
+        # input shape is taken from whichever one is present (for
+        # `inputs_embeds` the last dimension is dropped).
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                'You cannot specify both input_ids and inputs_embeds at the same time'
+            )
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError(
+                'You have to specify either input_ids or inputs_embeds')
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        # Missing masks / segment ids default to all-ones / all-zeros tensors
+        # of the input shape, created on the same device as the input.
+        if attention_mask is None:
+            attention_mask = torch.ones(input_shape, device=device)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(
+                input_shape, dtype=torch.long, device=device)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+        )
+
+        # Hidden states are always requested from the encoder, regardless of
+        # the caller's `output_hidden_states`, because index 1 of the result
+        # is read below as the per-layer states.
+        # NOTE(review): that index relies on the ordering of
+        # DebertaV2Encoder's outputs, which is defined outside this block -
+        # confirm.
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask,
+            output_hidden_states=True,
+            output_attentions=output_attentions,
+            return_dict=return_dict,
+        )
+        encoded_layers = encoder_outputs[1]
+
+        # Optional extra passes through the last encoder layer. `__init__`
+        # sets `self.z_steps = 0`, so this branch only runs if `z_steps` is
+        # changed afterwards. The loop iterates over `layers[1:]`, i.e.
+        # `z_steps - 1` times.
+        # NOTE(review): `encoded_layers.append` needs a list; if the encoder
+        # returns its hidden states as a tuple this call would raise -
+        # confirm against DebertaV2Encoder before enabling `z_steps > 1`.
+        if self.z_steps > 1:
+            hidden_states = encoded_layers[-2]
+            layers = [self.encoder.layer[-1] for _ in range(self.z_steps)]
+            query_states = encoded_layers[-1]
+            rel_embeddings = self.encoder.get_rel_embedding()
+            attention_mask = self.encoder.get_attention_mask(attention_mask)
+            rel_pos = self.encoder.get_rel_pos(embedding_output)
+            for layer in layers[1:]:
+                query_states = layer(
+                    hidden_states,
+                    attention_mask,
+                    output_attentions=False,
+                    query_states=query_states,
+                    relative_pos=rel_pos,
+                    rel_embeddings=rel_embeddings,
+                )
+                encoded_layers.append(query_states)
+
+        sequence_output = encoded_layers[-1]
+
+        # Tuple output: the slice starts at index 2 instead of 1 when the
+        # caller did not ask for hidden states, which leaves the always
+        # requested hidden states out of the result.
+        if not return_dict:
+            return (sequence_output, ) + encoder_outputs[
+                (1 if output_hidden_states else 2):]
+
+        # Structured output: hidden states are exposed only on request.
+        return BaseModelOutput(
+            last_hidden_state=sequence_output,
+            hidden_states=encoder_outputs.hidden_states
+            if output_hidden_states else None,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """DeBERTa Model with a `language modeling` head on top.""",
+    DEBERTA_START_DOCSTRING)
+# Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM with Deberta->DebertaV2
+class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel):
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+    _keys_to_ignore_on_load_missing = [
+        r'position_ids',
+        r'predictions.decoder.bias',
+    ]
+
+    def __init__(self, config):
+        """Create the `DebertaV2Model` backbone and the MLM head from `config`."""
+        super().__init__(config)
+        self.deberta = DebertaV2Model(config)
+        self.cls = DebertaV2OnlyMLMHead(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        """Return the `decoder` attribute of the MLM prediction head."""
+        prediction_head = self.cls.predictions
+        return prediction_head.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        """Store `new_embeddings` as the `decoder` of the MLM prediction head."""
+        prediction_head = self.cls.predictions
+        prediction_head.decoder = new_embeddings
+
+    @add_start_docstrings_to_model_forward(
+        DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=MaskedLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, MaskedLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Targets for the masked language modeling loss. Each index should lie in `[-100, 0, ...,
+            config.vocab_size]` (see the `input_ids` docstring). Positions labelled `-100` are ignored (masked); the
+            loss is computed only over positions whose label is in `[0, ..., config.vocab_size]`.
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        outputs = self.deberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # The first backbone output is fed to the MLM head.
+        prediction_scores = self.cls(outputs[0])
+
+        if labels is None:
+            masked_lm_loss = None
+        else:
+            # CrossEntropyLoss skips targets equal to -100 (its default
+            # ignore_index), which is how masked-out positions are excluded.
+            flat_scores = prediction_scores.view(-1, self.config.vocab_size)
+            masked_lm_loss = CrossEntropyLoss()(flat_scores, labels.view(-1))
+
+        if return_dict:
+            return MaskedLMOutput(
+                loss=masked_lm_loss,
+                logits=prediction_scores,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            )
+
+        # Tuple output: the loss is prepended only when it was computed.
+        output = (prediction_scores, ) + outputs[1:]
+        if masked_lm_loss is None:
+            return output
+        return (masked_lm_loss, ) + output
+
+
+# copied from transformers.models.bert.BertPredictionHeadTransform with bert -> deberta
+class DebertaV2PredictionHeadTransform(nn.Module):
+    """Applies a dense layer, an activation and a LayerNorm, in that order."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        self.dense = nn.Linear(width, width)
+        # A string is looked up in ACT2FN; any other value is used as the
+        # activation as-is.
+        activation = config.hidden_act
+        self.transform_act_fn = (
+            ACT2FN[activation] if isinstance(activation, str) else activation)
+        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states):
+        projected = self.dense(hidden_states)
+        activated = self.transform_act_fn(projected)
+        return self.LayerNorm(activated)
+
+
+# copied from transformers.models.bert.BertLMPredictionHead with bert -> deberta
+class DebertaV2LMPredictionHead(nn.Module):
+    """Head transform followed by a linear projection to `config.vocab_size`."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.transform = DebertaV2PredictionHeadTransform(config)
+
+        vocab_size = config.vocab_size
+        # The projection is created without a bias of its own; a separate
+        # per-token bias parameter is defined next and attached to it.
+        # (Upstream describes the projection weights as shared with the input
+        # embeddings; no such tying is done in this class.)
+        self.decoder = nn.Linear(config.hidden_size, vocab_size, bias=False)
+        self.bias = nn.Parameter(torch.zeros(vocab_size))
+
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        transformed = self.transform(hidden_states)
+        return self.decoder(transformed)
+
+
+# copied from transformers.models.bert.BertOnlyMLMHead with bert -> deberta
+class DebertaV2OnlyMLMHead(nn.Module):
+    """Thin wrapper exposing a `DebertaV2LMPredictionHead` as `predictions`."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = DebertaV2LMPredictionHead(config)
+
+    def forward(self, sequence_output):
+        # Delegates directly to the wrapped prediction head.
+        return self.predictions(sequence_output)
+
+
+@add_start_docstrings(
+    """
+    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+    pooled output) e.g. for GLUE tasks.
+    """,
+    DEBERTA_START_DOCSTRING,
+)
+# Copied from transformers.models.deberta.modeling_deberta.DebertaForSequenceClassification with Deberta->DebertaV2
+# (local change: labels are flattened before the index-based gather, see `forward`)
+class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel):
+
+    def __init__(self, config):
+        """Create the backbone, the pooler, the classifier and the dropout from `config`."""
+        super().__init__(config)
+
+        # `num_labels` falls back to 2 when the config does not define it.
+        num_labels = getattr(config, 'num_labels', 2)
+        self.num_labels = num_labels
+
+        self.deberta = DebertaV2Model(config)
+        self.pooler = ContextPooler(config)
+        output_dim = self.pooler.output_dim
+
+        self.classifier = nn.Linear(output_dim, num_labels)
+        # `cls_dropout` takes precedence; `hidden_dropout_prob` is used when
+        # it is absent or None.
+        drop_out = getattr(config, 'cls_dropout', None)
+        drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
+        self.dropout = StableDropout(drop_out)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the input embeddings of the wrapped `DebertaV2Model`."""
+        return self.deberta.get_input_embeddings()
+
+    def set_input_embeddings(self, new_embeddings):
+        """Forward `new_embeddings` to the wrapped `DebertaV2Model`."""
+        self.deberta.set_input_embeddings(new_embeddings)
+
+    @add_start_docstrings_to_model_forward(
+        DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=SequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.deberta(
+            input_ids,
+            token_type_ids=token_type_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Pool the first backbone output, apply dropout, then classify.
+        encoder_layer = outputs[0]
+        pooled_output = self.pooler(encoder_layer)
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                # No explicit problem type: pick the loss from `num_labels`
+                # and from the shape of `labels`.
+                if self.num_labels == 1:
+                    # regression task
+                    loss_fn = nn.MSELoss()
+                    logits = logits.view(-1).to(labels.dtype)
+                    loss = loss_fn(logits, labels.view(-1))
+                elif labels.dim() == 1 or labels.size(-1) == 1:
+                    # Class-index labels. Flatten first: with
+                    # `(batch_size, 1)` labels `nonzero()` would return
+                    # two-column indices, which the `expand` / `gather`
+                    # calls below cannot handle. For labels that are
+                    # already 1-D this is a no-op.
+                    labels = labels.reshape(-1)
+                    # Only samples with a non-negative label contribute.
+                    label_index = (labels >= 0).nonzero()
+                    labels = labels.long()
+                    if label_index.size(0) > 0:
+                        # Select the logits rows and the labels of the
+                        # labelled samples.
+                        labeled_logits = torch.gather(
+                            logits, 0,
+                            label_index.expand(
+                                label_index.size(0), logits.size(1)))
+                        labels = torch.gather(labels, 0, label_index.view(-1))
+                        loss_fct = CrossEntropyLoss()
+                        loss = loss_fct(
+                            labeled_logits.view(-1, self.num_labels).float(),
+                            labels.view(-1))
+                    else:
+                        # No labelled sample: zero loss with the dtype and
+                        # device of `logits`.
+                        loss = torch.tensor(0).to(logits)
+                else:
+                    # Any other label shape: labels weight the
+                    # log-probabilities of each class.
+                    log_softmax = nn.LogSoftmax(-1)
+                    loss = -((log_softmax(logits) * labels).sum(-1)).mean()
+            elif self.config.problem_type == 'regression':
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == 'single_label_classification':
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(
+                    logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == 'multi_label_classification':
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+            # Any other `problem_type` value leaves `loss` as None.
+        if not return_dict:
+            # Tuple output: the loss is prepended only when it was computed.
+            output = (logits, ) + outputs[1:]
+            return ((loss, ) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions)
+
+
+@add_start_docstrings(
+    """
+    DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    Named-Entity-Recognition (NER) tasks.
+    """,
+    DEBERTA_START_DOCSTRING,
+)
+# Copied from transformers.models.deberta.modeling_deberta.DebertaForTokenClassification with Deberta->DebertaV2
+class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel):
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+
+    def __init__(self, config):
+        """Create the backbone, the dropout and the per-token classifier from `config`."""
+        super().__init__(config)
+        label_count = config.num_labels
+        self.num_labels = label_count
+
+        self.deberta = DebertaV2Model(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, label_count)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(
+        DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, TokenClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Targets for the token classification loss. Each index should be in `[0, ..., config.num_labels - 1]`.
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        outputs = self.deberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Dropout on the first backbone output, then one score per label for
+        # every position.
+        token_states = self.dropout(outputs[0])
+        logits = self.classifier(token_states)
+
+        if labels is None:
+            loss = None
+        else:
+            flat_logits = logits.view(-1, self.num_labels)
+            loss = CrossEntropyLoss()(flat_logits, labels.view(-1))
+
+        if return_dict:
+            return TokenClassifierOutput(
+                loss=loss,
+                logits=logits,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions)
+
+        # Tuple output: the loss is prepended only when it was computed.
+        output = (logits, ) + outputs[1:]
+        if loss is None:
+            return output
+        return (loss, ) + output
+
+
+@add_start_docstrings(
+    """
+    DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    DEBERTA_START_DOCSTRING,
+)
+# Copied from transformers.models.deberta.modeling_deberta.DebertaForQuestionAnswering with Deberta->DebertaV2
+class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel):
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+
+    def __init__(self, config):
+        """Create the backbone and the linear span-logit layer from `config`."""
+        super().__init__(config)
+        label_count = config.num_labels
+        self.num_labels = label_count
+
+        self.deberta = DebertaV2Model(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, label_count)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(
+        DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=QuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        start_positions: Optional[torch.Tensor] = None,
+        end_positions: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
+        r"""
+        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Index of the first token of the labelled span, used to compute the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
+            sequence are not taken into account for computing the loss.
+        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Index of the last token of the labelled span, used to compute the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
+            sequence are not taken into account for computing the loss.
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        outputs = self.deberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # The last dimension of the projection is split into width-1 slices,
+        # which are unpacked as the start and the end logits.
+        logits = self.qa_outputs(outputs[0])
+        start_slice, end_slice = logits.split(1, dim=-1)
+        start_logits = start_slice.squeeze(-1).contiguous()
+        end_logits = end_slice.squeeze(-1).contiguous()
+
+        total_loss = None
+        if not (start_positions is None or end_positions is None):
+            # If we are on multi-GPU, split add a dimension
+            if start_positions.dim() > 1:
+                start_positions = start_positions.squeeze(-1)
+            if end_positions.dim() > 1:
+                end_positions = end_positions.squeeze(-1)
+            # Positions beyond the model input are clamped onto
+            # `ignored_index`, which the loss is configured to skip.
+            ignored_index = start_logits.size(1)
+            clamped_start = start_positions.clamp(0, ignored_index)
+            clamped_end = end_positions.clamp(0, ignored_index)
+
+            span_loss = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = span_loss(start_logits, clamped_start)
+            end_loss = span_loss(end_logits, clamped_end)
+            total_loss = (start_loss + end_loss) / 2
+
+        if return_dict:
+            return QuestionAnsweringModelOutput(
+                loss=total_loss,
+                start_logits=start_logits,
+                end_logits=end_logits,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            )
+
+        # Tuple output: the loss is prepended only when it was computed.
+        output = (start_logits, end_logits) + outputs[1:]
+        if total_loss is None:
+            return output
+        return (total_loss, ) + output
+
+
+@add_start_docstrings(
+    """
+    DeBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+    softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    DEBERTA_START_DOCSTRING,
+)
+class DebertaV2ForMultipleChoice(DebertaV2PreTrainedModel):
+
+    def __init__(self, config):
+        """Create the backbone, the pooler, the one-score classifier and the dropout from `config`."""
+        super().__init__(config)
+
+        # `num_labels` falls back to 2 when the config does not define it.
+        num_labels = getattr(config, 'num_labels', 2)
+        self.num_labels = num_labels
+
+        self.deberta = DebertaV2Model(config)
+        self.pooler = ContextPooler(config)
+        output_dim = self.pooler.output_dim
+
+        # One score per (sample, choice) pair; `forward` regroups the scores
+        # per sample.
+        self.classifier = nn.Linear(output_dim, 1)
+        # `cls_dropout` takes precedence; `hidden_dropout_prob` is used when
+        # it is absent or None.
+        drop_out = getattr(config, 'cls_dropout', None)
+        drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
+        self.dropout = StableDropout(drop_out)
+
+        # Initialize weights and apply final processing (same entry point as
+        # the other task heads in this file).
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the input embeddings of the wrapped `DebertaV2Model`."""
+        return self.deberta.get_input_embeddings()
+
+    def set_input_embeddings(self, new_embeddings):
+        """Forward `new_embeddings` to the wrapped `DebertaV2Model`."""
+        self.deberta.set_input_embeddings(new_embeddings)
+
+    @add_start_docstrings_to_model_forward(
+        DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=MultipleChoiceModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, MultipleChoiceModelOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Fail with the same error as DebertaV2Model.forward instead of an
+        # AttributeError on None when no input is given.
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError(
+                'You have to specify either input_ids or inputs_embeds')
+        num_choices = input_ids.shape[
+            1] if input_ids is not None else inputs_embeds.shape[1]
+
+        def _merge_choice_dim(tensor, kept_dims=1):
+            # Collapse all leading dimensions into one, keeping the last
+            # `kept_dims` dimensions unchanged; None passes through.
+            if tensor is None:
+                return None
+            return tensor.view(-1, *tensor.shape[-kept_dims:])
+
+        # Every choice is presented to the backbone as a separate sequence.
+        outputs = self.deberta(
+            _merge_choice_dim(input_ids),
+            position_ids=_merge_choice_dim(position_ids),
+            token_type_ids=_merge_choice_dim(token_type_ids),
+            attention_mask=_merge_choice_dim(attention_mask),
+            inputs_embeds=_merge_choice_dim(inputs_embeds, kept_dims=2),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        encoder_layer = outputs[0]
+        pooled_output = self.pooler(encoder_layer)
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        # Regroup the per-choice scores into rows of `num_choices` scores.
+        reshaped_logits = logits.view(-1, num_choices)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+
+        if not return_dict:
+            # Tuple output: the loss is prepended only when it was computed.
+            output = (reshaped_logits, ) + outputs[1:]
+            return ((loss, ) + output) if loss is not None else output
+
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
diff --git a/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py b/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py
new file mode 100644
index 00000000..adb60288
--- /dev/null
+++ b/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py
@@ -0,0 +1,546 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2020 Microsoft and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for DeBERTa. mainly copied from :module:`~transformers.tokenization_deberta`"""
+
+import os
+import unicodedata
+from typing import Any, Dict, List, Optional, Tuple
+
+import sentencepiece as sp
+from transformers.tokenization_utils import PreTrainedTokenizer
+
+PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
+
+PRETRAINED_INIT_CONFIGURATION = {}
+
+VOCAB_FILES_NAMES = {'vocab_file': 'spm.model'}
+
+
+class DebertaV2Tokenizer(PreTrainedTokenizer):
+    r"""
+    Constructs a DeBERTa-v2 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece)
+    and [jieba](https://github.com/fxsjy/jieba).
+
+    Args:
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        do_lower_case (`bool`, *optional*, defaults to `False`):
+            Whether or not to lowercase the input when tokenizing.
+        bos_token (`string`, *optional*, defaults to `"[CLS]"`):
+            The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+        eos_token (`string`, *optional*, defaults to `"[SEP]"`):
+            The end of sequence token. When building a sequence using special tokens, this is not the token that is
+            used for the end of sequence. The token used is the `sep_token`.
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self,
+                 vocab_file,
+                 do_lower_case=False,
+                 split_by_punct=False,
+                 split_chinese=True,
+                 bos_token='[CLS]',
+                 eos_token='[SEP]',
+                 unk_token='[UNK]',
+                 sep_token='[SEP]',
+                 pad_token='[PAD]',
+                 cls_token='[CLS]',
+                 mask_token='[MASK]',
+                 sp_model_kwargs: Optional[Dict[str, Any]] = None,
+                 **kwargs) -> None:
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+        super().__init__(
+            do_lower_case=do_lower_case,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            split_by_punct=split_by_punct,
+            split_chinese=split_chinese,
+            sp_model_kwargs=self.sp_model_kwargs,
+            **kwargs,
+        )
+
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+                ' model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`'
+            )
+        self.do_lower_case = do_lower_case
+        self.split_by_punct = split_by_punct
+        self.split_chinese = split_chinese
+        self.vocab_file = vocab_file
+        self._tokenizer = SPMTokenizer(
+            vocab_file,
+            split_by_punct=split_by_punct,
+            sp_model_kwargs=self.sp_model_kwargs)
+        self.jieba = None
+        if self.split_chinese:
+            try:
+                import jieba
+            except ImportError:
+                raise ImportError(
+                    'You need to install jieba to split chinese and use DebertaV2Tokenizer. '
+                    'See https://pypi.org/project/jieba/ for installation.')
+            self.jieba = jieba
+
+    @property
+    def vocab_size(self):
+        return len(self.vocab)
+
+    @property
+    def vocab(self):
+        return self._tokenizer.vocab
+
+    def get_vocab(self):
+        vocab = self.vocab.copy()
+        vocab.update(self.get_added_vocab())
+        return vocab
+
+    def _tokenize(self, text: str) -> List[str]:
+        """Split `text` into SentencePiece tokens (words/sub-words)."""
+        normalized = text.lower() if self.do_lower_case else text
+        if self.split_chinese:
+            # Pre-segment with jieba and re-join the segments with single
+            # spaces before handing the text to the SentencePiece tokenizer.
+            normalized = ' '.join(self.jieba.cut(normalized))
+        return self._tokenizer.tokenize(normalized)
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self._tokenizer.spm.PieceToId(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self._tokenizer.spm.IdToPiece(
+            index) if index < self.vocab_size else self.unk_token
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        return self._tokenizer.decode(tokens)
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A DeBERTa sequence has the following format:
+
+        - single sequence: [CLS] X [SEP]
+        - pair of sequences: [CLS] A [SEP] B [SEP]
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(self,
+                                token_ids_0,
+                                token_ids_1=None,
+                                already_has_special_tokens=False):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0,
+                token_ids_1=token_ids_1,
+                already_has_special_tokens=True)
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + (
+                [0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    def create_token_type_ids_from_sequences(self,
+                                             token_ids_0,
+                                             token_ids_1=None):
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
+        sequence pair mask has the following format:
+
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1
+                                                        + sep) * [1]
+
+    def prepare_for_tokenization(self,
+                                 text,
+                                 is_split_into_words=False,
+                                 **kwargs):
+        add_prefix_space = kwargs.pop('add_prefix_space', False)
+        if is_split_into_words or add_prefix_space:
+            text = ' ' + text
+        return (text, kwargs)
+
+    def save_vocabulary(self,
+                        save_directory: str,
+                        filename_prefix: Optional[str] = None) -> Tuple[str]:
+        return self._tokenizer.save_pretrained(
+            save_directory, filename_prefix=filename_prefix)
+
+
+class SPMTokenizer:
+    r"""
+    Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece).
+
+    Args:
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+    """
+
+    def __init__(self,
+                 vocab_file,
+                 split_by_punct=False,
+                 sp_model_kwargs: Optional[Dict[str, Any]] = None):
+        """Load the SentencePiece model and build the piece/id lookup tables.
+
+        Raises:
+            FileNotFoundError: If `vocab_file` does not exist.
+        """
+        self.split_by_punct = split_by_punct
+        self.vocab_file = vocab_file
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        # `add_special_token` reads and appends to this list, so it must
+        # exist from construction onwards.
+        self.special_tokens = []
+        spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
+        if not os.path.exists(vocab_file):
+            raise FileNotFoundError(f'{vocab_file} does not exist!')
+        spm.load(vocab_file)
+        bpe_vocab_size = spm.GetPieceSize()
+        # Ids are the raw SentencePiece ids; no offset or remapping is applied.
+        self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
+        self.vocab = {piece: i for i, piece in enumerate(self.ids_to_tokens)}
+        self.spm = spm
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state['spm'] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+
+        # for backward compatibility
+        if not hasattr(self, 'sp_model_kwargs'):
+            self.sp_model_kwargs = {}
+
+        self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.spm.Load(self.vocab_file)
+
+    def tokenize(self, text):
+        return self._encode_as_pieces(text)
+
+    def convert_ids_to_tokens(self, ids):
+        """Return the piece stored in `ids_to_tokens` for each id in `ids`."""
+        # Plain list indexing: an id past the end of the table raises
+        # IndexError rather than mapping to an unknown token.
+        return [self.ids_to_tokens[i] for i in ids]
+
+    def decode(self, tokens, start=-1, end=-1, raw_text=None):
+        if raw_text is None:
+            return self.spm.decode_pieces([t for t in tokens])
+        else:
+            words = self.split_to_words(raw_text)
+            word_tokens = [self.tokenize(w) for w in words]
+            token2words = [0] * len(tokens)
+            tid = 0
+            for i, w in enumerate(word_tokens):
+                for k, t in enumerate(w):
+                    token2words[tid] = i
+                    tid += 1
+            word_start = token2words[start]
+            word_end = token2words[end] if end < len(tokens) else len(words)
+            text = ''.join(words[word_start:word_end])
+            return text
+
+    def add_special_token(self, token):
+        if token not in self.__dict__.setdefault('special_tokens', []):
+            self.special_tokens.append(token)  # list is created on first use
+            if token not in self.vocab:  # id must equal ids_to_tokens index
+                self.vocab[token] = len(self.ids_to_tokens)
+                self.ids_to_tokens.append(token)
+        return self.id(token)
+
+    def part_of_whole_word(self, token, is_bos=False):
+        """Tell whether `token` continues a word (always True if `is_bos`)."""
+        if is_bos:
+            return True
+        first = token[0]
+        if len(token) == 1 and _is_whitespace(first):
+            return False
+        if _is_control(first) or _is_punctuation(first):
+            return False
+        # Membership is tested against the registered special-token list,
+        # which may be absent if no special token was ever added.
+        if token in getattr(self, 'special_tokens', ()):
+            return False
+        return not token.startswith(b'\xe2\x96\x81'.decode('utf-8'))
+
+    def pad(self):
+        return '[PAD]'
+
+    def bos(self):
+        return '[CLS]'
+
+    def eos(self):
+        return '[SEP]'
+
+    def unk(self):
+        return '[UNK]'
+
+    def mask(self):
+        return '[MASK]'
+
+    def sym(self, id):
+        return self.ids_to_tokens[id]
+
+    def id(self, sym):
+        return self.vocab[sym] if sym in self.vocab else 1
+
+    def _encode_as_pieces(self, text):
+        """Encode `text` into a flat list of SentencePiece piece strings."""
+        unicode_text = convert_to_unicode(text)
+        if not self.split_by_punct:
+            return self.spm.encode(unicode_text, out_type=str)
+        # Encode each punctuation-delimited chunk separately, then flatten.
+        chunks = self._run_split_on_punc(unicode_text)
+        return [p for c in chunks for p in self.spm.encode(c, out_type=str)]
+
+    def split_to_words(self, text):
+        pieces = self._encode_as_pieces(text)
+        word_start = b'\xe2\x96\x81'.decode('utf-8')
+        words = []
+        offset = 0
+        prev_end = 0
+        for i, p in enumerate(pieces):
+            if p.startswith(word_start):
+                if offset > prev_end:
+                    words.append(text[prev_end:offset])
+                prev_end = offset
+                w = p.replace(word_start, '')
+            else:
+                w = p
+            try:
+                s = text.index(w, offset)
+                pn = ''
+                k = i + 1
+                while k < len(pieces):
+                    pn = pieces[k].replace(word_start, '')
+                    if len(pn) > 0:
+                        break
+                    k += 1
+
+                if len(pn) > 0 and pn in text[offset:s]:
+                    offset = offset + 1
+                else:
+                    offset = s + len(w)
+            except Exception:
+                offset = offset + 1
+
+        if prev_end < offset:
+            words.append(text[prev_end:offset])
+
+        return words
+
+    def _run_strip_accents(self, text):
+        """Remove accents from `text`.
+
+        The text is NFD-normalized and every character whose Unicode
+        category is 'Mn' is then dropped.
+        """
+        decomposed = unicodedata.normalize('NFD', text)
+        return ''.join(
+            ch for ch in decomposed
+            if unicodedata.category(ch) != 'Mn')
+
+    def _run_split_on_punc(self, text):
+        """Split `text` so that each punctuation character stands alone.
+
+        Runs of non-punctuation characters are kept together; every
+        punctuation character becomes its own one-character chunk.
+        """
+        chunks = []
+        # Set at the start and after punctuation: next plain char opens a chunk.
+        open_new_chunk = True
+        for ch in text:
+            if _is_punctuation(ch):
+                chunks.append(ch)
+                open_new_chunk = True
+            elif open_new_chunk:
+                chunks.append(ch)
+                open_new_chunk = False
+            else:
+                chunks[-1] += ch
+        return chunks
+
+    def save_pretrained(self, path: str, filename_prefix: str = None):
+        filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]]
+        if filename_prefix is not None:
+            filename = filename_prefix + '-' + filename
+        full_path = os.path.join(path, filename)
+        with open(full_path, 'wb') as fs:
+            fs.write(self.spm.serialized_model_proto())
+        return (full_path, )
+
+
+def _is_whitespace(char):
+    """Checks whether `char` is a whitespace character."""
+    # \t, \n, and \r are technically control characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char == ' ' or char == '\t' or char == '\n' or char == '\r':
+        return True
+    cat = unicodedata.category(char)
+    if cat == 'Zs':
+        return True
+    return False
+
+
+def _is_control(char):
+    """Checks whether `char` is a control character."""
+    # These are technically control characters but we count them as whitespace
+    # characters.
+    if char == '\t' or char == '\n' or char == '\r':
+        return False
+    cat = unicodedata.category(char)
+    if cat.startswith('C'):
+        return True
+    return False
+
+
+def _is_punctuation(char):
+    """Checks whether `char` is a punctuation character."""
+    cp = ord(char)
+    # We treat all non-letter/number ASCII as punctuation.
+    # Characters such as "^", "$", and "`" are not in the Unicode
+    # Punctuation class but we treat them as punctuation anyways, for
+    # consistency.
+    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (
+            cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
+        return True
+    cat = unicodedata.category(char)
+    if cat.startswith('P'):
+        return True
+    return False
+
+
+def convert_to_unicode(text):
+    """Return `text` as `str`, decoding `bytes` as UTF-8 (bad bytes dropped)."""
+    if isinstance(text, bytes):
+        # 'ignore' silently discards byte sequences that are not valid UTF-8.
+        return text.decode('utf-8', 'ignore')
+    if not isinstance(text, str):
+        raise ValueError(f'Unsupported string type: {type(text)}')
+    return text
diff --git a/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py b/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py
new file mode 100644
index 00000000..a1fcecf4
--- /dev/null
+++ b/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py
@@ -0,0 +1,241 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2020 Microsoft and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Tokenization class for model DeBERTa."""
+
+import os
+from shutil import copyfile
+from typing import Optional, Tuple
+
+from transformers.file_utils import is_sentencepiece_available
+from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+
+from modelscope.utils import logger as logging
+
+if is_sentencepiece_available():
+    from .tokenization_deberta_v2 import DebertaV2Tokenizer
+else:
+    DebertaV2Tokenizer = None
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+    'vocab_file': 'spm.model',
+    'tokenizer_file': 'tokenizer.json'
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
+
+PRETRAINED_INIT_CONFIGURATION = {}
+
+
+class DebertaV2TokenizerFast(PreTrainedTokenizerFast):
+    r"""
+    Constructs a DeBERTa-v2 fast tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece)
+    and [rjieba-py](https://github.com/messense/rjieba-py).
+
+    Args:
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        do_lower_case (`bool`, *optional*, defaults to `False`):
+            Whether or not to lowercase the input when tokenizing.
+        bos_token (`string`, *optional*, defaults to `"[CLS]"`):
+            The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+        eos_token (`string`, *optional*, defaults to `"[SEP]"`):
+            The end of sequence token. When building a sequence using special tokens, this is not the token that is
+            used for the end of sequence. The token used is the `sep_token`.
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    slow_tokenizer_class = DebertaV2Tokenizer
+
+    def __init__(self,
+                 vocab_file=None,
+                 tokenizer_file=None,
+                 do_lower_case=False,
+                 split_by_punct=False,
+                 split_chinese=True,
+                 bos_token='[CLS]',
+                 eos_token='[SEP]',
+                 unk_token='[UNK]',
+                 sep_token='[SEP]',
+                 pad_token='[PAD]',
+                 cls_token='[CLS]',
+                 mask_token='[MASK]',
+                 **kwargs) -> None:
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            do_lower_case=do_lower_case,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            split_by_punct=split_by_punct,
+            split_chinese=split_chinese,
+            **kwargs,
+        )
+
+        self.do_lower_case = do_lower_case
+        self.split_by_punct = split_by_punct
+        self.split_chinese = split_chinese
+        self.vocab_file = vocab_file
+        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A DeBERTa sequence has the following format:
+
+        - single sequence: [CLS] X [SEP]
+        - pair of sequences: [CLS] A [SEP] B [SEP]
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(self,
+                                token_ids_0,
+                                token_ids_1=None,
+                                already_has_special_tokens=False):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0,
+                token_ids_1=token_ids_1,
+                already_has_special_tokens=True)
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + (
+                [0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    def create_token_type_ids_from_sequences(self,
+                                             token_ids_0,
+                                             token_ids_1=None):
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
+        sequence pair mask has the following format:
+
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1
+                                                        + sep) * [1]
+
+    def save_vocabulary(self,
+                        save_directory: str,
+                        filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not self.can_save_slow_tokenizer:
+            raise ValueError(
+                'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow '
+                'tokenizer.')
+
+        if not os.path.isdir(save_directory):
+            logger.error(
+                f'Vocabulary path ({save_directory}) should be a directory')
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + '-' if filename_prefix else '')
+            + VOCAB_FILES_NAMES['vocab_file'])
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file, )
diff --git a/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py b/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py
index 7cff9ad4..d686ea30 100644
--- a/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py
+++ b/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import Dict
 
 from modelscope.metainfo import Models
@@ -60,5 +61,6 @@ class GPT3ForTextGeneration(TorchModel):
         sample_output = self.model.generate(**gen_params)
         return {
             OutputKeys.TEXT:
-            self.tokenizer.decode(sample_output[0], skip_special_tokens=True)
+            self.tokenizer.decode(sample_output[0],
+                                  skip_special_tokens=True).replace(' ', '')
         }
diff --git a/modelscope/models/nlp/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/modeling_gpt3.py
index f7024713..498d15de 100644
--- a/modelscope/models/nlp/gpt3/modeling_gpt3.py
+++ b/modelscope/models/nlp/gpt3/modeling_gpt3.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
 # Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,9 +17,10 @@ import math
 import os
 from typing import Optional, Union
 
+import addict
 import torch
-from addict import Dict
-from torch.nn import Dropout, Embedding, LayerNorm, Linear, Module, Softmax
+from torch.nn import (CrossEntropyLoss, Dropout, Embedding, LayerNorm, Linear,
+                      Module, Softmax)
 from torch.nn import functional as F
 from transformers.modeling_utils import PreTrainedModel
 
@@ -308,20 +310,25 @@ class GPT3Model(PreTrainedModel):
                 input_ids,
                 attention_mask=None,
                 position_ids=None,
+                labels=None,
                 **kwargs):
         seq_length = input_ids.size(1)
-        if attention_mask is None:
-            attention_mask = torch.tril(
-                torch.ones((1, seq_length, seq_length),
-                           dtype=torch.long,
-                           device=input_ids.device))
+        attention_mask = torch.tril(
+            torch.ones((1, 1, seq_length, seq_length),
+                       dtype=torch.long,
+                       device=input_ids.device))
         if position_ids is None:
             position_ids = torch.arange(
                 seq_length, dtype=torch.long, device=input_ids.device)
             position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
 
         logits = self.language_model(input_ids, attention_mask, position_ids)
-        return Dict(logits=logits)
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(
+                logits.view(-1, self.config.vocab_size), labels.view(-1))
+        return addict.Dict(loss=loss, logits=logits)
 
     @classmethod
     def from_pretrained(
@@ -333,5 +340,9 @@ class GPT3Model(PreTrainedModel):
         state_dict_file = os.path.join(pretrained_model_name_or_path,
                                        ModelFile.TORCH_MODEL_BIN_FILE)
         state_dict = torch.load(state_dict_file)
+        state_dict = {
+            k.replace('model.language_model', 'language_model'): v
+            for k, v in state_dict.items()
+        }
         model.load_state_dict(state_dict)
         return model
diff --git a/modelscope/models/nlp/heads/fill_mask_head.py b/modelscope/models/nlp/heads/fill_mask_head.py
new file mode 100644
index 00000000..6b0c5e05
--- /dev/null
+++ b/modelscope/models/nlp/heads/fill_mask_head.py
@@ -0,0 +1,101 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers.activations import ACT2FN
+
+from modelscope.metainfo import Heads
+from modelscope.models.base import TorchHead
+from modelscope.models.builder import HEADS
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+
+
+@HEADS.register_module(Tasks.fill_mask, module_name=Heads.bert_mlm)
+class BertFillMaskHead(TorchHead):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.cls = BertOnlyMLMHead(self.config)
+
+    def forward(self, sequence_output):
+        prediction_scores = self.cls(sequence_output)
+        return {OutputKeys.LOGITS: prediction_scores}
+
+    def compute_loss(self, outputs: Dict[str, torch.Tensor],
+                     labels) -> Dict[str, torch.Tensor]:
+        # `forward` returns a dict, so unwrap the logits first; a bare logits tensor is still accepted.
+        logits = outputs[OutputKeys.LOGITS] if isinstance(outputs, dict) else outputs
+        loss = CrossEntropyLoss()(logits.view(-1, self.config.vocab_size), labels.view(-1))
+        return {OutputKeys.LOSS: loss}  # CrossEntropyLoss skips labels equal to -100 by default
+
+
+class BertPredictionHeadTransform(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        if isinstance(config.hidden_act, str):
+            self.transform_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.transform_act_fn = config.hidden_act
+        self.LayerNorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.transform_act_fn(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+
+
+class BertLMPredictionHead(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.transform = BertPredictionHeadTransform(config)
+
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder = nn.Linear(
+            config.hidden_size, config.vocab_size, bias=False)
+
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        hidden_states = self.transform(hidden_states)
+        hidden_states = self.decoder(hidden_states)
+        return hidden_states
+
+
+class BertOnlyMLMHead(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = BertLMPredictionHead(config)
+
+    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
+        prediction_scores = self.predictions(sequence_output)
+        return prediction_scores
diff --git a/modelscope/models/nlp/heads/infromation_extraction_head.py b/modelscope/models/nlp/heads/infromation_extraction_head.py
new file mode 100644
index 00000000..6c3388f0
--- /dev/null
+++ b/modelscope/models/nlp/heads/infromation_extraction_head.py
@@ -0,0 +1,103 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import torch
+from torch import nn
+
+from modelscope.metainfo import Heads
+from modelscope.models.base import TorchHead
+from modelscope.models.builder import HEADS
+from modelscope.utils.constant import Tasks
+
+
+@HEADS.register_module(
+    Tasks.information_extraction, module_name=Heads.information_extraction)
+class InformationExtractionHead(TorchHead):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        config = self.config
+        assert config.get('labels') is not None
+        self.labels = config.labels
+        self.s_layer = nn.Linear(config.hidden_size, 2)  # head, tail, bce
+        self.o_layer = nn.Linear(2 * config.hidden_size, 2)  # head, tail, bce
+        self.p_layer = nn.Linear(config.hidden_size,
+                                 len(self.labels))  # label, ce
+        self.mha = nn.MultiheadAttention(config.hidden_size, 4)
+
+    def forward(self, sequence_output, text, offsets, threshold=0.5):
+        # Only batch element 0 is decoded (see the [0, :, 0] indexing below), so batch size must be 1.
+        spos = []  # collected (subject, predicate, object) triples
+        s_head_logits, s_tail_logits = self.s_layer(sequence_output).split(
+            1, dim=-1)  # s_layer output is (b, seq_len, 2); each split piece is (b, seq_len, 1)
+        s_head_logits = s_head_logits[0, :, 0].sigmoid()  # (seq_len)
+        s_tail_logits = s_tail_logits[0, :, 0].sigmoid()  # (seq_len)
+        s_masks, subjects = self._get_masks_and_mentions(
+            text, offsets, s_head_logits, s_tail_logits, None, threshold)
+        for s_mask, subject in zip(s_masks, subjects):
+            masked_sequence_output = sequence_output * s_mask.unsqueeze(
+                0).unsqueeze(-1)  # (b, s, h), zeroed outside the subject span
+            subjected_sequence_output = self.mha(
+                sequence_output.permute(1, 0, 2),
+                masked_sequence_output.permute(1, 0, 2),
+                masked_sequence_output.permute(1, 0,
+                                               2))[0].permute(1, 0,
+                                                              2)  # (b, s, h)
+            cat_sequence_output = torch.cat(
+                (sequence_output, subjected_sequence_output), dim=-1)
+            o_head_logits, o_tail_logits = self.o_layer(
+                cat_sequence_output).split(
+                    1, dim=-1)
+            o_head_logits = o_head_logits[0, :, 0].sigmoid()  # (seq_len)
+            o_tail_logits = o_tail_logits[0, :, 0].sigmoid()  # (seq_len)
+            so_masks, objects = self._get_masks_and_mentions(
+                text, offsets, o_head_logits, o_tail_logits, s_mask, threshold)
+            for so_mask, object in zip(so_masks, objects):  # so_mask = subject span plus object span
+                masked_sequence_output = (
+                    sequence_output * so_mask.unsqueeze(0).unsqueeze(-1)).sum(
+                        1)  # (b, h)
+                lengths = so_mask.unsqueeze(0).sum(-1, keepdim=True)  # (b, 1)
+                pooled_subject_object = masked_sequence_output / lengths  # (b, h), mean over masked tokens
+                label = self.p_layer(pooled_subject_object).sigmoid().squeeze(
+                    0)
+                for i in range(label.size(-1)):
+                    if label[i] > threshold:
+                        predicate = self.labels[i]
+                        spos.append((subject, predicate, object))
+        return spos
+
+    def _get_masks_and_mentions(self,
+                                text,
+                                offsets,
+                                heads,
+                                tails,
+                                init_mask=None,
+                                threshold=0.5):
+        '''Pair each head scoring above `threshold` with the best tail found before the next head.
+        text: str, sliced as text[offsets[head][0]:offsets[tail][1]] to build each mention
+        heads, tails: 1-D tensors of per-token scores; init_mask: mask cloned per span (zeros if None)
+        Returns (masks, mentions): parallel lists with one entry per head that found a tail.
+        '''
+        seq_len = heads.size(-1)
+        potential_heads = []
+        for i in range(seq_len - 1):
+            if heads[i] > threshold:
+                potential_heads.append(i)
+        potential_heads.append(seq_len - 1)  # sentinel: exclusive upper bound for the last head's tail search
+        masks = []
+        mentions = []
+        for i in range(len(potential_heads) - 1):
+            head_index = potential_heads[i]
+            tail_index, max_val = None, 0
+            for j in range(head_index, potential_heads[i + 1]):
+                if tails[j] > max_val and tails[j] > threshold:
+                    tail_index = j
+                    max_val = tails[j]
+            if tail_index is not None:
+                mask = torch.zeros_like(
+                    heads) if init_mask is None else init_mask.clone()
+                mask[head_index:tail_index + 1] = 1
+                masks.append(mask)  # (seq_len)
+                char_head = offsets[head_index][0]
+                char_tail = offsets[tail_index][1]
+                mention = text[char_head:char_tail]
+                mentions.append(mention)
+        return masks, mentions
diff --git a/modelscope/models/nlp/heads/sequence_classification_head.py b/modelscope/models/nlp/heads/sequence_classification_head.py
index 92f3a4ec..fb03b7ff 100644
--- a/modelscope/models/nlp/heads/sequence_classification_head.py
+++ b/modelscope/models/nlp/heads/sequence_classification_head.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import Dict
 
 import torch
@@ -19,7 +20,6 @@ class SequenceClassificationHead(TorchHead):
         super().__init__(**kwargs)
         config = self.config
         self.num_labels = config.num_labels
-        self.config = config
         classifier_dropout = (
             config['classifier_dropout'] if config.get('classifier_dropout')
             is not None else config['hidden_dropout_prob'])
diff --git a/modelscope/models/nlp/heads/token_classification_head.py b/modelscope/models/nlp/heads/token_classification_head.py
new file mode 100644
index 00000000..ace3deac
--- /dev/null
+++ b/modelscope/models/nlp/heads/token_classification_head.py
@@ -0,0 +1,43 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Dict
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from modelscope.metainfo import Heads
+from modelscope.models.base import TorchHead
+from modelscope.models.builder import HEADS
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+
+
+@HEADS.register_module(
+    Tasks.token_classification, module_name=Heads.token_classification)
+class TokenClassificationHead(TorchHead):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        config = self.config
+        self.num_labels = config.num_labels
+        classifier_dropout = (
+            config['classifier_dropout'] if config.get('classifier_dropout')
+            is not None else config['hidden_dropout_prob'])
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config['hidden_size'],
+                                    config['num_labels'])
+
+    def forward(self, inputs=None):
+        if isinstance(inputs, dict):
+            assert inputs.get('sequence_output') is not None
+            sequence_output = inputs.get('sequence_output')
+        else:
+            sequence_output = inputs
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+        return {OutputKeys.LOGITS: logits}
+
+    def compute_loss(self, outputs: Dict[str, torch.Tensor],
+                     labels) -> Dict[str, torch.Tensor]:
+        logits = outputs[OutputKeys.LOGITS].reshape(-1, self.num_labels)  # one row per token, classes on dim 1
+        return {OutputKeys.LOSS: F.cross_entropy(logits, labels.reshape(-1))}
diff --git a/modelscope/models/nlp/heads/torch_pretrain_head.py b/modelscope/models/nlp/heads/torch_pretrain_head.py
index 6ff6c96f..e477533f 100644
--- a/modelscope/models/nlp/heads/torch_pretrain_head.py
+++ b/modelscope/models/nlp/heads/torch_pretrain_head.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import Dict
 
 import torch
@@ -10,7 +11,7 @@ from modelscope.models.builder import HEADS
 from modelscope.utils.constant import Tasks
 
 
-@HEADS.register_module(Tasks.fill_mask, module_name=Heads.bert_mlm)
+# @HEADS.register_module(Tasks.fill_mask, module_name=Heads.bert_mlm)
 class BertMLMHead(BertOnlyMLMHead, TorchHead):
 
     def compute_loss(self, outputs: Dict[str, torch.Tensor],
diff --git a/modelscope/models/nlp/masked_language.py b/modelscope/models/nlp/masked_language.py
index 17324be9..b7a890c1 100644
--- a/modelscope/models/nlp/masked_language.py
+++ b/modelscope/models/nlp/masked_language.py
@@ -1,11 +1,11 @@
-from typing import Any, Dict, Optional, Union
-
-import numpy as np
-from transformers import BertForMaskedLM as BertForMaskedLMTransformer
-
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from modelscope.metainfo import Models
 from modelscope.models.base import TorchModel
 from modelscope.models.builder import MODELS
+from modelscope.models.nlp.bert import \
+    BertForMaskedLM as BertForMaskedLMTransformer
+from modelscope.models.nlp.deberta_v2 import \
+    DebertaV2ForMaskedLM as DebertaV2ForMaskedLMTransformer
 from modelscope.models.nlp.structbert import SbertForMaskedLM
 from modelscope.models.nlp.veco import \
     VecoForMaskedLM as VecoForMaskedLMTransformer
@@ -125,3 +125,40 @@ class VecoForMaskedLM(TorchModel, VecoForMaskedLMTransformer):
                      VecoForMaskedLM).from_pretrained(
                          pretrained_model_name_or_path=model_dir,
                          model_dir=model_dir)
+
+
+@MODELS.register_module(Tasks.fill_mask, module_name=Models.deberta_v2)
+class DebertaV2ForMaskedLM(TorchModel, DebertaV2ForMaskedLMTransformer):
+    """Deberta v2 for MLM model.
+
+    Inherited from deberta_v2.DebertaV2ForMaskedLM and TorchModel, so this class can be registered into Model sets.
+    """
+
+    def __init__(self, config, model_dir):
+        super(TorchModel, self).__init__(model_dir)
+        DebertaV2ForMaskedLMTransformer.__init__(self, config)
+
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,  # accepted but not forwarded to the parent forward() below
+                labels=None):
+        output = DebertaV2ForMaskedLMTransformer.forward(
+            self,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            labels=labels)
+        output[OutputKeys.INPUT_IDS] = input_ids  # attach the input ids to the returned output
+        return output
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        model_dir = kwargs.get('model_dir')
+        return super(DebertaV2ForMaskedLMTransformer,
+                     DebertaV2ForMaskedLM).from_pretrained(
+                         pretrained_model_name_or_path=model_dir,
+                         model_dir=model_dir)
diff --git a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py
index 2015997f..8b0c59b2 100644
--- a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py
+++ b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py
@@ -1,3 +1,7 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. All rights reserved.
+# The CRF implementation borrows mostly from AllenNLP CRF module (https://github.com/allenai/allennlp)
+# and pytorch-crf (https://github.com/kmkurn/pytorch-crf) with some modifications.
+
 import os
 from typing import Any, Dict, List, Optional
 
@@ -10,27 +14,25 @@ from modelscope.models import TorchModel
 from modelscope.models.builder import MODELS
 from modelscope.utils.constant import ModelFile, Tasks
 
-__all__ = ['TransformerCRFForNamedEntityRecognition']
+__all__ = [
+    'TransformerCRFForNamedEntityRecognition',
+    'LSTMCRFForNamedEntityRecognition'
+]
 
 
-@MODELS.register_module(
-    Tasks.named_entity_recognition, module_name=Models.tcrf)
-class TransformerCRFForNamedEntityRecognition(TorchModel):
-    """This model wraps the TransformerCRF model to register into model sets.
-    """
+class SequenceLabelingForNamedEntityRecognition(TorchModel):
 
     def __init__(self, model_dir, *args, **kwargs):
         super().__init__(model_dir, *args, **kwargs)
-
-        self.config = AutoConfig.from_pretrained(model_dir)
-        num_labels = self.config.num_labels
-
-        self.model = TransformerCRF(model_dir, num_labels)
+        self.model = self.init_model(model_dir, *args, **kwargs)
 
         model_ckpt = os.path.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
         self.model.load_state_dict(
             torch.load(model_ckpt, map_location=torch.device('cpu')))
 
+    def init_model(self, model_dir, *args, **kwargs):
+        raise NotImplementedError
+
     def train(self):
         return self.model.train()
 
@@ -39,12 +41,9 @@ class TransformerCRFForNamedEntityRecognition(TorchModel):
 
     def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
         input_tensor = {
-            'input_ids':
-            torch.tensor(input['input_ids']).unsqueeze(0),
-            'attention_mask':
-            torch.tensor(input['attention_mask']).unsqueeze(0),
-            'label_mask':
-            torch.tensor(input['label_mask'], dtype=torch.bool).unsqueeze(0)
+            'input_ids': input['input_ids'],
+            'attention_mask': input['attention_mask'],
+            'label_mask': input['label_mask'],
         }
         output = {
             'text': input['text'],
@@ -64,6 +63,39 @@ class TransformerCRFForNamedEntityRecognition(TorchModel):
         return output
 
 
+@MODELS.register_module(
+    Tasks.named_entity_recognition, module_name=Models.tcrf)
+class TransformerCRFForNamedEntityRecognition(
+        SequenceLabelingForNamedEntityRecognition):
+    """This model wraps the TransformerCRF model to register into model sets.
+    """
+
+    def init_model(self, model_dir, *args, **kwargs):
+        self.config = AutoConfig.from_pretrained(model_dir)
+        num_labels = self.config.num_labels
+
+        model = TransformerCRF(model_dir, num_labels)
+        return model
+
+
+@MODELS.register_module(
+    Tasks.named_entity_recognition, module_name=Models.lcrf)
+class LSTMCRFForNamedEntityRecognition(
+        SequenceLabelingForNamedEntityRecognition):
+    """This model wraps the LSTMCRF model to register into model sets.
+    """
+
+    def init_model(self, model_dir, *args, **kwargs):
+        self.config = AutoConfig.from_pretrained(model_dir)
+        vocab_size = self.config.vocab_size
+        embed_width = self.config.embed_width
+        num_labels = self.config.num_labels
+        lstm_hidden_size = self.config.lstm_hidden_size
+
+        model = LSTMCRF(vocab_size, embed_width, num_labels, lstm_hidden_size)
+        return model
+
+
 class TransformerCRF(nn.Module):
     """A transformer based model to NER tasks.
 
@@ -105,6 +137,56 @@ class TransformerCRF(nn.Module):
         return outputs
 
 
+class LSTMCRF(nn.Module):
+    """
+    A standard bilstm-crf model for fast prediction.
+    """
+
+    def __init__(self,
+                 vocab_size,
+                 embed_width,
+                 num_labels,
+                 lstm_hidden_size=100,
+                 **kwargs):
+        super(LSTMCRF, self).__init__()
+        self.embedding = Embedding(vocab_size, embed_width)
+        self.lstm = nn.LSTM(
+            embed_width,
+            lstm_hidden_size,
+            num_layers=1,
+            bidirectional=True,
+            batch_first=True)
+        self.ffn = nn.Linear(lstm_hidden_size * 2, num_labels)
+        self.crf = CRF(num_labels, batch_first=True)
+
+    def forward(self, inputs):
+        embedding = self.embedding(inputs['input_ids'])
+        lstm_output, _ = self.lstm(embedding)
+        logits = self.ffn(lstm_output)
+
+        if 'label_mask' in inputs:
+            mask = inputs['label_mask']
+            masked_lengths = mask.sum(-1).long()
+            masked_logits = torch.zeros_like(logits)
+            for i in range(len(mask)):
+                masked_logits[
+                    i, :masked_lengths[i], :] = logits[i].masked_select(
+                        mask[i].unsqueeze(-1)).view(masked_lengths[i], -1)
+            logits = masked_logits
+
+        outputs = {'logits': logits}
+        return outputs
+
+    def decode(self, inputs):
+        seq_lens = inputs['label_mask'].sum(-1).long()
+        mask = torch.arange(
+            inputs['label_mask'].shape[1],
+            device=seq_lens.device)[None, :] < seq_lens[:, None]
+        predicts = self.crf.decode(inputs['logits'], mask=mask).squeeze(0)
+        outputs = {'predicts': predicts}
+        return outputs
+
+
 class CRF(nn.Module):
     """Conditional random field.
     This module implements a conditional random field [LMP01]_. The forward computation
@@ -127,8 +209,6 @@ class CRF(nn.Module):
        Learning*. Morgan Kaufmann. pp. 282–289.
     .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm
 
-    The implementation borrows mostly from AllenNLP CRF module (https://github.com/allenai/allennlp)
-    and pytorch-crf (https://github.com/kmkurn/pytorch-crf) with some modifications.
     """
 
     def __init__(self, num_tags: int, batch_first: bool = False) -> None:
@@ -547,3 +627,14 @@ class CRF(nn.Module):
 
         return torch.where(mask.unsqueeze(-1), best_tags_arr,
                            oor_tag).permute(2, 1, 0)
+
+
+class Embedding(nn.Module):
+
+    def __init__(self, vocab_size, embed_width):
+        super(Embedding, self).__init__()
+
+        self.embedding = nn.Embedding(vocab_size, embed_width)
+
+    def forward(self, input_ids):
+        return self.embedding(input_ids)
diff --git a/modelscope/models/nlp/palm_v2/modeling_palm.py b/modelscope/models/nlp/palm_v2/modeling_palm.py
index 1cbf4f58..f395ebd4 100644
--- a/modelscope/models/nlp/palm_v2/modeling_palm.py
+++ b/modelscope/models/nlp/palm_v2/modeling_palm.py
@@ -1,3 +1,19 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import codecs
 import copy
 import math
@@ -6,6 +22,7 @@ import subprocess
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Union
 
+import addict
 import json
 import numpy as np
 import torch
@@ -591,11 +608,11 @@ class AbsSummarizer(PalmPreTrainedModel):  # Model
         self.generator.dense.weight = self.decoder.embeddings.weight
 
         if checkpoint is not None:
-            for key in list(checkpoint['model'].keys()):
-                checkpoint['model'][key.replace('module.',
-                                                '')] = checkpoint['model'][key]
-            msg = self.load_state_dict(checkpoint['model'], strict=False)
-            print(msg)
+            if 'model' in checkpoint:
+                checkpoint = checkpoint['model']
+            for key in list(checkpoint.keys()):
+                checkpoint[key.replace('model.palm.', '')] = checkpoint[key]
+            self.load_state_dict(checkpoint, strict=False)
         else:
             for module in self.decoder.modules():
                 if isinstance(module, (nn.Linear, nn.Embedding)):
@@ -726,13 +743,14 @@ class PalmForConditionalGeneration(PalmPreTrainedModel):
                                    self.palm.vocab_size,
                                    config.label_smoothing)
 
-    def forward(self, src, tgt, mask_src):
-        output = self.palm(src, tgt, mask_src)[0]
-        loss = self.loss(tgt, output)
-        return loss
+    def forward(self, input_ids, attention_mask, labels):
+        output = self.palm(
+            src=input_ids, tgt=labels, mask_src=attention_mask)[0]
+        loss = self.loss(labels, output)
+        return addict.Dict(loss=loss)
 
 
-class Translator(nn.Module):
+class Translator(object):
     """
     Uses a model to translate a batch of sentences.
     """
@@ -1296,8 +1314,8 @@ class Translator(nn.Module):
 
         return results
 
-    def forward(self, input_ids: torch.Tensor,
-                attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]:
+    def __call__(self, input_ids: torch.Tensor,
+                 attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]:
         batch = self.Batch(
             batch_size=input_ids.size()[0],
             src=input_ids,
diff --git a/modelscope/models/nlp/palm_v2/palm_for_text_generation.py b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py
index e432cc58..2c37afd6 100644
--- a/modelscope/models/nlp/palm_v2/palm_for_text_generation.py
+++ b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import Dict, List
 
 from modelscope.metainfo import Models
@@ -29,20 +30,19 @@ class PalmForTextGeneration(TorchModel):
         self.generator = Translator(self.model)
 
     def _evaluate_postprocess(self, ids_list: List[List[int]]) -> List[str]:
-        replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
-                               ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
-                               ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))
+        replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''), ('[unused1]',
+                                                                  ''),
+                               (r' +', ' '), ('[SEP]', ''), ('[unused2]', ''),
+                               ('[CLS]', ''), ('[UNK]', ''), (' ', ''))
         replace_tokens_roberta = ((r' +', ' '), ('<mask>', '. '),
                                   ('<pad>', ''), ('<s>', ''), ('</s>', ''),
                                   ('<unk>', ' '), ('<q>', '. '))
 
+        replace_tokens = replace_tokens_roberta \
+            if self.model.config.encoder == 'roberta' else replace_tokens_bert
         strings = [self.tokenizer.decode(pred_ids) for pred_ids in ids_list]
-        for _old, _new in replace_tokens_bert:
+        for _old, _new in replace_tokens:
             strings = [s.replace(_old, _new) for s in strings]
-        for _old, _new in replace_tokens_roberta:
-            strings = [s.replace(_old, _new) for s in strings]
-        for s in strings:
-            s.strip()
         return strings
 
     def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
@@ -64,14 +64,15 @@ class PalmForTextGeneration(TorchModel):
                     }
         """
         if self.training:
-            return {'loss': self.model(**input)}
+            return self.model(**input)
         else:
-            outputs = self.generator(input['src'], input['mask_src'])
+            outputs = self.generator(input['input_ids'],
+                                     input['attention_mask'])
             preds = outputs['predictions']
             pred_ids_list = [
                 pred_batch[0].cpu().numpy().tolist() for pred_batch in preds
             ]
-            tgt_ids_list = input['tgt'].cpu().numpy().tolist()
+            tgt_ids_list = input['labels'].cpu().numpy().tolist()
             return {
                 'preds': self._evaluate_postprocess(pred_ids_list),
                 'tgts': self._evaluate_postprocess(tgt_ids_list)
diff --git a/modelscope/models/nlp/passage_ranking.py b/modelscope/models/nlp/passage_ranking.py
new file mode 100644
index 00000000..2a06ce45
--- /dev/null
+++ b/modelscope/models/nlp/passage_ranking.py
@@ -0,0 +1,80 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict
+
+import numpy as np
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp import SbertForSequenceClassification
+from modelscope.models.nlp.structbert import SbertPreTrainedModel
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+
+__all__ = ['PassageRanking']
+
+
+@MODELS.register_module(Tasks.passage_ranking, module_name=Models.bert)
+class PassageRanking(SbertForSequenceClassification, SbertPreTrainedModel):
+    """Passage ranker: pooled backbone output -> dropout -> classifier."""
+    base_model_prefix: str = 'bert'
+    supports_gradient_checkpointing = True
+    _keys_to_ignore_on_load_missing = [r'position_ids']
+
+    def __init__(self, config, model_dir, *args, **kwargs):
+        # NOTE: this overrides the prefix on the class, not on the instance.
+        if hasattr(config, 'base_model_prefix'):
+            PassageRanking.base_model_prefix = config.base_model_prefix
+        super().__init__(config, model_dir)
+        self.train_batch_size = kwargs.get('train_batch_size', 4)
+        # Zero-initialised targets for the cross-entropy loss in forward().
+        labels = torch.zeros(self.train_batch_size, dtype=torch.long)
+        self.register_buffer('target_label', labels)
+
+    def build_base_model(self):
+        from .structbert import SbertModel
+        return SbertModel(self.config, add_pooling_layer=True)
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+        """Return logits (and the loss while the backbone is training)."""
+        outputs = self.base_model.forward(**input)
+        # backbone model should return pooled_output as its second output
+        pooled_output = self.dropout(outputs[1])
+        logits = self.classifier(pooled_output)
+        if self.base_model.training:
+            # One row per train_batch_size entry; target_label starts as zeros.
+            scores = logits.view(self.train_batch_size, -1)
+            loss = torch.nn.CrossEntropyLoss()(scores, self.target_label)
+            return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss}
+        return {OutputKeys.LOGITS: logits}
+
+    def sigmoid(self, logits):
+        """Logistic function in tanh form, which cannot overflow to nan."""
+        return 0.5 * (1.0 + np.tanh(0.5 * np.asarray(logits)))
+
+    def postprocess(self, inputs: Dict[str, np.ndarray],
+                    **kwargs) -> Dict[str, np.ndarray]:
+        """Turn the `logits` tensor into a list of sigmoid scores."""
+        logits = inputs['logits'].squeeze(-1).detach().cpu().numpy()
+        return {OutputKeys.SCORES: self.sigmoid(logits).tolist()}
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        """Instantiate the model.
+
+        @param kwargs: Input args.
+                    model_dir: The model dir used to load the checkpoint.
+                    num_labels: Optional number of classes, 1 when not supplied.
+                                    Passing None explicitly forwards no num_labels
+                                    to from_pretrained.
+        @return: The model returned by the from_pretrained that follows SbertPreTrainedModel in the MRO
+        """
+
+        num_labels = kwargs.get('num_labels', 1)
+        model_args = {} if num_labels is None else {'num_labels': num_labels}
+        return super(SbertPreTrainedModel, PassageRanking).from_pretrained(
+            pretrained_model_name_or_path=kwargs.get('model_dir'),
+            model_dir=kwargs.get('model_dir'),
+            **model_args)
diff --git a/modelscope/models/nlp/plug/__init__.py b/modelscope/models/nlp/plug/__init__.py
new file mode 100644
index 00000000..dbc20751
--- /dev/null
+++ b/modelscope/models/nlp/plug/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:  # static analysis only; runtime takes the lazy branch
+    from .configuration_plug import PlugNLGConfig
+    from .modeling_plug import PlugModel
+    from .distributed_plug import DistributedPlug
+else:
+    _import_structure = {  # submodule name -> names it exports
+        'configuration_plug': ['PlugNLGConfig'],
+        'modeling_plug': ['PlugModel'],
+        'distributed_plug': ['DistributedPlug'],
+    }
+
+    import sys
+    # Swap this package for a LazyImportModule (presumably import-on-use).
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/nlp/plug/configuration_plug.py b/modelscope/models/nlp/plug/configuration_plug.py
new file mode 100644
index 00000000..64807392
--- /dev/null
+++ b/modelscope/models/nlp/plug/configuration_plug.py
@@ -0,0 +1,232 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+
+import json
+from transformers import PretrainedConfig
+
+from modelscope.utils import logger as logging
+
+logger = logging.get_logger(__name__)
+
+
+class PlugNLUConfig(PretrainedConfig):
+    model_type = 'plugNLU'
+
+    def __init__(self,
+                 vocab_size=21504,
+                 original_vocab_size=21128,
+                 hidden_size=8192,
+                 num_hidden_layers=24,
+                 num_attention_heads=128,
+                 intermediate_size=32768,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=2048,
+                 type_vocab_size=3,
+                 initializer_range=0.00707,
+                 deep_init=False,
+                 deepspeed=False,
+                 lr_decay_style='linear',
+                 weight_decay=1e-2,
+                 clip_grad=1.0,
+                 warmup=0.0333,
+                 pre_ln=True,
+                 fp16=True,
+                 fp32_layernorm=True,
+                 fp32_embedding=False,
+                 fp32_tokentypes=False,
+                 layernorm_epsilon=1e-5,
+                 dec_hidden_layers=6,
+                 pruning_method=None,
+                 pruning_mask_init='constant',
+                 pruning_mask_scale=0.0,
+                 pruning_initial_threshold=1.0,
+                 pruning_final_threshold=0.01,
+                 pruning_initial_warmup=1,
+                 pruning_final_warmup=20,
+                 pruning_module='decoder',
+                 pruning_decay_step=50,
+                 pruning_decay_type='exp',
+                 ft_module=None,
+                 attn_separate=False,
+                 LR_weight_rank=8,
+                 LR_mask_rank=8,
+                 **kwargs):
+        super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs)
+        # Keep every constructor argument as an instance attribute.
+        self.vocab_size = vocab_size
+        self.original_vocab_size = original_vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.deep_init = deep_init
+        self.deepspeed = deepspeed
+        self.lr_decay_style = lr_decay_style
+        self.weight_decay = weight_decay
+        self.clip_grad = clip_grad
+        self.warmup = warmup
+        self.pre_ln = pre_ln
+        self.fp16 = fp16
+        self.fp32_layernorm = fp32_layernorm
+        self.fp32_embedding = fp32_embedding
+        self.layernorm_epsilon = layernorm_epsilon
+        self.fp32_tokentypes = fp32_tokentypes
+        self.dec_hidden_layers = dec_hidden_layers
+        self.pruning_method = pruning_method
+        self.pruning_mask_init = pruning_mask_init
+        self.pruning_mask_scale = pruning_mask_scale
+        self.pruning_module = pruning_module
+        self.pruning_initial_threshold = pruning_initial_threshold
+        self.pruning_final_threshold = pruning_final_threshold
+        self.pruning_initial_warmup = pruning_initial_warmup
+        self.pruning_final_warmup = pruning_final_warmup
+        self.pruning_decay_step = pruning_decay_step
+        self.pruning_decay_type = pruning_decay_type
+        self.ft_module = ft_module
+        self.attn_separate = attn_separate
+        self.LR_weight_rank = LR_weight_rank
+        self.LR_mask_rank = LR_mask_rank
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Build a default PlugNLUConfig (not `cls`) and copy the dict onto it."""
+        config = PlugNLUConfig()
+        for key, value in json_object.items():
+            config.__dict__[key] = value
+        return config
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Read a UTF-8 json file and build the config through `from_dict`."""
+        with open(json_file, 'r', encoding='utf-8') as reader:
+            text = reader.read()
+        return cls.from_dict(json.loads(text))
+
+    def merge_args(self, args):
+        """Copy attributes of `args` that this config lacks; returns self."""
+        local_keys = self.__dict__.keys()
+        for key, value in args.__dict__.items():
+            if key in local_keys:
+                continue
+            self.__dict__[key] = value
+        return self
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + '\n'
+
+
+class PlugNLGConfig(PlugNLUConfig):
+    """PLUG generation config; overrides PlugNLUConfig's attribute defaults."""
+    model_type = 'plugNLG'
+
+    def __init__(self,
+                 vocab_size=21504,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.00707,
+                 deep_init=False,
+                 deepspeed=False,
+                 lr_decay_style='linear',
+                 weight_decay=1e-2,
+                 clip_grad=1.0,
+                 warmup=0.01,
+                 pre_ln=False,
+                 fp16=False,
+                 fp32_layernorm=False,
+                 fp32_embedding=False,
+                 fp32_tokentypes=False,
+                 layernorm_epsilon=1e-12,
+                 dec_hidden_layers=6,
+                 pruning_method=None,
+                 pruning_mask_init='constant',
+                 pruning_mask_scale=0.0,
+                 pruning_initial_threshold=1.0, pruning_final_threshold=0.01,
+                 pruning_initial_warmup=1, pruning_final_warmup=20,
+                 pruning_module='decoder',
+                 pruning_decay_step=50,
+                 pruning_decay_type='exp',
+                 ft_module=None,
+                 attn_separate=False,
+                 LR_weight_rank=8, LR_mask_rank=8,
+                 **kwargs):
+        # The parent takes `layernorm_epsilon` and itself forwards it to
+        # PretrainedConfig as `layer_norm_eps`; passing `layer_norm_eps` here
+        # made that keyword arrive twice there and raised TypeError.
+        super().__init__(layernorm_epsilon=layernorm_epsilon, **kwargs)
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.deep_init = deep_init
+        self.deepspeed = deepspeed
+        self.lr_decay_style = lr_decay_style
+        self.weight_decay = weight_decay
+        self.clip_grad = clip_grad
+        self.warmup = warmup
+        self.pre_ln = pre_ln
+        self.fp16 = fp16
+        self.fp32_layernorm = fp32_layernorm
+        self.fp32_embedding = fp32_embedding
+        self.layernorm_epsilon = layernorm_epsilon
+        self.fp32_tokentypes = fp32_tokentypes
+        self.dec_hidden_layers = dec_hidden_layers
+        self.pruning_method = pruning_method
+        self.pruning_mask_init = pruning_mask_init
+        self.pruning_mask_scale = pruning_mask_scale
+        self.pruning_module = pruning_module
+        self.pruning_initial_threshold = pruning_initial_threshold
+        self.pruning_final_threshold = pruning_final_threshold
+        self.pruning_initial_warmup = pruning_initial_warmup
+        self.pruning_final_warmup = pruning_final_warmup
+        self.pruning_decay_step = pruning_decay_step
+        self.pruning_decay_type = pruning_decay_type
+        self.ft_module = ft_module
+        self.attn_separate = attn_separate
+        self.LR_weight_rank = LR_weight_rank
+        self.LR_mask_rank = LR_mask_rank
diff --git a/modelscope/models/nlp/plug/distributed_plug.py b/modelscope/models/nlp/plug/distributed_plug.py
new file mode 100644
index 00000000..2992f595
--- /dev/null
+++ b/modelscope/models/nlp/plug/distributed_plug.py
@@ -0,0 +1,191 @@
+import os
+from typing import Dict
+
+import torch
+import torch.nn.functional as F
+from megatron import mpu
+from megatron.fp16 import FP16_Module
+from megatron.utils import print_rank_0
+
+from modelscope.models import TorchModel
+from modelscope.models.base import Tensor
+from modelscope.utils.logger import get_logger
+from modelscope.utils.nlp.distributed import initialize_distributed
+from modelscope.utils.nlp.load_checkpoint import pre_load
+from modelscope.utils.torch_utils import set_random_seed_mpu
+from . import PlugModel
+from .configuration_plug import PlugNLGConfig
+
+logger = get_logger(__name__)
+
+
+class DistributedPlug(TorchModel):
+    """Megatron model-parallel PLUG wrapper used for text generation."""
+
+    def __init__(self, model_dir, rank, **kwargs):
+        super().__init__(model_dir, **kwargs)
+        self.rank = rank
+        self.model_cfg = kwargs
+        self.config = PlugNLGConfig.from_pretrained(model_dir)
+        initialize_distributed(rank, mpu, kwargs['world_size'],
+                               kwargs['model_parallel_size'],
+                               kwargs['master_ip'], kwargs['master_port'])
+        set_random_seed_mpu(kwargs.get('seed', 0))
+        self.iteration = 0
+        self.dist_model = self.initialize_model(path_load_tag='model')
+
+    def initialize_model(self, path_load_tag='model'):
+        """Build the model on the current GPU and load its checkpoint."""
+        print_rank_0('Building Plug model. It will take a few minutes ...')
+        model = PlugModel(self.config)
+
+        if mpu.get_data_parallel_rank() == 0:
+            logger.info(
+                ' > number of parameters on model parallel rank {}: {}'.format(
+                    mpu.get_model_parallel_rank(),
+                    sum([p.nelement() for p in model.parameters()])))
+
+        if self.config.deepspeed and self.config.fp16:
+            model.half()
+
+        # GPU allocation.
+        model.cuda(torch.cuda.current_device())
+
+        # Fp16 conversion.
+        if self.config.fp16:
+            model = FP16_Module(model)
+            inner = model.module.model
+            if self.config.fp32_embedding:
+                inner.bert.embeddings.word_embeddings.float()
+                inner.bert.embeddings.position_embeddings.float()
+                inner.bert.embeddings.token_type_embeddings.float()
+            if self.config.fp32_tokentypes:
+                inner.bert.embeddings.token_type_embeddings.float()
+            if self.config.fp32_layernorm:
+                for name, _module in model.named_modules():
+                    if 'LayerNorm' in name:
+                        _module.float()
+
+        # FP16_Module keeps the PlugModel in `.module`; when fp16 is off the
+        # model is not wrapped, so fall back to the model itself.
+        plug_model = getattr(model, 'module', model).model
+        load_model = pre_load(mpu, self.model_dir, tag=path_load_tag)
+        model_dict = plug_model.state_dict()
+        for key in load_model:
+            action = 'Loading key: ' if key in model_dict else 'Skip key: '
+            print_rank_0(action + key)
+        plug_model.load_state_dict(load_model, strict=False)
+        return model
+
+    @staticmethod
+    def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
+        """Apply top-k and/or top-p filtering; may write into `logits` itself.
+
+        The top-p branch flattens `logits` and returns shape (1, N), so it only
+        works when the first dimension has size 1.
+        """
+        # This function has been mostly taken from huggingface conversational ai code at
+        # https://medium.com/huggingface/how-to-build-a-state-of-the-art-
+        # conversational-ai-with-transfer-learning-2d818ac26313
+        if top_k > 0:
+            # Remove all tokens with a probability less than the last token of the top-k
+            kth_best = torch.topk(logits, top_k)[0][..., -1, None]
+            logits[logits < kth_best] = filter_value
+
+        if top_p > 0.0:
+            # convert to 1D
+            logits = logits.view(logits.size()[1]).contiguous()
+            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+            cumulative_probs = torch.cumsum(
+                F.softmax(sorted_logits, dim=-1), dim=-1)
+
+            # Remove tokens with cumulative probability above the threshold
+            sorted_indices_to_remove = cumulative_probs > top_p
+            # Shift the indices to the right to keep also the first token above the threshold
+            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
+                ..., :-1].clone()
+            sorted_indices_to_remove[..., 0] = 0
+            indices_to_remove = sorted_indices[sorted_indices_to_remove]
+            logits[indices_to_remove] = filter_value
+            # going back to 2D
+            logits = logits.view(1, -1).contiguous()
+        return logits
+
+    def generate(self, input: Dict[str, Tensor], out_length=128, *args,
+                 **kwargs):
+        """Sample up to `out_length` token ids (batch size 1 only).
+
+        Reads `input_ids`, `dec_input_ids` and `attention_mask` from `input`;
+        returns {'generate_context': list of sampled token ids}.
+        """
+        sep_token_idx = 102  # index of [SEP] token in BertTokenizer
+        fallback_idx = 100  # replaces ids >= original_vocab_size
+        device = torch.cuda.current_device()
+        batch_size = input['input_ids'].shape[0]
+        tokens = input['input_ids'].view(1, -1).contiguous().to(device)
+        dec_input_ids = input['dec_input_ids'].to(device)
+        attention_mask = input['attention_mask'].to(device)
+        self.dist_model.eval()
+        with torch.no_grad():
+            # Only supports batch_size=1
+            all_generate_tokens = []
+            generate_tokens = []
+            counter = 0
+            sequence_output = None
+            vocab_size = self.config.original_vocab_size
+            while counter < out_length:
+                if counter % 128 == 0 and counter != 0:
+                    # Sliding window
+                    generate_tokens.append(sep_token_idx)
+                    window = torch.cuda.LongTensor(generate_tokens)
+                    # NOTE(review): assumes a single [SEP] in `tokens` - confirm.
+                    start = (tokens == sep_token_idx).nonzero(
+                        as_tuple=True)[-1]
+                    if start + len(generate_tokens) >= 512:
+                        # Join along the sequence axis of row 0, keep the last
+                        # 512 ids, then restore the (1, seq_len) layout.
+                        tokens = torch.cat([tokens[0][:start], window],
+                                           -1)[-512:].view(1, -1)
+                    else:
+                        tokens[0][start:start + len(generate_tokens)] = window
+
+                    attention_mask = (tokens != 0)
+                    dec_input_ids = input['dec_input_ids'].to(device)
+                    generate_tokens = []
+                    sequence_output = None
+
+                position_ids = torch.full(
+                    [batch_size, 1], len(generate_tokens),
+                    dtype=torch.long, device=device)
+                _, logits, sequence_output = self.dist_model(
+                    tokens, None, attention_mask, dec_input_ids,
+                    attention_mask, position_ids, is_infer=True,
+                    sequence_output=sequence_output, parallel_output=False)
+                logits = logits[:, -1, :] / self.model_cfg['temperature']
+                logits = self.top_k_logits(
+                    logits, top_k=self.model_cfg['top_k'],
+                    top_p=self.model_cfg['top_p'])
+                probs = F.softmax(logits, dim=-1)
+                prev = torch.multinomial(probs, num_samples=1)
+                prev_token = prev[0].item()
+                if prev_token >= vocab_size:
+                    prev_token = fallback_idx
+                    prev[0] = fallback_idx
+                if prev_token == sep_token_idx:
+                    # [SEP] ends generation only past 80% of out_length.
+                    if len(all_generate_tokens) > int(max(1, out_length) * 0.8):
+                        break
+                    counter += 1
+                    continue
+                dec_input_ids = torch.cat([dec_input_ids, prev], dim=1)
+                generate_tokens.append(prev_token)
+                all_generate_tokens.append(prev_token)
+                counter += 1
+
+            generate_context = []
+            for token in all_generate_tokens:
+                if (token == fallback_idx and generate_context
+                        and generate_context[-1] == fallback_idx):
+                    continue
+                generate_context.append(token)
+            return {'generate_context': generate_context}
diff --git a/modelscope/models/nlp/plug/modeling_plug.py b/modelscope/models/nlp/plug/modeling_plug.py
new file mode 100644
index 00000000..9d2bb14f
--- /dev/null
+++ b/modelscope/models/nlp/plug/modeling_plug.py
@@ -0,0 +1,1054 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+import logging
+import math
+import os
+
+import torch
+import torch.nn.functional as F
+from deepspeed.utils.timer import SynchronizedWallClockTimer
+from megatron import mpu
+from torch import nn
+
+from modelscope.utils.nlp.distributed import (normal_init_method,
+                                              scaled_init_method)
+from .configuration_plug import PlugNLGConfig, PlugNLUConfig
+
+logger = logging.getLogger(__name__)
+
+
+def gelu(x):
+    """Exact GELU via the error function (not the tanh approximation, which is
+    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+    and gives slightly different results, as used by OpenAI GPT)."""
+    erf_term = torch.erf(x / math.sqrt(2.0))
+    return x * 0.5 * (1.0 + erf_term)
+
+
+def swish(x):
+    return torch.sigmoid(x) * x  # swish(x) = x * sigmoid(x)
+
+
+ACT2FN = {'gelu': gelu, 'relu': F.relu, 'swish': swish}  # by config name
+
+
+class BertLayerNorm(nn.Module):
+    """TF-style layer norm over the last dim (epsilon inside the sqrt)."""
+
+    def __init__(self, hidden_size, eps=1e-12):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.bias = nn.Parameter(torch.zeros(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, x):
+        mean = x.mean(-1, keepdim=True)
+        centered = x - mean
+        variance = centered.pow(2).mean(-1, keepdim=True)
+        normed = centered / torch.sqrt(variance + self.variance_epsilon)
+        return self.weight * normed + self.bias
+
+
+class BertEmbeddings(nn.Module):
+    """Word + position + token-type embeddings, then LayerNorm and dropout.
+
+    The `fp32_*` config flags select which steps run in float32; see `forward`
+    for the exact casts.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        word_init = normal_init_method(mean=0.0, std=config.initializer_range)
+        self.word_embeddings = mpu.VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size, init_method=word_init)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
+                                                config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
+                                                  config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.fp32_layernorm = config.fp32_layernorm
+        self.fp32_embedding = config.fp32_embedding
+        self.fp32_tokentypes = config.fp32_tokentypes
+        self.LayerNorm = BertLayerNorm(
+            config.hidden_size, eps=config.layernorm_epsilon)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, input_ids, token_type_ids=None, position_ids=None):
+        """Look up and combine the three embeddings for `input_ids`.
+
+        `position_ids` defaults to 0..seq_length-1 for every row and
+        `token_type_ids` to all zeros.
+        """
+        seq_length = input_ids.size(1)
+        if position_ids is None:
+            position_ids = torch.arange(
+                seq_length, dtype=torch.long, device=input_ids.device)
+            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+
+        words = self.word_embeddings(input_ids)
+        positions = self.position_embeddings(position_ids)
+        segments = self.token_type_embeddings(token_type_ids)
+
+        if self.fp32_tokentypes:
+            # Add in float32; the result is always brought back to half.
+            embeddings = words.float() + positions.float() + segments.float()
+            cast_to_half = True
+        else:
+            embeddings = words + positions + segments
+            cast_to_half = self.fp32_embedding
+
+        if self.fp32_layernorm:
+            # Normalise in float32, then restore half or the pre-norm dtype.
+            previous_type = embeddings.type()
+            embeddings = self.LayerNorm(embeddings.float())
+            if cast_to_half:
+                embeddings = embeddings.half()
+            else:
+                embeddings = embeddings.type(previous_type)
+        else:
+            # The cast, when requested, happens before normalising.
+            if cast_to_half:
+                embeddings = embeddings.half()
+            embeddings = self.LayerNorm(embeddings)
+
+        return self.dropout(embeddings)
+
+
+class BertSelfOutput(nn.Module):
+    """Output projection applied after self-attention.
+
+    dense -> dropout -> residual add -> LayerNorm (skipped when pre_ln).
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        if getattr(config, 'deep_init', False):
+            init_method = scaled_init_method(
+                mean=0.0,
+                std=config.initializer_range,
+                num_layers=config.num_hidden_layers)
+        else:
+            init_method = normal_init_method(
+                mean=0.0, std=config.initializer_range)
+        # Pruning applies to this projection only for these module selections.
+        pruned_modules = ('all', 'encoder', 'encoder_self', 'encoder_selfvo',
+                          'encoder_selfo')
+        if config.pruning_module in pruned_modules:
+            pruning_method = config.pruning_method
+        else:
+            pruning_method = None
+        self.dense = mpu.RowParallelLinear(
+            input_size=config.hidden_size,
+            output_size=config.hidden_size,
+            bias=True,
+            input_is_parallel=True,
+            stride=1,
+            init_method=init_method,
+            pruning_method=pruning_method,
+            pruning_mask_init=config.pruning_mask_init,
+            pruning_mask_scale=config.pruning_mask_scale,
+            LR_weight_rank=config.LR_weight_rank,
+            LR_mask_rank=config.LR_mask_rank)
+        self.fp32_layernorm = config.fp32_layernorm
+        # Pre-LN configs normalise before attention (see BertAttention).
+        self.LayerNorm = None if config.pre_ln else BertLayerNorm(
+            config.hidden_size, eps=config.layernorm_epsilon)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor, pruning_threshold=None):
+        """Project `hidden_states`, apply dropout and add `input_tensor`.
+
+        With a LayerNorm (post-LN config) the sum is normalised; when
+        `fp32_layernorm` is set this runs in float32 and is cast back after.
+        `pruning_threshold` is passed through to the dense layer.
+        """
+        projected = self.dense(
+            hidden_states, pruning_threshold=pruning_threshold)
+        residual_sum = self.dropout(projected) + input_tensor
+        if self.LayerNorm is None:
+            return residual_sum
+        if not self.fp32_layernorm:
+            return self.LayerNorm(residual_sum)
+        previous_type = residual_sum.type()
+        return self.LayerNorm(residual_sum.float()).type(previous_type)
+
+
+class BertAttention(nn.Module):
+    """Self-attention followed by BertSelfOutput, with optional pre-LayerNorm.
+
+    When `config.pre_ln` is set the input is normalised before attention; the
+    un-normalised input is still what BertSelfOutput adds back as residual.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.fp32_layernorm = config.fp32_layernorm
+        # Pre-LN: normalise the attention input here rather than the output.
+        self.LayerNorm = BertLayerNorm(
+            config.hidden_size,
+            eps=config.layernorm_epsilon) if config.pre_ln else None
+        attention_init = normal_init_method(
+            mean=0.0, std=config.initializer_range)
+        self.self = mpu.BertParallelSelfAttention(
+            hidden_size=config.hidden_size,
+            num_attention_heads=config.num_attention_heads,
+            dropout_prob=config.attention_probs_dropout_prob,
+            output_parallel=True,
+            init_method=attention_init,
+            separate=config.attn_separate,
+            pruning_method=config.pruning_method,
+            pruning_mask_init=config.pruning_mask_init,
+            pruning_mask_scale=config.pruning_mask_scale,
+            pruning_module=config.pruning_module,
+            LR_weight_rank=config.LR_weight_rank,
+            LR_mask_rank=config.LR_mask_rank)
+        self.output = BertSelfOutput(config)
+
+    def forward(self, input_tensor, attention_mask, pruning_threshold=None):
+        """Run (optionally pre-normalised) attention, then the output layer.
+
+        `input_tensor` is also passed to the output layer as its second
+        argument; `pruning_threshold` is forwarded unchanged to both the
+        attention module and the output layer.
+        """
+        attention_input = input_tensor
+        if self.LayerNorm is not None:
+            if self.fp32_layernorm:
+                # Normalise in float32, then go back to the incoming dtype.
+                previous_type = input_tensor.type()
+                normed = self.LayerNorm(input_tensor.float())
+                attention_input = normed.type(previous_type)
+            else:
+                attention_input = self.LayerNorm(input_tensor)
+
+        self_output = self.self(
+            attention_input,
+            attention_mask,
+            pruning_threshold=pruning_threshold,
+        )
+
+        return self.output(
+            self_output,
+            input_tensor,
+            pruning_threshold=pruning_threshold,
+        )
+
+
+class BertIntermediate(nn.Module):
+    """Feed-forward expansion: column-parallel linear plus activation.
+
+    The linear layer gets ``config.pruning_method`` only when
+    ``config.pruning_module`` is 'all', 'encoder' or 'encoder_ffn'.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        ffn_pruned = config.pruning_module in [
+            'all', 'encoder', 'encoder_ffn'
+        ]
+        self.dense = mpu.ColumnParallelLinear(
+            input_size=config.hidden_size,
+            output_size=config.intermediate_size,
+            bias=True,
+            gather_output=False,
+            stride=1,
+            init_method=normal_init_method(
+                mean=0.0, std=config.initializer_range),
+            pruning_method=config.pruning_method if ffn_pruned else None,
+            pruning_mask_init=config.pruning_mask_init,
+            pruning_mask_scale=config.pruning_mask_scale,
+            LR_weight_rank=config.LR_weight_rank,
+            LR_mask_rank=config.LR_mask_rank)
+        act = config.hidden_act
+        self.intermediate_act_fn = ACT2FN[act] if isinstance(act, str) else act
+
+    def forward(self, hidden_states, pruning_threshold=None):
+        """Return ``intermediate_act_fn(dense(hidden_states))``."""
+        return self.intermediate_act_fn(
+            self.dense(hidden_states, pruning_threshold=pruning_threshold))
+
+
+class BertOutput(nn.Module):
+    """Feed-forward projection back to ``hidden_size`` with residual add.
+
+    ``self.LayerNorm`` is created only when ``config.pre_ln`` is falsy
+    (post-LN layout); otherwise the residual sum is returned as is.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        # ``deep_init`` is optional on the config, hence the getattr default.
+        if getattr(config, 'deep_init', False):
+            init_method = scaled_init_method(
+                mean=0.0,
+                std=config.initializer_range,
+                num_layers=config.num_hidden_layers)
+        else:
+            init_method = normal_init_method(
+                mean=0.0, std=config.initializer_range)
+        ffn_pruned = config.pruning_module in [
+            'all', 'encoder', 'encoder_ffn'
+        ]
+        self.dense = mpu.RowParallelLinear(
+            input_size=config.intermediate_size,
+            output_size=config.hidden_size,
+            bias=True,
+            input_is_parallel=True,
+            stride=1,
+            init_method=init_method,
+            pruning_method=config.pruning_method if ffn_pruned else None,
+            pruning_mask_init=config.pruning_mask_init,
+            pruning_mask_scale=config.pruning_mask_scale,
+            LR_weight_rank=config.LR_weight_rank,
+            LR_mask_rank=config.LR_mask_rank)
+        self.fp32_layernorm = config.fp32_layernorm
+        self.LayerNorm = None if config.pre_ln else BertLayerNorm(
+            config.hidden_size, eps=config.layernorm_epsilon)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor, pruning_threshold=None):
+        """Project ``hidden_states``, add ``input_tensor``, maybe normalize.
+
+        Args:
+            hidden_states: tensor passed to ``self.dense``.
+            input_tensor: residual added after dropout.
+            pruning_threshold: forwarded unchanged to ``self.dense``.
+        """
+        projected = self.dense(
+            hidden_states, pruning_threshold=pruning_threshold)
+        summed = self.dropout(projected) + input_tensor
+        if self.LayerNorm is None:
+            return summed
+        if not self.fp32_layernorm:
+            return self.LayerNorm(summed)
+        # Normalize in float32, then cast back to the incoming type.
+        return self.LayerNorm(summed.float()).type(summed.type())
+
+
+class BertLayer(nn.Module):
+    """One encoder layer: attention block followed by the feed-forward block.
+
+    With ``config.pre_ln`` an extra LayerNorm is applied to the attention
+    output before the feed-forward input; the residual stays un-normalized.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.attention = BertAttention(config)
+        self.intermediate = BertIntermediate(config)
+        self.output = BertOutput(config)
+        self.fp32_layernorm = config.fp32_layernorm
+        self.LayerNorm = BertLayerNorm(
+            config.hidden_size,
+            eps=config.layernorm_epsilon) if config.pre_ln else None
+
+    def forward(self, hidden_states, attention_mask, pruning_threshold=None):
+        """Apply the layer to ``hidden_states``.
+
+        Args:
+            hidden_states: input of the layer.
+            attention_mask: handed to ``self.attention``.
+            pruning_threshold: forwarded to all three sub-modules.
+        """
+        attention_output = self.attention(
+            hidden_states, attention_mask, pruning_threshold=pruning_threshold)
+        ffn_input = attention_output
+        if self.LayerNorm is not None:
+            if self.fp32_layernorm:
+                # Normalize in float32, then restore the incoming type.
+                ffn_input = self.LayerNorm(
+                    attention_output.float()).type(attention_output.type())
+            else:
+                ffn_input = self.LayerNorm(attention_output)
+        intermediate_output = self.intermediate(
+            ffn_input, pruning_threshold=pruning_threshold)
+        return self.output(
+            intermediate_output, attention_output,
+            pruning_threshold=pruning_threshold)
+
+
+class BertEncoder(nn.Module):
+    """Stack of ``config.num_hidden_layers`` ``BertLayer`` modules.
+
+    When ``config.pre_ln`` is set a final ``self.LayerNorm`` is applied to
+    the encoder output; ``fp32_layernorm`` runs it in float32 and casts back.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.layer = nn.ModuleList(
+            [BertLayer(config) for _ in range(config.num_hidden_layers)])
+        self.fp32_layernorm = config.fp32_layernorm
+        self.LayerNorm = BertLayerNorm(
+            config.hidden_size,
+            eps=config.layernorm_epsilon) if config.pre_ln else None
+
+    def _final_layernorm(self, hidden_states):
+        """Apply ``self.LayerNorm`` if present, honoring ``fp32_layernorm``."""
+        if self.LayerNorm is None:
+            return hidden_states
+        if not self.fp32_layernorm:
+            return self.LayerNorm(hidden_states)
+        original_type = hidden_states.type()
+        return self.LayerNorm(hidden_states.float()).type(original_type)
+
+    def forward(self, hidden_states, attention_mask,
+                output_all_encoded_layers=True, checkpoint_activations=False,
+                detach_index=-1, pruning_threshold=None):
+        """Run every layer and return the collected hidden states as a list.
+
+        Args:
+            hidden_states: input to the first layer.
+            attention_mask: mask handed to every layer.
+            output_all_encoded_layers: if True (and not checkpointing), the
+                output of each layer is returned; otherwise only the last.
+            checkpoint_activations: run each layer via ``mpu.checkpoint``.
+            detach_index: index of the layer whose output is detached in place.
+            pruning_threshold: forwarded to every layer on both code paths.
+        """
+        all_encoder_layers = []
+
+        def custom(index):
+            def custom_forward(*inputs):
+                return self.layer[index](
+                    inputs[0], inputs[1], pruning_threshold=pruning_threshold)
+            return custom_forward
+
+        if checkpoint_activations:
+            for i in range(len(self.layer)):
+                hidden_states = mpu.checkpoint(
+                    custom(i), hidden_states, attention_mask * 1)
+                if detach_index == i:
+                    hidden_states.detach_()
+        else:
+            for i, layer_module in enumerate(self.layer):
+                # The threshold used to be dropped here, so pruning was only
+                # honored on the checkpointing path.
+                hidden_states = layer_module(
+                    hidden_states, attention_mask,
+                    pruning_threshold=pruning_threshold)
+                if detach_index == i:
+                    hidden_states.detach_()
+                if i == len(self.layer) - 1:
+                    hidden_states = self._final_layernorm(hidden_states)
+                if output_all_encoded_layers:
+                    all_encoder_layers.append(hidden_states)
+
+        if not output_all_encoded_layers or checkpoint_activations:
+            # NOTE(review): on the non-checkpoint path the last output was
+            # already normalized above, so LayerNorm runs twice; kept as is.
+            hidden_states = self._final_layernorm(hidden_states)
+            all_encoder_layers.append(hidden_states)
+        return all_encoder_layers
+
+
+class BertPooler(nn.Module):
+    """Pool a sequence by transforming the hidden state of its first token."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states):
+        """Return ``tanh(dense(hidden_states[:, 0]))``."""
+        # Index 0 along dim 1 selects the first token of each sequence.
+        first_token = hidden_states[:, 0]
+        projected = self.dense(first_token)
+        return self.activation(projected)
+
+
+class BertPredictionHeadTransform(nn.Module):
+    """Dense layer, activation and LayerNorm applied before the LM decoder."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        act = config.hidden_act
+        # ``hidden_act`` is either a key of ACT2FN or already a callable.
+        self.transform_act_fn = ACT2FN[act] if isinstance(act, str) else act
+        self.LayerNorm = BertLayerNorm(
+            config.hidden_size, eps=config.layernorm_epsilon)
+        self.fp32_layernorm = config.fp32_layernorm
+
+    def forward(self, hidden_states):
+        """Return ``LayerNorm(transform_act_fn(dense(hidden_states)))``."""
+        activated = self.transform_act_fn(self.dense(hidden_states))
+        if not self.fp32_layernorm:
+            return self.LayerNorm(activated)
+        # Normalize in float32, then cast back to the activation's type.
+        original_type = activated.type()
+        return self.LayerNorm(activated.float()).type(original_type)
+
+
+class BertLMPredictionHead(nn.Module):
+    """LM head whose decoder weight is the given (shared) embedding matrix.
+
+    With ``config.fp32_embedding`` set, inputs, weight and bias are cast via
+    ``.half()`` and ``self.transform`` is converted to half on first use.
+    """
+
+    def __init__(self, config, bert_model_embedding_weights):
+        super().__init__()
+        self.transform = BertPredictionHeadTransform(config)
+
+        # Output weights are shared with the input embeddings; only the
+        # per-token bias is owned by this module.
+        self.decoder_weight = bert_model_embedding_weights
+        num_rows = bert_model_embedding_weights.size(0)
+        self.bias = nn.Parameter(torch.zeros(num_rows))
+        self.bias.model_parallel = True
+        self.fp32_embedding = config.fp32_embedding
+        self.fp32_layernorm = config.fp32_layernorm
+        # The converter reads ``self.fp32_embedding`` at call time.
+        self.type_converter = (
+            lambda tensor: tensor.half() if self.fp32_embedding else tensor)
+        self.converted = False
+        self.timers = SynchronizedWallClockTimer()
+
+    def forward(self, hidden_states):
+        """Transform ``hidden_states`` and project them with the tied weight."""
+        if not self.converted:
+            # One-time module conversion, done lazily on the first call.
+            self.converted = True
+            if self.fp32_embedding:
+                self.transform.half()
+                if self.fp32_layernorm:
+                    self.transform.LayerNorm.float()
+        cast = self.type_converter
+        hidden_states = self.transform(cast(hidden_states))
+        self.timers('final linear gather').start()
+        hidden_states = mpu.copy_to_model_parallel_region(hidden_states)
+        self.timers('final linear gather').stop()
+        return F.linear(
+            cast(hidden_states), cast(self.decoder_weight), cast(self.bias))
+
+
+class BertPreTrainingHeads(nn.Module):
+    """LM prediction head plus a 3-way sequence-relationship classifier."""
+
+    def __init__(self, config, bert_model_embedding_weights):
+        super().__init__()
+        self.predictions = BertLMPredictionHead(
+            config, bert_model_embedding_weights)
+        self.seq_relationship = nn.Linear(config.hidden_size, 3)
+
+    def forward(self, sequence_output, pooled_output):
+        """Return ``(prediction_scores, seq_relationship_score)``."""
+        prediction_scores = self.predictions(sequence_output)
+        for param in self.seq_relationship.parameters():
+            if param is not None:
+                pooled_output = pooled_output.type_as(param)
+        return prediction_scores, self.seq_relationship(pooled_output)
+
+
+class PreTrainedBertModel(nn.Module):
+    """Abstract base: validates the config and provides weight initialization."""
+
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__()
+        if not isinstance(config, (PlugNLUConfig, PlugNLGConfig)):
+            raise ValueError(
+                'Parameter config in `{}(config)` should be an instance of '
+                'class `PlugNLUConfig` or `PlugNLGConfig`.'.format(
+                    self.__class__.__name__))
+        self.config = config
+
+    def init_bert_weights(self, module):
+        """Initialize ``module`` in place (meant for ``self.apply``).
+
+        Linear/Embedding weights are drawn from a normal distribution with
+        std ``config.initializer_range``; BertLayerNorm is reset to weight 1
+        and bias 0; Linear biases are zeroed.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Plain normal init; the TF version uses truncated_normal,
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(
+                mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, BertLayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+
+class BertModel(PreTrainedBertModel):
+    """BERT encoder: embeddings followed by a stack of transformer layers.
+
+    Args:
+        config: configuration object accepted by ``PreTrainedBertModel``.
+
+    Sub-modules:
+        embeddings: ``BertEmbeddings``, called with
+            ``(input_ids, token_type_ids)``.
+        encoder: ``BertEncoder`` producing the hidden states.
+        pooler: ``BertPooler``. It is constructed, so its weights belong to
+            the module, but ``forward`` only looks at the dtype of its
+            parameters and never calls it.
+
+    Every sub-module is initialized with ``init_bert_weights`` on creation.
+    """
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.embeddings = BertEmbeddings(config)
+        self.encoder = BertEncoder(config)
+        self.pooler = BertPooler(config)
+        self.apply(self.init_bert_weights)
+
+    def forward(self,
+                input_ids,
+                token_type_ids=None,
+                attention_mask=None,
+                output_all_encoded_layers=True,
+                checkpoint_activations=False,
+                detach_index=-1,
+                pruning_threshold=None):
+        """Encode ``input_ids``.
+
+        Args:
+            input_ids: token indices of shape [batch_size, sequence_length].
+            token_type_ids: optional tensor shaped like ``input_ids`` holding
+                the token type (segment) indices; defaults to all zeros.
+            attention_mask: optional tensor shaped like ``input_ids`` with 1
+                for positions to attend to and 0 for masked (padding)
+                positions; defaults to all ones.
+            output_all_encoded_layers: selects the form of the first return
+                value, see below. Default: ``True``.
+            checkpoint_activations: passed on to ``self.encoder``.
+            detach_index: passed on to ``self.encoder``.
+            pruning_threshold: passed on to ``self.encoder``.
+
+        Returns:
+            Tuple ``(encoded_layers, pooled_output)``.
+
+            encoded_layers: the list returned by the encoder when
+                ``output_all_encoded_layers`` is True and
+                ``checkpoint_activations`` is False; otherwise only the last
+                element of that list.
+            pooled_output: ``sequence_output[:, 0]``, i.e. the last encoder
+                output at sequence position 0, after casting it to the type
+                of the first pooler parameter.
+
+        NOTE(review): ``self.pooler`` is not applied to ``pooled_output`` —
+        TODO confirm that returning the raw first-position state is intended.
+        """
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+
+        # Build an additive attention bias from the 2D mask. Two singleton
+        # dims are inserted at positions 1 and 2, giving
+        # [batch_size, 1, 1, to_seq_length], which broadcasts to
+        # [batch_size, num_heads, from_seq_length, to_seq_length]. This is
+        # simpler than the triangular mask of causal attention: only the
+        # broadcast dimensions have to be prepared.
+        additive_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+        # The mask is 1.0 where we attend and 0.0 where we do not, so the
+        # expression below yields 0.0 and -10000.0 respectively. Added to the
+        # raw scores before the softmax, this effectively removes the masked
+        # positions. The cast to the encoder's parameter dtype keeps fp16
+        # models working.
+        encoder_dtype = next(self.encoder.parameters()).dtype
+        additive_mask = (1.0 - additive_mask.to(dtype=encoder_dtype)) * -10000.0
+
+        embedding_output = self.embeddings(input_ids, token_type_ids)
+        encoded_layers = self.encoder(
+            embedding_output,
+            additive_mask,
+            output_all_encoded_layers=output_all_encoded_layers,
+            checkpoint_activations=checkpoint_activations,
+            detach_index=detach_index,
+            pruning_threshold=pruning_threshold)
+
+        sequence_output = encoded_layers[-1]
+        # Match the type of the first pooler parameter, if there is one.
+        pooler_param = next(
+            (p for p in self.pooler.parameters() if p is not None), None)
+        if pooler_param is not None:
+            sequence_output = sequence_output.type_as(pooler_param)
+        pooled_output = sequence_output[:, 0]
+
+        only_last = not output_all_encoded_layers or checkpoint_activations
+        if only_last:
+            encoded_layers = encoded_layers[-1]
+        return encoded_layers, pooled_output
+
+
+class DecodeLayer(nn.Module):
+    """Decoder layer: self-attention, cross-attention and feed-forward.
+
+    Each of the three sub-blocks is preceded by its own LayerNorm and wrapped
+    in a residual connection. ``config.ft_module`` can switch the pruning
+    method of individual sub-blocks to ``'finetune'``, and
+    ``config.pruning_module`` decides whether the self-attention and the
+    feed-forward layers receive a pruning method at all.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        init_method = normal_init_method(
+            mean=0.0, std=config.initializer_range)
+        output_layer_init_method = scaled_init_method(
+            mean=0.0,
+            std=config.initializer_range,
+            num_layers=config.num_hidden_layers)
+
+        ft_module = config.ft_module
+
+        def method_for(block_name):
+            # Sub-blocks named in ``config.ft_module`` use 'finetune'.
+            if ft_module is not None and block_name in ft_module:
+                return 'finetune'
+            return config.pruning_method
+
+        self_pruning_method = method_for('decoder_self')
+        cross_pruning_method = method_for('decoder_cross')
+        ffn_pruning_method = method_for('decoder_ffn')
+
+        # Self-attention and feed-forward layers only get a pruning method
+        # for matching values of ``config.pruning_module``.
+        if config.pruning_module not in [
+                'all', 'decoder', 'decoder_self', 'decoder_self+ffn']:
+            self_pruning_method = None
+        if config.pruning_module not in [
+                'all', 'decoder', 'decoder_ffn', 'decoder_self+ffn']:
+            ffn_pruning_method = None
+
+        # Keyword arguments shared by every prunable sub-module.
+        shared_kwargs = dict(
+            pruning_mask_init=config.pruning_mask_init,
+            pruning_mask_scale=config.pruning_mask_scale,
+            LR_weight_rank=config.LR_weight_rank,
+            LR_mask_rank=config.LR_mask_rank)
+
+        self.attention = mpu.GPT2ParallelSelfAttention(
+            hidden_size=config.hidden_size,
+            num_attention_heads=config.num_attention_heads,
+            attention_dropout_prob=config.attention_probs_dropout_prob,
+            output_dropout_prob=config.hidden_dropout_prob,
+            init_method=init_method,
+            output_layer_init_method=output_layer_init_method,
+            pruning_method=self_pruning_method,
+            **shared_kwargs)
+
+        self.cross_attention = mpu.PalmParallelCrossAttention(
+            hidden_size=config.hidden_size,
+            num_attention_heads=config.num_attention_heads,
+            attention_dropout_prob=config.attention_probs_dropout_prob,
+            output_dropout_prob=config.hidden_dropout_prob,
+            init_method=init_method,
+            attn_separate=False,
+            output_layer_init_method=output_layer_init_method,
+            pruning_method=cross_pruning_method,
+            pruning_module=config.pruning_module,
+            **shared_kwargs)
+
+        self.input_layernorm = BertLayerNorm(
+            config.hidden_size, eps=config.layernorm_epsilon)
+        self.post_attention_layernorm = BertLayerNorm(
+            config.hidden_size, eps=config.layernorm_epsilon)
+        self.post_cross_attention_layernorm = BertLayerNorm(
+            config.hidden_size, eps=config.layernorm_epsilon)
+
+        self.intermediate = mpu.ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            gather_output=False,
+            init_method=init_method,
+            pruning_method=ffn_pruning_method,
+            **shared_kwargs)
+        hidden_act = config.hidden_act
+        self.intermediate_act_fn = ACT2FN[hidden_act] if isinstance(
+            hidden_act, str) else hidden_act
+        self.output = mpu.RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            input_is_parallel=True,
+            init_method=output_layer_init_method,
+            pruning_method=ffn_pruning_method,
+            **shared_kwargs)
+
+        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
+        self.fp32_layernorm = config.fp32_layernorm
+        # Casts LayerNorm inputs to float32 when ``fp32_layernorm`` is set;
+        # the flag is read at call time.
+        self.type_converter = (
+            lambda tensor: tensor.float() if self.fp32_layernorm else tensor)
+
+    def forward(self, hidden_states, enc_hidden_states, enc_attn_mask,
+                dec_attn_mask, is_infer=False, pruning_threshold=None):
+        """Apply self-attention, cross-attention and feed-forward in turn.
+
+        Args:
+            hidden_states: decoder input of this layer.
+            enc_hidden_states: encoder output given to the cross-attention.
+            enc_attn_mask: mask passed to ``self.cross_attention``.
+            dec_attn_mask: mask passed to ``self.attention``.
+            is_infer: forwarded to ``self.attention``.
+            pruning_threshold: forwarded to every prunable sub-module.
+
+        Returns:
+            The layer input plus the three residual updates.
+        """
+        # With fp32_layernorm, LayerNorm outputs are cast back to this type.
+        previous_type = hidden_states.type()
+
+        def normalize(layernorm, tensor):
+            normed = layernorm(self.type_converter(tensor))
+            if self.fp32_layernorm:
+                normed = normed.type(previous_type)
+            return normed
+
+        # 1) Self-attention block.
+        attention_output = self.attention(
+            normalize(self.input_layernorm, hidden_states),
+            dec_attn_mask,
+            is_infer=is_infer,
+            pruning_threshold=pruning_threshold)
+        hidden_states = hidden_states + attention_output
+
+        # 2) Cross-attention block over the encoder states.
+        cross_output = self.cross_attention(
+            normalize(self.post_attention_layernorm, hidden_states),
+            enc_hidden_states,
+            enc_attn_mask,
+            pruning_threshold=pruning_threshold)
+        hidden_states = hidden_states + cross_output
+
+        # 3) Feed-forward block; ``self.dropout`` is only used here.
+        ffn_output = self.intermediate(
+            normalize(self.post_cross_attention_layernorm, hidden_states),
+            pruning_threshold=pruning_threshold)
+        ffn_output = self.intermediate_act_fn(ffn_output)
+        ffn_output = self.output(ffn_output, pruning_threshold=pruning_threshold)
+        return hidden_states + self.dropout(ffn_output)
+
+
+class BertDecoder(nn.Module):
+    """``config.dec_hidden_layers`` ``DecodeLayer`` modules plus a final
+    LayerNorm.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        decoder_layers = [
+            DecodeLayer(config) for _ in range(config.dec_hidden_layers)
+        ]
+        self.layer = nn.ModuleList(decoder_layers)
+        self.final_layernorm = BertLayerNorm(
+            config.hidden_size, eps=config.layernorm_epsilon)
+        self.fp32_layernorm = config.fp32_layernorm
+
+    def forward(self, hidden_states, enc_hidden_states, enc_attn_mask,
+                dec_attn_mask, checkpoint_activations=False,
+                output_all_encoded_layers=False, is_infer=False,
+                pruning_threshold=None):
+        """Run all decoder layers, then the final LayerNorm.
+
+        Args:
+            hidden_states: input to the first decoder layer.
+            enc_hidden_states: encoder output passed to every layer.
+            enc_attn_mask: encoder mask passed to every layer.
+            dec_attn_mask: decoder mask passed to every layer.
+            checkpoint_activations: run each layer via ``mpu.checkpoint``.
+            output_all_encoded_layers: accepted but not used.
+            is_infer: forwarded to every layer.
+            pruning_threshold: forwarded to every layer.
+
+        Returns:
+            A one-element list with the normalized final hidden states.
+        """
+
+        def custom(index):
+            # dec_attn_mask, is_infer and pruning_threshold come from closure.
+            def custom_forward(*inputs):
+                return self.layer[index](
+                    inputs[0], inputs[1], inputs[2], dec_attn_mask * 1,
+                    is_infer=is_infer, pruning_threshold=pruning_threshold)
+
+            return custom_forward
+
+        pre_enc_hidden = enc_hidden_states.data
+        if checkpoint_activations:
+            for layer_idx in range(len(self.layer)):
+                hidden_states = mpu.checkpoint(
+                    custom(layer_idx), hidden_states, enc_hidden_states,
+                    enc_attn_mask * 1)
+                # Re-attach the data captured before the loop.
+                enc_hidden_states.data = pre_enc_hidden
+        else:
+            for layer_module in self.layer:
+                hidden_states = layer_module(
+                    hidden_states,
+                    enc_hidden_states,
+                    enc_attn_mask,
+                    dec_attn_mask,
+                    is_infer=is_infer,
+                    pruning_threshold=pruning_threshold)
+
+        if not self.fp32_layernorm:
+            return [self.final_layernorm(hidden_states)]
+        original_type = hidden_states.type()
+        normed = self.final_layernorm(hidden_states.float())
+        return [normed.type(original_type)]
+
+
+class DecodeModel(PreTrainedBertModel):
+    """Wraps ``BertDecoder``: embeds the target ids and decodes them."""
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.decoder = BertDecoder(config)
+        self.apply(self.init_bert_weights)
+
+    def forward(self, embeddings, sequence_output, decode_input_ids,
+                position_ids=None, enc_attn_mask=None, dec_attn_mask=None,
+                checkpoint_activations=False, is_infer=False,
+                pruning_threshold=None):
+        """Decode ``decode_input_ids`` against the encoder ``sequence_output``.
+
+        ``embeddings`` is a callable applied to ``decode_input_ids``.
+        ``enc_attn_mask`` is required despite its None default: it is turned
+        into an additive mask (0.0 where it is 1, -10000.0 where it is 0).
+        ``position_ids`` is not used. Returns the last element of the list
+        produced by ``self.decoder``.
+        """
+        additive_mask = enc_attn_mask.unsqueeze(1).unsqueeze(2)
+        decoder_dtype = next(self.decoder.parameters()).dtype
+        # Cast to the decoder's parameter dtype for fp16 compatibility.
+        additive_mask = (1.0 - additive_mask.to(dtype=decoder_dtype)) * -10000.0
+
+        # NOTE(review): ``checkpoint_activations`` is accepted, but a literal
+        # False is handed to the decoder — TODO confirm this is intentional.
+        decoder_outputs = self.decoder(
+            embeddings(decode_input_ids), sequence_output, additive_mask,
+            dec_attn_mask, checkpoint_activations=False, is_infer=is_infer,
+            pruning_threshold=pruning_threshold)
+        return decoder_outputs[-1]
+
+
+class PalmForPreTraining(PreTrainedBertModel):
+    """Encoder-decoder model: ``BertModel`` encoder, pre-training heads and a
+    ``DecodeModel`` decoder whose logits reuse the word-embedding matrix.
+    """
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.bert = BertModel(config)
+        word_embedding_weight = self.bert.embeddings.word_embeddings.weight
+        self.cls = BertPreTrainingHeads(config, word_embedding_weight)
+        self.decoder = DecodeModel(config)
+        self.apply(self.init_bert_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None,
+                decode_input_ids=None, position_ids=None,
+                decode_attention_mask=None, lm_labels=None,
+                checkpoint_activations=False, is_infer=False,
+                sequence_output=None, parallel_output=True,
+                pruning_threshold=None):
+        """Encode ``input_ids`` (unless cached) and decode ``decode_input_ids``.
+
+        When ``sequence_output`` is given it is used as the encoder output and
+        the encoder and heads are skipped, so ``prediction_scores`` is None.
+        ``lm_labels`` is accepted but not used.
+        ``checkpoint_activations`` and ``pruning_threshold`` go to both the
+        encoder and the decoder; ``is_infer`` only to the decoder.
+
+        Returns:
+            ``(prediction_scores, logits)``; logits are gathered across the
+            model-parallel region unless ``parallel_output`` is True. With
+            ``is_infer`` and no ``parallel_output`` the encoder output is
+            appended as a third element.
+        """
+        if sequence_output is None:
+            sequence_output, pooled_output = self.bert(
+                input_ids,
+                token_type_ids,
+                attention_mask,
+                output_all_encoded_layers=False,
+                checkpoint_activations=checkpoint_activations,
+                pruning_threshold=pruning_threshold)
+            prediction_scores, _ = self.cls(sequence_output, pooled_output)
+        else:
+            prediction_scores = None
+            decoder_dtype = next(self.decoder.parameters()).dtype
+            sequence_output = sequence_output.to(dtype=decoder_dtype)
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+        decode_output = self.decoder(
+            self.bert.embeddings, sequence_output, decode_input_ids,
+            position_ids, attention_mask, decode_attention_mask,
+            checkpoint_activations=checkpoint_activations, is_infer=is_infer,
+            pruning_threshold=pruning_threshold)
+
+        logits_parallel = F.linear(
+            mpu.copy_to_model_parallel_region(decode_output),
+            self.bert.embeddings.word_embeddings.weight)
+        if parallel_output:
+            return prediction_scores, logits_parallel
+        gathered = mpu.gather_from_model_parallel_region(logits_parallel)
+        if is_infer:
+            return prediction_scores, gathered, sequence_output
+        return prediction_scores, gathered
+
+
+class PlugModel(torch.nn.Module):
+    """Thin wrapper around ``PalmForPreTraining``; ``state_dict`` and
+    ``load_state_dict`` delegate to the wrapped model.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.model = PalmForPreTraining(self.config)
+
+    def forward(self, input_tokens, token_type_ids=None, attention_mask=None,
+                target_tokens=None, position_ids=None,
+                decode_attention_mask=None, checkpoint_activations=False,
+                is_infer=False, sequence_output=None, parallel_output=True,
+                pruning_threshold=None):
+        """Call the wrapped model; returns whatever it returns.
+
+        ``input_tokens``/``target_tokens`` are passed as the wrapped model's
+        ``input_ids``/``decode_input_ids``. ``pruning_threshold`` (new,
+        default None) is forwarded too; it used to be unreachable from here.
+        """
+        return self.model(
+            input_tokens, token_type_ids, attention_mask, target_tokens,
+            position_ids, decode_attention_mask,
+            checkpoint_activations=checkpoint_activations, is_infer=is_infer,
+            sequence_output=sequence_output, parallel_output=parallel_output,
+            pruning_threshold=pruning_threshold)
+
+    def state_dict(self, destination=None, prefix='', keep_vars=False):
+        """Return the wrapped model's state dict (same arguments)."""
+        return self.model.state_dict(
+            destination=destination, prefix=prefix, keep_vars=keep_vars)
+
+    def load_state_dict(self, state_dict, strict=True):
+        """Load ``state_dict`` directly into the wrapped model."""
+        return self.model.load_state_dict(state_dict, strict=strict)
diff --git a/modelscope/models/nlp/ponet/__init__.py b/modelscope/models/nlp/ponet/__init__.py
new file mode 100644
index 00000000..6d26b194
--- /dev/null
+++ b/modelscope/models/nlp/ponet/__init__.py
@@ -0,0 +1,41 @@
+# Copyright 2021-2022 The Alibaba DAMO Team Authors.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    # Seen by static analysers only: TYPE_CHECKING is False at runtime.
+    from .configuration_ponet import PoNetConfig
+    from .modeling_ponet import (PoNetForMaskedLM, PoNetModel,
+                                 PoNetPreTrainedModel)
+    from .tokenization_ponet import PoNetTokenizer
+else:
+    import sys
+
+    # Maps each submodule name to the public names it provides.
+    _import_structure = dict(
+        configuration_ponet=['PoNetConfig'],
+        modeling_ponet=[
+            'PoNetForMaskedLM', 'PoNetModel', 'PoNetPreTrainedModel'
+        ],
+        tokenization_ponet=['PoNetTokenizer'],
+    )
+
+    # Swap this package's entry in ``sys.modules`` for a LazyImportModule
+    # built from the table above.
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/nlp/ponet/configuration_ponet.py b/modelscope/models/nlp/ponet/configuration_ponet.py
new file mode 100644
index 00000000..70294fc2
--- /dev/null
+++ b/modelscope/models/nlp/ponet/configuration_ponet.py
@@ -0,0 +1,117 @@
+# Copyright 2021-2022 The Alibaba DAMO Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PoNet model configuration, mainly copied from :class:`~transformers.BertConfig` """
+from transformers import PretrainedConfig
+
+from modelscope.utils import logger as logging
+
+logger = logging.get_logger(__name__)
+
+
+class PoNetConfig(PretrainedConfig):
+    r"""
+    Configuration container for :class:`~modelscope.models.nlp.ponet.PoNetModel`.
+
+    Each argument listed below is stored unchanged as an attribute of the same name. ``pad_token_id`` and any
+    additional keyword arguments are forwarded to :class:`~transformers.PretrainedConfig`; see that class for
+    the options it understands. The argument set mirrors :class:`~transformers.BertConfig` plus ``clsgsepg``.
+
+    Args:
+        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+            Number of rows of the word-embedding table, i.e. how many distinct token ids can be embedded.
+        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            Width of the embeddings and of the encoder / pooler linear layers.
+        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of stacked encoder layers.
+        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+            Number of heads the hidden dimension is split into inside each attention block.
+        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+            Width of the feed-forward ("intermediate") layer in each encoder layer.
+        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+            Activation of the feed-forward layer. A string is looked up in ``transformers.activations.ACT2FN``;
+            a callable is used as given.
+        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            Dropout probability applied in the embeddings and after the dense projections of each sub-layer.
+        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout ratio for the attention probabilities. NOTE(review): kept from the BERT config;
+            confirm that a PoNet module actually reads it.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+            Number of rows of the position-embedding table, hence the longest sequence that can be embedded.
+        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
+            Number of rows of the token-type embedding table (distinct values allowed in ``token_type_ids``).
+        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+            Epsilon passed to every ``LayerNorm``.
+        pad_token_id (:obj:`int`, `optional`, defaults to 0):
+            Id of the padding token; forwarded to the base class and used as ``padding_idx`` of the word
+            embeddings.
+        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
+            With :obj:`"absolute"` the learned position embeddings are added to the input embeddings; with any
+            other value the embedding layer skips that addition.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if ``config.is_decoder=True``.
+        classifier_dropout (:obj:`float`, `optional`):
+            The dropout ratio for the classification head.
+        clsgsepg (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to mask the delimiter-token positions around the local / segment pooling, a trick that
+            keeps segment and local information from leaking.
+    """
+    model_type = 'ponet'
+
+    def __init__(self,
+                 vocab_size=30522,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.02,
+                 layer_norm_eps=1e-12,
+                 pad_token_id=0,
+                 position_embedding_type='absolute',
+                 use_cache=True,
+                 classifier_dropout=None,
+                 clsgsepg=True,
+                 **kwargs):
+        # The base class consumes pad_token_id together with all generic options.
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+        # Model geometry.
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+
+        # Regularisation.
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+
+        # Embedding tables, initialisation and normalisation.
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+
+        # Miscellaneous switches.
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+        self.clsgsepg = clsgsepg
diff --git a/modelscope/models/nlp/ponet/modeling_ponet.py b/modelscope/models/nlp/ponet/modeling_ponet.py
new file mode 100644
index 00000000..f37954db
--- /dev/null
+++ b/modelscope/models/nlp/ponet/modeling_ponet.py
@@ -0,0 +1,1591 @@
+# Copyright 2021-2022 The Alibaba DAMO Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch PoNet model. """
+
+import math
+from dataclasses import dataclass
+from distutils.version import LooseVersion
+from typing import Optional, Tuple
+
+import torch
+import torch.utils.checkpoint
+from packaging import version
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.activations import ACT2FN
+from transformers.file_utils import (ModelOutput, add_code_sample_docstrings,
+                                     add_start_docstrings,
+                                     add_start_docstrings_to_model_forward,
+                                     replace_return_docstrings)
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions, MaskedLMOutput,
+    SequenceClassifierOutput, TokenClassifierOutput)
+from transformers.modeling_utils import (PreTrainedModel,
+                                         apply_chunking_to_forward,
+                                         find_pruneable_heads_and_indices,
+                                         prune_linear_layer)
+from transformers.models.bert.modeling_bert import \
+    load_tf_weights_in_bert as load_tf_weights_in_ponet
+
+from modelscope.utils.logger import get_logger
+from .configuration_ponet import PoNetConfig
+
+# Module-level logger.
+logger = get_logger(__name__)
+
+# True when the installed torch compares >= 1.12.0; `segment_max` reads this
+# flag to pick the `Tensor.scatter_reduce` path over its manual fallback.
+is_pytorch_12plus = LooseVersion(torch.__version__) >= LooseVersion('1.12.0')
+
+# Names intended for the docstring helper decorators.
+# NOTE(review): their use is outside this part of the file - confirm.
+_CHECKPOINT_FOR_DOC = 'ponet-base-uncased'
+_CONFIG_FOR_DOC = 'PoNetConfig'
+_TOKENIZER_FOR_DOC = 'PoNetTokenizer'
+
+# Default token ids treated as segment delimiters by `get_segment_index` and
+# `get_token_type_mask`.
+# NOTE(review): 101 / 102 look like [CLS] / [SEP] of the BERT uncased vocab -
+# confirm against the PoNet tokenizer.
+CLS_ID = 101
+EOS_ID = 102
+
+
+def segment_max(src, index, dim=1):
+    """Replace every position of ``src`` by the max over its segment.
+
+    Args:
+        src: 3-D tensor; the indexing below treats it as
+            ``(batch, seq, features)``.
+        index: 2-D integer tensor ``(batch, seq)`` holding the segment id of
+            each position.
+        dim: dimension handed to ``scatter_reduce`` / ``gather``. The
+            pre-1.12 fallback always reduces over dimension 1 regardless.
+
+    Returns:
+        Tensor with the shape and dtype of ``src`` where each entry is the
+        feature-wise maximum of ``src`` over all positions of the same batch
+        row that share its segment id.
+    """
+    per_feature_index = index[:, :, None].expand_as(src)
+
+    if is_pytorch_12plus:
+        # Row ``s`` of ``out`` ends up holding the max of segment ``s``.
+        out = torch.zeros_like(src).scatter_reduce(
+            dim, per_feature_index, src, reduce='amax', include_self=False)
+    else:
+        # Manual fallback: materialise a (batch, seq, n_segments, features)
+        # view, overwrite entries belonging to another segment with a value
+        # below everything in ``src`` and take the max over the seq axis.
+        filler = (src.min() - 1).item()
+        n_segments = index.max() + 1
+        wide_shape = (*src.shape[:-1], n_segments, src.shape[-1])
+
+        index_wide = per_feature_index.unsqueeze(-2).expand(*wide_shape)
+        segment_ids = torch.arange(
+            n_segments, device=src.device)[None, None, :, None]
+        segment_ids_wide = segment_ids.expand(*wide_shape)
+        src_wide = src.unsqueeze(-2).expand(*wide_shape)
+
+        other_segment = index_wide != segment_ids_wide
+        out, _ = src_wide.masked_fill(other_segment, filler).max(dim=1)
+
+    # Broadcast each segment's maximum back to the positions of that segment.
+    gather_index = index.unsqueeze(-1).expand(*index.shape[:2], out.size(-1))
+    return torch.gather(out, dim, gather_index).to(dtype=src.dtype)
+
+
+def get_segment_index(input_ids, cls_id=CLS_ID, eos_id=EOS_ID):
+    """Assign a running segment id to every token of ``input_ids``.
+
+    The id grows by one at every ``cls_id`` / ``eos_id`` token and once more
+    at the token that follows it, so each delimiter sits in a segment of its
+    own. For example ids ``[cls, a, b, eos, c, eos]`` give
+    ``[0, 1, 1, 2, 3, 4]``.
+
+    Args:
+        input_ids: 2-D integer tensor ``(batch, seq)``.
+        cls_id: id of the first delimiter token.
+        eos_id: id of the second delimiter token.
+
+    Returns:
+        Long tensor shaped like ``input_ids``. Rows that do not begin with a
+        delimiter start at ``-1``.
+    """
+    is_cls = (input_ids == cls_id).to(dtype=torch.long)
+    is_eos = (input_ids == eos_id).to(dtype=torch.long)
+    boundary = is_cls + is_eos
+
+    # The same indicator shifted right by one position, zero-filled in front.
+    leading_zero = torch.zeros_like(boundary[:, 0:1])
+    after_boundary = torch.cat([leading_zero, boundary[:, :-1]], dim=1)
+
+    steps = boundary + after_boundary
+    return steps.cumsum(dim=1) - 1
+
+
+def get_token_type_mask(input_ids, cls_id=CLS_ID, eos_id=EOS_ID):
+    """Return a boolean mask that is True where ``input_ids`` equals
+    ``cls_id`` or ``eos_id`` and False elsewhere."""
+    is_cls = input_ids == cls_id
+    is_eos = input_ids == eos_id
+    return is_cls | is_eos
+
+
+def get_win_max(hidden_states, kernel_size=3):
+    """Sliding-window maximum along the sequence axis.
+
+    Args:
+        hidden_states: tensor laid out as ``(batch, seq, features)``.
+        kernel_size: window width; the window is centred on each position
+            (stride 1, ``kernel_size // 2`` implicit padding on both sides).
+
+    Returns:
+        Tensor in the same ``(batch, seq, features)`` layout holding, per
+        feature, the maximum inside the window around each position.
+    """
+    # max_pool1d pools over the last axis, so move the sequence axis there.
+    channels_first = hidden_states.permute(0, 2, 1)
+    pooled = nn.functional.max_pool1d(
+        channels_first, kernel_size, stride=1, padding=kernel_size // 2)
+    return pooled.permute(0, 2, 1)
+
+
+class PoNetEmbeddings(nn.Module):
+    """Sum of word, token-type and (optionally) position embeddings,
+    followed by LayerNorm and dropout."""
+
+    def __init__(self, config):
+        super().__init__()
+        hidden = config.hidden_size
+        self.word_embeddings = nn.Embedding(
+            config.vocab_size, hidden, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, hidden)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
+                                                  hidden)
+
+        # The attribute is deliberately called ``LayerNorm`` (not snake_case)
+        # so that parameter names line up with TensorFlow checkpoints.
+        self.LayerNorm = nn.LayerNorm(hidden, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.position_embedding_type = getattr(config,
+                                               'position_embedding_type',
+                                               'absolute')
+
+        # Buffer of shape (1, max_position_embeddings) holding 0..max-1; it
+        # is persistent, so it is exported with the state dict.
+        all_positions = torch.arange(config.max_position_embeddings)
+        self.register_buffer('position_ids', all_positions.expand((1, -1)))
+
+        # Non-persistent all-zero default for ``token_type_ids``; only
+        # registered when torch is newer than 1.6.0.
+        if version.parse(torch.__version__) > version.parse('1.6.0'):
+            default_token_types = torch.zeros(
+                self.position_ids.size(),
+                dtype=torch.long,
+                device=self.position_ids.device)
+            self.register_buffer(
+                'token_type_ids', default_token_types, persistent=False)
+
+    def forward(self,
+                input_ids=None,
+                token_type_ids=None,
+                position_ids=None,
+                inputs_embeds=None,
+                past_key_values_length=0):
+        """Embed a batch of tokens.
+
+        Args:
+            input_ids: ``(batch, seq)`` token ids; may be None when
+                ``inputs_embeds`` is supplied.
+            token_type_ids: optional ``(batch, seq)`` ids; default all zero.
+            position_ids: optional position ids; defaults to the slice of
+                the ``position_ids`` buffer that starts at
+                ``past_key_values_length`` and spans ``seq`` entries.
+            inputs_embeds: optional pre-computed word embeddings used in
+                place of looking up ``input_ids``.
+            past_key_values_length: offset of the first position.
+
+        Returns:
+            The normalised, dropped-out embedding tensor.
+        """
+        input_shape = (
+            input_ids.size()
+            if input_ids is not None else inputs_embeds.size()[:-1])
+        batch_size, seq_length = input_shape[0], input_shape[1]
+
+        if position_ids is None:
+            first = past_key_values_length
+            position_ids = self.position_ids[:, first:seq_length + first]
+
+        if token_type_ids is None:
+            if hasattr(self, 'token_type_ids'):
+                # Reuse the registered zero buffer, broadcast over the batch.
+                token_type_ids = self.token_type_ids[:, :seq_length].expand(
+                    batch_size, seq_length)
+            else:
+                token_type_ids = torch.zeros(
+                    input_shape,
+                    dtype=torch.long,
+                    device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        embeddings = inputs_embeds + self.token_type_embeddings(token_type_ids)
+        if self.position_embedding_type == 'absolute':
+            embeddings += self.position_embeddings(position_ids)
+
+        return self.dropout(self.LayerNorm(embeddings))
+
+
+class PoNetSelfAttention(nn.Module):
+    """PoNet token mixing: combines a global (attention-pooled) vector, a
+    per-segment max and a sliding-window max of the projected inputs."""
+
+    def __init__(self, config):
+        super().__init__()
+        hidden = config.hidden_size
+
+        # Projections feeding the window-max and segment-max branches.
+        self.dense_local = nn.Linear(hidden, hidden)
+        self.dense_segment = nn.Linear(hidden, hidden)
+
+        self.num_attention_heads = config.num_attention_heads
+        self.clsgsepg = getattr(config, 'clsgsepg', True)
+        self.attention_head_size = int(hidden / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        # Projections feeding the global branch (q, k) and the gate (o).
+        self.dense_q = nn.Linear(hidden, self.all_head_size)
+        self.dense_k = nn.Linear(hidden, self.all_head_size)
+        self.dense_o = nn.Linear(hidden, self.all_head_size)
+
+    def transpose_for_scores(self, x):
+        """Split the last axis into heads: (..., len, heads * size) ->
+        (bz, head, len, head_size)."""
+        head_shape = (self.num_attention_heads, self.attention_head_size)
+        x = x.view(*(x.size()[:-1] + head_shape))
+        return x.permute(0, 2, 1, 3)
+
+    def forward(
+        self,
+        hidden_states,
+        segment_index,
+        token_type_mask,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        """Mix information across positions.
+
+        Args:
+            hidden_states: input tensor, viewed as (batch, len, hidden).
+            segment_index: per-position segment ids passed to
+                ``segment_max``.
+            token_type_mask: boolean mask of delimiter positions; only read
+                when ``self.clsgsepg`` is true.
+            attention_mask: optional additive mask. Entries below -1 are
+                treated as masked and the mask is added to the pooling
+                logits. NOTE(review): the squeeze/unsqueeze pattern assumes
+                a (batch, 1, 1, len) layout - confirm with the caller.
+            head_mask, encoder_hidden_states, encoder_attention_mask,
+            past_key_value: accepted for interface compatibility; not read.
+            output_attentions: also return the global pooling weights.
+
+        Returns:
+            ``(context_layer,)`` or ``(context_layer, att_prob)``.
+        """
+        query_heads = self.transpose_for_scores(self.dense_q(hidden_states))
+        key_heads = self.transpose_for_scores(self.dense_k(hidden_states))
+        value_heads = key_heads  # keys double as values
+        gate_heads = self.transpose_for_scores(self.dense_o(hidden_states))
+
+        masked = attention_mask is not None
+
+        # Global branch, step 1: average the queries over unmasked positions.
+        if masked:
+            pad_mask = attention_mask.squeeze(1).unsqueeze(-1) < -1
+            query_heads.masked_fill_(pad_mask, 0.0)
+            valid_count = torch.ones_like(pad_mask).to(
+                dtype=query_heads.dtype).masked_fill(pad_mask,
+                                                     0.0).sum(dim=-2)
+            q = query_heads.sum(dim=-2) / valid_count
+        else:
+            q = query_heads.mean(dim=-2)
+
+        # Global branch, step 2: softmax-pool the values with the mean query.
+        scale = math.sqrt(query_heads.shape[-1])
+        att = torch.einsum('bdh,bdlh -> bdl', q, key_heads) / scale
+        if masked:
+            att = att + attention_mask.squeeze(1)
+        att_prob = att.softmax(dim=-1)
+        v = torch.einsum('bdlh,bdl->bdh', value_heads, att_prob)
+
+        # Segment and local branches.
+        segment_feats = self.dense_segment(hidden_states)
+        local_feats = self.dense_local(hidden_states)
+        if masked:
+            # Push masked positions far down so they never win a max.
+            local_feats.masked_fill_(pad_mask.squeeze(1), -10000)
+            segment_feats.masked_fill_(pad_mask.squeeze(1), -10000)
+
+        if self.clsgsepg:
+            # XXX: a trick to make sure the segment and local information will not leak
+            delimiter = token_type_mask.unsqueeze(dim=-1)
+            local_feats = get_win_max(
+                local_feats.masked_fill(delimiter, -10000))
+            segment_feats = segment_max(segment_feats, index=segment_index)
+
+            segment_feats.masked_fill_(delimiter, 0.0)
+            local_feats.masked_fill_(delimiter, 0.0)
+        else:
+            local_feats = get_win_max(local_feats)
+            segment_feats = segment_max(segment_feats, index=segment_index)
+
+        local_heads = self.transpose_for_scores(local_feats)
+        segment_heads = self.transpose_for_scores(segment_feats)
+
+        # Gate (global + segment) with the ``o`` projection, then add local.
+        mixed = (v.unsqueeze(dim=-2) + segment_heads) * gate_heads + local_heads
+        context_layer = mixed.permute(0, 2, 1, 3).reshape(
+            *hidden_states.shape[:2], -1)
+
+        if masked:
+            context_layer.masked_fill_(pad_mask.squeeze(1), 0.0)
+
+        if output_attentions:
+            return (context_layer, att_prob)
+        return (context_layer, )
+
+
+class PoNetSelfOutput(nn.Module):
+    """Post-attention sub-layer: dense projection, dropout, then a residual
+    connection followed by LayerNorm."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        self.dense = nn.Linear(width, width)
+        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``."""
+        projected = self.dropout(self.dense(hidden_states))
+        return self.LayerNorm(projected + input_tensor)
+
+
+class PoNetIntermediate(nn.Module):
+    """First half of the feed-forward block: expand ``hidden_size`` to
+    ``intermediate_size`` and apply the configured activation."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        # A string names an entry of ACT2FN; anything else is used directly.
+        act = config.hidden_act
+        self.intermediate_act_fn = ACT2FN[act] if isinstance(act,
+                                                             str) else act
+
+    def forward(self, hidden_states):
+        """Return ``activation(dense(hidden_states))``."""
+        return self.intermediate_act_fn(self.dense(hidden_states))
+
+
+class PoNetOutput(nn.Module):
+    """Second half of the feed-forward block: project ``intermediate_size``
+    back to ``hidden_size``, apply dropout, add the residual and LayerNorm."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        self.dense = nn.Linear(config.intermediate_size, width)
+        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``."""
+        projected = self.dropout(self.dense(hidden_states))
+        return self.LayerNorm(projected + input_tensor)
+
+
+class PoNetAttention(nn.Module):
+    """Attention block of a PoNet layer: the PoNet token mixer
+    (``self.self``) followed by the projection / residual / LayerNorm
+    sub-layer (``self.output``)."""
+
+    # Linear layers of ``PoNetSelfAttention`` whose output rows are laid out
+    # head by head and therefore have to be pruned together.
+    _HEAD_PROJECTIONS = ('dense_q', 'dense_k', 'dense_o', 'dense_local',
+                         'dense_segment')
+
+    def __init__(self, config):
+        super().__init__()
+        self.self = PoNetSelfAttention(config)
+        self.output = PoNetSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        """Remove the given attention heads from this block.
+
+        Args:
+            heads: iterable of head indices to drop; heads that were already
+                pruned are accounted for through ``self.pruned_heads``.
+
+        The previous implementation pruned ``query`` / ``key`` / ``value``
+        attributes, which ``PoNetSelfAttention`` does not define, so every
+        call failed with ``AttributeError``. The projections that actually
+        exist are pruned instead. ``dense_local`` and ``dense_segment`` are
+        included because their outputs are split into the same heads by
+        ``transpose_for_scores``.
+        """
+        if len(heads) == 0:
+            return
+        mixer = self.self
+        heads, index = find_pruneable_heads_and_indices(
+            heads, mixer.num_attention_heads, mixer.attention_head_size,
+            self.pruned_heads)
+
+        # Prune linear layers: output rows of the per-head projections,
+        # input columns of the projection that consumes the merged heads.
+        for name in self._HEAD_PROJECTIONS:
+            setattr(mixer, name,
+                    prune_linear_layer(getattr(mixer, name), index))
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        mixer.num_attention_heads = mixer.num_attention_heads - len(heads)
+        mixer.all_head_size = mixer.attention_head_size * mixer.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states,
+        segment_index,
+        token_type_mask,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        """Run the mixer, then the output sub-layer with ``hidden_states`` as
+        the residual input.
+
+        All arguments are forwarded positionally to ``PoNetSelfAttention``.
+
+        Returns:
+            Tuple whose first element is the block output, followed by any
+            extra outputs of the mixer (the pooling weights when
+            ``output_attentions`` is true).
+        """
+        self_outputs = self.self(
+            hidden_states,
+            segment_index,
+            token_type_mask,
+            attention_mask,
+            head_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            past_key_value,
+            output_attentions,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,
+                   ) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+class PoNetLayer(nn.Module):
+    """One encoder layer: a PoNet attention block followed by a (chunkable)
+    feed-forward block.
+
+    The decoder / cross-attention code paths are carried over from the BERT
+    layer but cannot currently run, because ``is_decoder`` is forced to
+    False in ``__init__``.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = PoNetAttention(config)
+
+        # NOTE: this overwrites the attribute on the caller's config object.
+        config.is_decoder = False  # XXX: Decoder is not yet implemented.
+        self.is_decoder = config.is_decoder
+
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            # With is_decoder forced to False above, this assertion fails
+            # whenever config.add_cross_attention is true.
+            assert self.is_decoder, f'{self} should be used as a decoder model if cross attention is added'
+            self.crossattention = PoNetAttention(config)
+        self.intermediate = PoNetIntermediate(config)
+        self.output = PoNetOutput(config)
+
+    def forward(
+        self,
+        hidden_states,
+        segment_index,
+        token_type_mask,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        """Apply attention and the feed-forward block.
+
+        Returns:
+            Tuple ``(layer_output, *extra_attention_outputs)``; in decoder
+            mode (currently unreachable) the key/value cache is appended.
+        """
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:
+                                                  2] if past_key_value is not None else None
+        self_attention_outputs = self.attention(
+            hidden_states,
+            segment_index,
+            token_type_mask,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            past_key_value=self_attn_past_key_value,
+        )
+        attention_output = self_attention_outputs[0]
+
+        # if decoder, the last output is tuple of self-attn cache
+        if self.is_decoder:
+            outputs = self_attention_outputs[1:-1]
+            present_key_value = self_attention_outputs[-1]
+        else:
+            outputs = self_attention_outputs[
+                1:]  # add self attentions if we output attention weights
+
+        cross_attn_present_key_value = None
+        # NOTE(review): not reached while is_decoder is forced to False. The
+        # crossattention call below also omits the segment_index and
+        # token_type_mask arguments that PoNetAttention.forward takes in
+        # second and third position - fix before enabling a decoder.
+        if self.is_decoder and encoder_hidden_states is not None:
+            assert hasattr(
+                self, 'crossattention'
+            ), f'If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`'  # noqa *
+
+            cross_attn_past_key_value = past_key_value[
+                -2:] if past_key_value is not None else None
+            cross_attention_outputs = self.crossattention(
+                attention_output,
+                attention_mask,
+                head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                cross_attn_past_key_value,
+                output_attentions,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[
+                1:-1]  # add cross attentions if we output attention weights
+
+            # add cross-attn cache to positions 3,4 of present_key_value tuple
+            cross_attn_present_key_value = cross_attention_outputs[-1]
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        # Run the feed-forward block, optionally in chunks along the
+        # sequence dimension (seq_len_dim) of chunk_size_feed_forward.
+        layer_output = apply_chunking_to_forward(self.feed_forward_chunk,
+                                                 self.chunk_size_feed_forward,
+                                                 self.seq_len_dim,
+                                                 attention_output)
+        outputs = (layer_output, ) + outputs
+
+        # if decoder, return the attn key/values as the last output
+        if self.is_decoder:
+            outputs = outputs + (present_key_value, )
+
+        return outputs
+
+    def feed_forward_chunk(self, attention_output):
+        """Feed-forward block for one chunk: intermediate expansion followed
+        by the output projection with ``attention_output`` as residual."""
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+
+class PoNetEncoder(nn.Module):
+    """Stack of ``config.num_hidden_layers`` :class:`PoNetLayer` modules
+    applied one after another."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList(
+            [PoNetLayer(config) for _ in range(config.num_hidden_layers)])
+
+    def forward(
+        self,
+        hidden_states,
+        segment_index,
+        token_type_mask,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        """Run every layer in order.
+
+        ``segment_index``, ``token_type_mask``, ``attention_mask`` and the
+        encoder arguments are passed unchanged to each layer; ``head_mask``
+        and ``past_key_values`` are indexed per layer when given.
+
+        Returns:
+            A ``BaseModelOutputWithPastAndCrossAttentions`` when
+            ``return_dict`` is true, otherwise a tuple of the non-None
+            values among (last hidden state, cache, all hidden states,
+            self attentions, cross attentions).
+        """
+        # Accumulators are only created for the outputs that were requested.
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = (
+        ) if output_attentions and self.config.add_cross_attention else None
+
+        next_decoder_cache = () if use_cache else None
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                # Record the input of layer i (the embedding output for i == 0).
+                all_hidden_states = all_hidden_states + (hidden_states, )
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+            past_key_value = past_key_values[
+                i] if past_key_values is not None else None
+
+            if getattr(self.config, 'gradient_checkpointing',
+                       False) and self.training:
+
+                if use_cache:
+                    logger.warning(
+                        '`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting '
+                        '`use_cache=False`...')
+                    use_cache = False
+
+                # checkpoint() only forwards positional inputs, so the two
+                # trailing arguments are supplied through a closure.
+                # NOTE(review): past_key_value / output_attentions are looked
+                # up when custom_forward is called, not when it is created;
+                # a re-run during backward would see the values of the last
+                # loop iteration - confirm this is acceptable.
+                def create_custom_forward(module):
+
+                    def custom_forward(*inputs):
+                        return module(*inputs, past_key_value,
+                                      output_attentions)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(layer_module),
+                    hidden_states,
+                    segment_index,
+                    token_type_mask,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                )
+            else:
+                layer_outputs = layer_module(
+                    hidden_states,
+                    segment_index,
+                    token_type_mask,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    past_key_value,
+                    output_attentions,
+                )
+
+            # The first element of a layer's output feeds the next layer.
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1], )
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (
+                    layer_outputs[1], )
+                if self.config.add_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (
+                        layer_outputs[2], )
+
+        if output_hidden_states:
+            # Append the output of the final layer.
+            all_hidden_states = all_hidden_states + (hidden_states, )
+
+        if not return_dict:
+            # Tuple form: entries that were not requested (None) are dropped.
+            return tuple(v for v in [
+                hidden_states,
+                next_decoder_cache,
+                all_hidden_states,
+                all_self_attentions,
+                all_cross_attentions,
+            ] if v is not None)
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+class PoNetPooler(nn.Module):
+    """Project the first position of a sequence through a tanh layer."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        self.dense = nn.Linear(width, width)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states):
+        """Return ``activation(dense(hidden_states[:, 0]))``."""
+        # Only index 0 along the second dimension contributes to the
+        # pooled output; every other position is ignored.
+        return self.activation(self.dense(hidden_states[:, 0]))
+
+
+class PoNetPredictionHeadTransform(nn.Module):
+    """Dense projection, activation and LayerNorm applied in sequence."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        act = config.hidden_act
+        # A string selects an entry of ACT2FN; any other value is stored and
+        # later called as the activation itself.
+        self.transform_act_fn = ACT2FN[act] if isinstance(act, str) else act
+        self.LayerNorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states):
+        """Return ``LayerNorm(act(dense(hidden_states)))``."""
+        projected = self.dense(hidden_states)
+        activated = self.transform_act_fn(projected)
+        return self.LayerNorm(activated)
+
+
+class PoNetLMPredictionHead(nn.Module):
+    """Turn hidden states into one score per vocabulary entry."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.transform = PoNetPredictionHeadTransform(config)
+
+        # The projection is built without its own bias; a separate
+        # vocabulary-sized parameter provides the output-only bias.
+        self.decoder = nn.Linear(
+            config.hidden_size, config.vocab_size, bias=False)
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        """Apply the transform, then the vocabulary projection."""
+        return self.decoder(self.transform(hidden_states))
+
+
+class PoNetOnlyMLMHead(nn.Module):
+    """Head that exposes only the language-modeling prediction scores."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = PoNetLMPredictionHead(config)
+
+    def forward(self, sequence_output):
+        """Return the prediction scores computed from ``sequence_output``."""
+        return self.predictions(sequence_output)
+
+
+class PoNetPreTrainingHeads(nn.Module):
+    """Language-modeling head plus a 3-way linear classifier on the pooled output."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = PoNetLMPredictionHead(config)
+        self.seq_relationship = nn.Linear(config.hidden_size, 3)
+
+    def forward(self, sequence_output, pooled_output):
+        """Return ``(prediction_scores, seq_relationship_score)``."""
+        # Tuple elements are evaluated left to right: token scores first,
+        # then the pooled-output classifier.
+        return (
+            self.predictions(sequence_output),
+            self.seq_relationship(pooled_output),
+        )
+
+
+class PoNetPreTrainedModel(PreTrainedModel):
+    """
+    Common base of the PoNet models: declares the configuration class, the TF weight loader and the base model
+    prefix, and supplies the weight initialization routine.
+    """
+
+    config_class = PoNetConfig
+    load_tf_weights = load_tf_weights_in_ponet
+    base_model_prefix = 'ponet'
+    _keys_to_ignore_on_load_missing = [r'position_ids']
+
+    def _init_weights(self, module):
+        """Initialize the weights of a single ``module`` according to its type."""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            std = self.config.initializer_range
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+            return
+
+        if isinstance(module, nn.Embedding):
+            std = self.config.initializer_range
+            module.weight.data.normal_(mean=0.0, std=std)
+            pad = module.padding_idx
+            if pad is not None:
+                # The padding row is reset to zeros after the random fill.
+                module.weight.data[pad].zero_()
+            return
+
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+@dataclass
+class PoNetForPreTrainingOutput(ModelOutput):
+    """
+    Output type of :class:`PoNetForPreTraining`.
+
+    Args:
+        loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
+            Total loss, i.e. the sum of the masked language modeling loss (``mlm_loss``) and the sentence-level
+            classification loss (``sop_loss``).
+        mlm_loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
+            Masked language modeling loss.
+        sop_loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
+            Loss of the sentence-level classification head computed on the pooled output.
+        prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 3)`):
+            Prediction scores of the sentence-level classification head (three scores per example, before
+            SoftMax).
+        hidden_states
+            (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed
+            or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed
+            or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
+            sequence_length, sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    mlm_loss: Optional[torch.FloatTensor] = None
+    sop_loss: Optional[torch.FloatTensor] = None
+    prediction_logits: torch.FloatTensor = None
+    seq_relationship_logits: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+# Shared class-level documentation preamble. It is passed to
+# `add_start_docstrings` when decorating the PoNet model classes below.
+PONET_START_DOCSTRING = r"""
+
+    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+    methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
+    pruning heads etc.)
+
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+    general usage and behavior.
+
+    Parameters:
+        config (:class:`~modelscope.models.nlp.ponet.PoNetConfig`):
+            Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
+"""
+
+# Template documenting the inputs shared by the `forward` methods. The `{0}`
+# placeholder is filled through `.format(...)` with the input shape text
+# (the callers below pass 'batch_size, sequence_length').
+# NOTE(review): the `segment_ids` argument accepted by those `forward` methods
+# is not described in this template.
+PONET_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See
+            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+            details.
+
+            `What are input IDs? <../glossary.html#input-ids>`__
+        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
+            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            `What are attention masks? <../glossary.html#attention-mask>`__
+        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
+            1]``:
+
+            - 0 corresponds to a `sentence A` token,
+            - 1 corresponds to a `sentence B` token.
+
+            `What are token type IDs? <../glossary.html#token-type-ids>`_
+        position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
+            config.max_position_embeddings - 1]``.
+
+            `What are position IDs? <../glossary.html#position-ids>`_
+        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
+            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+            vectors than the model's internal embedding lookup matrix.
+        output_attentions (:obj:`bool`, `optional`):
+            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+            tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`):
+            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+            more detail.
+        return_dict (:obj:`bool`, `optional`):
+            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    'The bare PoNet Model transformer outputting raw hidden-states without any specific head on top.',
+    PONET_START_DOCSTRING,
+)
+class PoNetModel(PoNetPreTrainedModel):
+    """
+
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in `Attention is
+    all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+    To behave as a decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration
+    set to :obj:`True`. To be used in a Seq2Seq model, the model needs to be initialized with both :obj:`is_decoder`
+    argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
+    input to the forward pass.
+    """
+
+    def __init__(self, config, add_pooling_layer=True):
+        """
+        Build the embeddings, the encoder and (optionally) the pooler, then initialize the weights.
+
+        Args:
+            config: Model configuration, forwarded to every sub-module.
+            add_pooling_layer (bool): When ``False`` no pooler is created and ``forward`` yields ``None`` as the
+                pooled output.
+        """
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = PoNetEmbeddings(config)
+        self.encoder = PoNetEncoder(config)
+
+        self.pooler = PoNetPooler(config) if add_pooling_layer else None
+
+        self.init_weights()
+
+    def get_input_embeddings(self):
+        """Return the word-embedding module held by the embeddings layer."""
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        """Replace the word-embedding module of the embeddings layer with ``value``."""
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(
+        PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        segment_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        segment_ids (`optional`):
+            When given, handed to the encoder in place of the index that would otherwise be computed with
+            ``get_segment_index(input_ids)``.
+        encoder_hidden_states
+            (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers`
+            with each tuple having 4 tensors of shape :obj:
+            `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
+            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
+            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
+        use_cache (:obj:`bool`, `optional`):
+            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+            decoding (see :obj:`past_key_values`).
+        """
+        # Arguments left as None fall back to the values stored in the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else
+            self.config.output_hidden_states)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Caching is only honoured in decoder mode; otherwise it is forced off
+        # regardless of the caller's argument.
+        if self.config.is_decoder:
+            use_cache = use_cache if use_cache is not None else self.config.use_cache
+        else:
+            use_cache = False
+
+        # Exactly one of input_ids / inputs_embeds must be supplied; the
+        # (batch, sequence) shape is derived from whichever one is present.
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                'You cannot specify both input_ids and inputs_embeds at the same time'
+            )
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            batch_size, seq_length = input_shape
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            batch_size, seq_length = input_shape
+        else:
+            raise ValueError(
+                'You have to specify either input_ids or inputs_embeds')
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        # past_key_values_length: read from dimension 2 of the first cached
+        # tensor, or 0 when no cache is passed.
+        past_key_values_length = past_key_values[0][0].shape[
+            2] if past_key_values is not None else 0
+
+        # Default mask: all ones over the cached plus current positions.
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                ((batch_size, seq_length + past_key_values_length)),
+                device=device)
+        # Default token types: all zeros with the input shape.
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(
+                input_shape, dtype=torch.long, device=device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
+            attention_mask, input_shape, device)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size(
+            )
+            encoder_hidden_shape = (encoder_batch_size,
+                                    encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(
+                    encoder_hidden_shape, device=device)
+            encoder_extended_attention_mask = self.invert_attention_mask(
+                encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask,
+                                       self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+
+        # A caller-supplied segment_ids takes precedence over the computed index.
+        # NOTE(review): both helpers receive `input_ids`, which is None when
+        # only `inputs_embeds` is supplied - confirm they tolerate that input.
+        segment_index = get_segment_index(
+            input_ids) if segment_ids is None else segment_ids
+        token_type_mask = get_token_type_mask(input_ids)
+        encoder_outputs = self.encoder(
+            embedding_output,
+            segment_index,
+            token_type_mask,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        # Without a pooler the pooled output is None (it still occupies
+        # index 1 of the tuple form below).
+        pooled_output = self.pooler(
+            sequence_output) if self.pooler is not None else None
+
+        # Tuple form: (sequence_output, pooled_output, *remaining encoder outputs).
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    PoNet Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
+    sentence prediction (classification)` head.
+    """,
+    PONET_START_DOCSTRING,
+)
+class PoNetForPreTraining(PoNetPreTrainedModel):
+
+    def __init__(self, config):
+        """Create the backbone (with pooler) and the two pre-training heads, then initialize the weights."""
+        super().__init__(config)
+
+        self.ponet = PoNetModel(config)
+        self.cls = PoNetPreTrainingHeads(config)
+
+        self.init_weights()
+
+    def get_output_embeddings(self):
+        """Return the vocabulary projection layer of the language-modeling head."""
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the vocabulary projection layer of the language-modeling head."""
+        self.cls.predictions.decoder = new_embeddings
+
+    @add_start_docstrings_to_model_forward(
+        PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @replace_return_docstrings(
+        output_type=PoNetForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        segment_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        next_sentence_label=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        segment_ids (`optional`):
+            Forwarded unchanged to the underlying :class:`PoNetModel`.
+        labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`):
+            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
+            config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
+            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+        next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
+            Labels for the sentence-level classification loss computed on the pooled output. The classification
+            head emits three logits per example, so indices should be in ``[0, 1, 2]``. A loss is only computed
+            when both ``labels`` and ``next_sentence_label`` are provided.
+            NOTE(review): the meaning of each of the three classes is not defined in this module - confirm
+            against the pre-training data pipeline.
+
+        Returns:
+
+        Example::
+
+            >>> from transformers import PoNetTokenizer, PoNetForPreTraining
+            >>> import torch
+
+            >>> tokenizer = PoNetTokenizer.from_pretrained('ponet-base-uncased')
+            >>> model = PoNetForPreTraining.from_pretrained('ponet-base-uncased')
+
+            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+            >>> outputs = model(**inputs)
+
+            >>> prediction_logits = outputs.prediction_logits
+            >>> seq_relationship_logits = outputs.seq_relationship_logits
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ponet(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            segment_ids=segment_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # The first two backbone outputs are the per-token states and the
+        # pooled representation; each feeds one of the two heads.
+        sequence_output, pooled_output = outputs[:2]
+        prediction_scores, seq_relationship_score = self.cls(
+            sequence_output, pooled_output)
+
+        total_loss = None
+        masked_lm_loss = None
+        next_sentence_loss = None
+        # Losses are computed only when BOTH label tensors are supplied.
+        if labels is not None and next_sentence_label is not None:
+            loss_fct = CrossEntropyLoss()
+            masked_lm_loss = loss_fct(
+                prediction_scores.view(-1, self.config.vocab_size),
+                labels.view(-1))
+            # The sentence-level head has three output classes.
+            next_sentence_loss = loss_fct(
+                seq_relationship_score.view(-1, 3),
+                next_sentence_label.view(-1))
+            total_loss = masked_lm_loss + next_sentence_loss
+
+        # Tuple form: the three losses (when computed) are prepended to the
+        # two score tensors and the remaining backbone outputs.
+        if not return_dict:
+            output = (prediction_scores, seq_relationship_score) + outputs[2:]
+            return ((total_loss, masked_lm_loss, next_sentence_loss)
+                    + output) if total_loss is not None else output
+
+        return PoNetForPreTrainingOutput(
+            loss=total_loss,
+            mlm_loss=masked_lm_loss,
+            sop_loss=next_sentence_loss,
+            prediction_logits=prediction_scores,
+            seq_relationship_logits=seq_relationship_score,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """PoNet Model with a `language modeling` head on top for CLM fine-tuning. """,
+    PONET_START_DOCSTRING)
+class PoNetLMHeadModel(PoNetPreTrainedModel):
+    # Key patterns ignored when reporting unexpected / missing keys at load time.
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+    _keys_to_ignore_on_load_missing = [
+        r'position_ids', r'predictions.decoder.bias'
+    ]
+
+    def __init__(self, config):
+        """Create the backbone without a pooler plus the LM head; warns when ``config.is_decoder`` is false."""
+        super().__init__(config)
+
+        if not config.is_decoder:
+            logger.warning(
+                'If you want to use `PoNetLMHeadModel` as a standalone, add `is_decoder=True.`'
+            )
+
+        self.ponet = PoNetModel(config, add_pooling_layer=False)
+        self.cls = PoNetOnlyMLMHead(config)
+
+        self.init_weights()
+
+    def get_output_embeddings(self):
+        """Return the vocabulary projection layer of the LM head."""
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the vocabulary projection layer of the LM head."""
+        self.cls.predictions.decoder = new_embeddings
+
+    @add_start_docstrings_to_model_forward(
+        PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @replace_return_docstrings(
+        output_type=CausalLMOutputWithCrossAttentions,
+        config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        segment_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        labels=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        segment_ids (`optional`):
+            Forwarded unchanged to the underlying :class:`PoNetModel`.
+        encoder_hidden_states
+            (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
+            ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+            Passing ``labels`` forces ``use_cache`` to ``False``.
+        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers`
+            with each tuple having 4 tensors of shape
+            :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
+            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
+            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
+        use_cache (:obj:`bool`, `optional`):
+            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+            decoding (see :obj:`past_key_values`).
+
+        Returns:
+
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # Caching is switched off whenever labels are supplied.
+        if labels is not None:
+            use_cache = False
+
+        outputs = self.ponet(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            segment_ids=segment_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        lm_loss = None
+        if labels is not None:
+            # we are doing next-token prediction; shift prediction scores and input ids by one
+            # (scores drop their last position, labels drop their first).
+            shifted_prediction_scores = prediction_scores[:, :
+                                                          -1, :].contiguous()
+            labels = labels[:, 1:].contiguous()
+            loss_fct = CrossEntropyLoss()
+            lm_loss = loss_fct(
+                shifted_prediction_scores.view(-1, self.config.vocab_size),
+                labels.view(-1))
+
+        # Tuple form: index 1 of the backbone output is skipped (the backbone
+        # places its pooled output there, which is None without a pooler).
+        if not return_dict:
+            output = (prediction_scores, ) + outputs[2:]
+            return ((lm_loss, ) + output) if lm_loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=lm_loss,
+            logits=prediction_scores,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+    def prepare_inputs_for_generation(self,
+                                      input_ids,
+                                      past=None,
+                                      attention_mask=None,
+                                      **model_kwargs):
+        """
+        Assemble the keyword arguments for one generation step.
+
+        An all-ones attention mask is created when none is supplied, and only the last token of ``input_ids`` is
+        kept when ``past`` is given. Entries of ``model_kwargs`` are accepted but not included in the result.
+
+        Returns:
+            dict with the keys ``input_ids``, ``attention_mask`` and ``past_key_values``.
+        """
+        input_shape = input_ids.shape
+        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+        if attention_mask is None:
+            attention_mask = input_ids.new_ones(input_shape)
+
+        # cut decoder_input_ids if past is used
+        if past is not None:
+            input_ids = input_ids[:, -1:]
+
+        return {
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'past_key_values': past
+        }
+
+    def _reorder_cache(self, past, beam_idx):
+        """Return ``past`` with every cached tensor re-indexed along dimension 0 by ``beam_idx``."""
+        reordered_past = ()
+        for layer_past in past:
+            reordered_past += (tuple(
+                past_state.index_select(0, beam_idx)
+                for past_state in layer_past), )
+        return reordered_past
+
+
+@add_start_docstrings(
+    """PoNet Model with a `language modeling` head on top. """,
+    PONET_START_DOCSTRING)
+class PoNetForMaskedLM(PoNetPreTrainedModel):
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+    _keys_to_ignore_on_load_missing = [
+        r'position_ids', r'predictions.decoder.bias'
+    ]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        if config.is_decoder:
+            logger.warning(
+                'If you want to use `PoNetForMaskedLM` make sure `config.is_decoder=False` for '
+                'bi-directional self-attention.')
+
+        self.ponet = PoNetModel(config, add_pooling_layer=False)  # no pooler
+        self.cls = PoNetOnlyMLMHead(config)
+
+        self.init_weights()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+
+    @add_start_docstrings_to_model_forward(
+        PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=MaskedLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        segment_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Labels for computing the masked language modeling loss. Indices should be ``-100`` or in ``[0, ...,
+            config.vocab_size - 1]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
+            (masked); the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size - 1]``.
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ponet(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            segment_ids=segment_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()  # labels equal to -100 are skipped
+            masked_lm_loss = loss_fct(
+                prediction_scores.view(-1, self.config.vocab_size),
+                labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores, ) + outputs[2:]  # outputs[0:2] dropped
+            return ((masked_lm_loss, )
+                    + output) if masked_lm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    PoNet Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
+    output) e.g. for GLUE tasks.
+    """,
+    PONET_START_DOCSTRING,
+)
+class PoNetForSequenceClassification(PoNetPreTrainedModel):
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+
+        self.ponet = PoNetModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+    @add_start_docstrings_to_model_forward(
+        PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=SequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        segment_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
+            config.num_labels - 1]`. With :obj:`config.problem_type` unset: :obj:`config.num_labels == 1` gives MSE;
+            more labels of dtype long/int give Cross-Entropy, anything else BCE-with-logits (multi-label).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ponet(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            segment_ids=segment_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:  # inferred once, kept on config
+                if self.num_labels == 1:
+                    self.config.problem_type = 'regression'
+                elif self.num_labels > 1 and (labels.dtype == torch.long
+                                              or labels.dtype == torch.int):
+                    self.config.problem_type = 'single_label_classification'
+                else:
+                    self.config.problem_type = 'multi_label_classification'
+
+            if self.config.problem_type == 'regression':
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == 'single_label_classification':
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(
+                    logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == 'multi_label_classification':
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits, ) + outputs[2:]  # outputs[0:2] dropped
+            return ((loss, ) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    PoNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    Named-Entity-Recognition (NER) tasks.
+    """,
+    PONET_START_DOCSTRING,
+)
+class PoNetForTokenClassification(PoNetPreTrainedModel):
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.ponet = PoNetModel(config, add_pooling_layer=False)  # no pooler
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+    @add_start_docstrings_to_model_forward(
+        PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        segment_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
+            1]``.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ponet(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            segment_ids=segment_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            # Labels where attention_mask != 1 are replaced by ignore_index
+            if attention_mask is not None:
+                active_loss = attention_mask.view(-1) == 1
+                active_logits = logits.view(-1, self.num_labels)
+                active_labels = torch.where(
+                    active_loss, labels.view(-1),
+                    torch.tensor(loss_fct.ignore_index).type_as(labels))
+                loss = loss_fct(active_logits, active_labels)
+            else:
+                loss = loss_fct(
+                    logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits, ) + outputs[2:]  # outputs[0:2] dropped
+            return ((loss, ) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
diff --git a/modelscope/models/nlp/ponet/tokenization_ponet.py b/modelscope/models/nlp/ponet/tokenization_ponet.py
new file mode 100644
index 00000000..21544886
--- /dev/null
+++ b/modelscope/models/nlp/ponet/tokenization_ponet.py
@@ -0,0 +1,155 @@
+# Copyright 2021-2022 The Alibaba DAMO Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for PoNet """
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+from transformers.file_utils import PaddingStrategy
+from transformers.models.bert.tokenization_bert import BertTokenizer
+
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+VOCAB_FILES_NAMES = {'vocab_file': ModelFile.VOCAB_FILE}
+
+PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}}  # no per-model entries
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {  # keyed by model name
+    'nlp_ponet_fill-mask_chinese-base': 512,
+    'nlp_ponet_fill-mask_english-base': 512,
+}
+
+PRETRAINED_INIT_CONFIGURATION = {  # keyed by model name
+    'nlp_ponet_fill-mask_chinese-base': {
+        'do_lower_case': True
+    },
+    'nlp_ponet_fill-mask_english-base': {
+        'do_lower_case': True
+    },
+}
+
+
+class PoNetTokenizer(BertTokenizer):
+    r"""
+    Construct a PoNet tokenizer. Based on BertTokenizer.
+
+    This tokenizer inherits from :class:`~transformers.BertTokenizer` which contains most of the main methods.
+    Users should refer to this superclass for more information regarding those methods.
+
+    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    parameters.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+
+    def _pad(
+        self,
+        encoded_inputs: Union[Dict[str, 'EncodedInput'], 'BatchEncoding'],
+        max_length: Optional[int] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        return_attention_mask: Optional[bool] = None,
+    ) -> dict:  # names are quoted because this module does not import them
+        """
+        Pad encoded inputs (left/right, to a predefined length or the batch max); 'segment_ids' pads with last id + 1.
+
+        Args:
+            encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or
+            batch of tokenized inputs (`List[List[int]]`).
+            max_length: maximum length of the returned list and optionally padding length (see below).
+                Will truncate by taking into account the special tokens.
+            padding_strategy: PaddingStrategy to use for padding.
+
+                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+                - PaddingStrategy.DO_NOT_PAD: Do not pad
+                The tokenizer padding sides are defined in self.padding_side:
+
+                    - 'left': pads on the left of the sequences
+                    - 'right': pads on the right of the sequences
+            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                >= 7.5 (Volta).
+            return_attention_mask: (optional) Set to False to avoid returning
+            attention mask (default: set to model specifics)
+        """
+        # Load from model defaults
+        if return_attention_mask is None:
+            return_attention_mask = 'attention_mask' in self.model_input_names
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = len(required_input)
+
+        if max_length is not None and pad_to_multiple_of is not None and (
+                max_length % pad_to_multiple_of != 0):
+            max_length = (
+                (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(
+            required_input) != max_length
+
+        if needs_to_be_padded:
+            difference = max_length - len(required_input)
+            if self.padding_side == 'right':
+                if return_attention_mask:
+                    encoded_inputs['attention_mask'] = [1] * len(
+                        required_input) + [0] * difference
+                if 'token_type_ids' in encoded_inputs:
+                    encoded_inputs['token_type_ids'] = (
+                        encoded_inputs['token_type_ids']
+                        + [self.pad_token_type_id] * difference)
+                if 'special_tokens_mask' in encoded_inputs:
+                    encoded_inputs['special_tokens_mask'] = encoded_inputs[
+                        'special_tokens_mask'] + [1] * difference
+                if 'segment_ids' in encoded_inputs:
+                    encoded_inputs[
+                        'segment_ids'] = encoded_inputs['segment_ids'] + [
+                            encoded_inputs['segment_ids'][-1] + 1
+                        ] * difference  # noqa *
+                encoded_inputs[self.model_input_names[
+                    0]] = required_input + [self.pad_token_id] * difference
+            elif self.padding_side == 'left':
+                if return_attention_mask:
+                    encoded_inputs['attention_mask'] = [0] * difference + [
+                        1
+                    ] * len(required_input)
+                if 'token_type_ids' in encoded_inputs:
+                    encoded_inputs['token_type_ids'] = [
+                        self.pad_token_type_id
+                    ] * difference + encoded_inputs['token_type_ids']
+                if 'segment_ids' in encoded_inputs:
+                    encoded_inputs['segment_ids'] = [encoded_inputs['segment_ids'][-1] + 1] * difference + \
+                                                    encoded_inputs['segment_ids']  # noqa *
+                if 'special_tokens_mask' in encoded_inputs:
+                    encoded_inputs['special_tokens_mask'] = [
+                        1
+                    ] * difference + encoded_inputs['special_tokens_mask']
+                encoded_inputs[self.model_input_names[
+                    0]] = [self.pad_token_id] * difference + required_input
+            else:
+                raise ValueError('Invalid padding strategy:'
+                                 + str(self.padding_side))
+        elif return_attention_mask and 'attention_mask' not in encoded_inputs:
+            encoded_inputs['attention_mask'] = [1] * len(required_input)
+
+        return encoded_inputs
diff --git a/modelscope/models/nlp/ponet_for_masked_language.py b/modelscope/models/nlp/ponet_for_masked_language.py
new file mode 100644
index 00000000..11f4bc11
--- /dev/null
+++ b/modelscope/models/nlp/ponet_for_masked_language.py
@@ -0,0 +1,53 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.ponet import \
+    PoNetForMaskedLM as PoNetForMaskedLMTransformer
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+
+__all__ = ['PoNetForMaskedLM']
+
+
+@MODELS.register_module(Tasks.fill_mask, module_name=Models.ponet)
+class PoNetForMaskedLM(TorchModel, PoNetForMaskedLMTransformer):
+    """PoNet for MLM model.
+
+    Inherited from ponet.PoNetForMaskedLM and TorchModel, so this class can be registered into Model sets.
+    """
+
+    def __init__(self, config, model_dir):
+        super(TorchModel, self).__init__(model_dir)
+        PoNetForMaskedLMTransformer.__init__(self, config)
+
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                segment_ids=None,
+                position_ids=None,
+                head_mask=None,
+                labels=None):
+        output = PoNetForMaskedLMTransformer.forward(
+            self,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            segment_ids=segment_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            labels=labels)
+        output[OutputKeys.INPUT_IDS] = input_ids  # echo the inputs back
+        return output
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        model_dir = kwargs.get('model_dir')  # None when not supplied
+        return super(PoNetForMaskedLMTransformer,
+                     PoNetForMaskedLM).from_pretrained(
+                         pretrained_model_name_or_path=model_dir,
+                         model_dir=model_dir)
diff --git a/modelscope/models/nlp/sbert_for_faq_question_answering.py b/modelscope/models/nlp/sbert_for_faq_question_answering.py
new file mode 100644
index 00000000..23ccdcc5
--- /dev/null
+++ b/modelscope/models/nlp/sbert_for_faq_question_answering.py
@@ -0,0 +1,249 @@
+import math
+import os
+from collections import namedtuple
+from typing import Dict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.structbert import SbertConfig, SbertModel
+from modelscope.models.nlp.task_models.task_model import BaseTaskModel
+from modelscope.utils.config import Config, ConfigFields
+from modelscope.utils.constant import ModelFile, Tasks
+
+__all__ = ['SbertForFaqQuestionAnswering']
+
+
+class SbertForFaqQuestionAnsweringBase(BaseTaskModel):
+    """Shared base for FAQ models: an SbertModel backbone plus a pooling layer
+    and a metrics layer, both configured from the model directory's config."""
+
+    def __init__(self, model_dir, *args, **kwargs):
+        super(SbertForFaqQuestionAnsweringBase,
+              self).__init__(model_dir, *args, **kwargs)
+
+        backbone_cfg = SbertConfig.from_pretrained(model_dir)
+        self.bert = SbertModel(backbone_cfg)
+
+        model_config = Config.from_file(
+            os.path.join(model_dir,
+                         ModelFile.CONFIGURATION)).get(ConfigFields.model, {})
+
+        metric = model_config.get('metric', 'cosine')
+        pooling_method = model_config.get('pooling', 'avg')
+
+        Arg = namedtuple('args', [
+            'metrics', 'proj_hidden_size', 'hidden_size', 'dropout', 'pooling'
+        ])
+        args = Arg(
+            metrics=metric,
+            proj_hidden_size=self.bert.config.hidden_size,
+            hidden_size=self.bert.config.hidden_size,
+            dropout=0.0,
+            pooling=pooling_method)
+
+        self.metrics_layer = MetricsLayer(args)
+        self.pooling = PoolingLayer(args)
+
+    def _get_onehot_labels(self, labels, support_size, num_cls):
+        labels_ = labels.view(support_size, 1)
+        target_oh = torch.zeros(support_size, num_cls).to(labels)
+        target_oh.scatter_(dim=1, index=labels_, value=1)  # one 1 per row
+        return target_oh.view(support_size, num_cls).float()
+
+    def forward_sentence_embedding(self, inputs: Dict[str, Tensor]):
+        input_ids = inputs['input_ids']
+        input_mask = inputs['attention_mask']
+        if not isinstance(input_ids, Tensor):
+            input_ids = torch.IntTensor(input_ids)
+        if not isinstance(input_mask, Tensor):
+            input_mask = torch.IntTensor(input_mask)
+        rst = self.bert(input_ids, input_mask)
+        last_hidden_states = rst.last_hidden_state
+        if len(input_mask.shape) == 2:
+            input_mask = input_mask.unsqueeze(-1)  # add a trailing axis
+        pooled_representation = self.pooling(last_hidden_states, input_mask)
+        return pooled_representation
+
+
+@MODELS.register_module(
+    Tasks.faq_question_answering, module_name=Models.structbert)
+class SbertForFaqQuestionAnswering(SbertForFaqQuestionAnsweringBase):
+    _backbone_prefix = ''
+
+    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        assert not self.training  # this forward is inference-only
+        query = input['query']
+        support = input['support']
+        if isinstance(query, list):
+            query = torch.stack(query)
+        if isinstance(support, list):
+            support = torch.stack(support)
+        n_query = query.shape[0]
+        n_support = support.shape[0]
+        query_mask = torch.ne(query, 0).view([n_query, -1])  # nonzero entries
+        support_mask = torch.ne(support, 0).view([n_support, -1])
+
+        support_labels = input['support_labels']
+        num_cls = torch.max(support_labels) + 1  # assumes 0-based labels
+        onehot_labels = self._get_onehot_labels(support_labels, n_support,
+                                                num_cls)
+
+        input_ids = torch.cat([query, support])
+        input_mask = torch.cat([query_mask, support_mask], dim=0)
+        pooled_representation = self.forward_sentence_embedding({
+            'input_ids':
+            input_ids,
+            'attention_mask':
+            input_mask
+        })
+        z_query = pooled_representation[:n_query]
+        z_support = pooled_representation[n_query:]
+        cls_n_support = torch.sum(onehot_labels, dim=-2) + 1e-5  # no /0
+        protos = torch.matmul(onehot_labels.transpose(0, 1),  # per-class mean
+                              z_support) / cls_n_support.unsqueeze(-1)
+        scores = self.metrics_layer(z_query, protos).view([n_query, num_cls])
+        if self.metrics_layer.name == 'relation':
+            scores = torch.sigmoid(scores)
+        return {'scores': scores}
+
+
+activations = {
+    'relu': F.relu,
+    'tanh': torch.tanh,
+    'linear': lambda x: x,  # identity
+}
+
+activation_coeffs = {  # multiplies the init std in LinearProjection
+    'relu': math.sqrt(2),
+    'tanh': 5 / 3,
+    'linear': 1.,
+}
+
+
+class LinearProjection(nn.Module):
+
+    def __init__(self,
+                 in_features,
+                 out_features,
+                 activation='linear',
+                 bias=True):
+        super().__init__()
+        self.activation = activations[activation]  # KeyError if unknown
+        activation_coeff = activation_coeffs[activation]
+        linear = nn.Linear(in_features, out_features, bias=bias)
+        nn.init.normal_(
+            linear.weight, std=math.sqrt(1. / in_features) * activation_coeff)
+        if bias:
+            nn.init.zeros_(linear.bias)
+        self.model = nn.utils.weight_norm(linear)  # applied after init
+
+    def forward(self, x):
+        return self.activation(self.model(x))  # linear, then activation
+
+
+class RelationModule(nn.Module):
+
+    def __init__(self, args):
+        super(RelationModule, self).__init__()
+        input_size = args.proj_hidden_size * 4  # 4 concatenated parts
+        self.prediction = torch.nn.Sequential(
+            LinearProjection(
+                input_size, args.proj_hidden_size * 4, activation='relu'),
+            nn.Dropout(args.dropout),
+            LinearProjection(args.proj_hidden_size * 4, 1))
+
+    def forward(self, query, protos):
+        n_cls = protos.shape[0]
+        n_query = query.shape[0]
+        protos = protos.unsqueeze(0).repeat(n_query, 1, 1)
+        query = query.unsqueeze(1).repeat(1, n_cls, 1)
+        input_feat = torch.cat(
+            [query, protos, (protos - query).abs(), query * protos], dim=-1)
+        dists = self.prediction(input_feat)  # [n_query, n_cls, 1]
+        return dists.squeeze(-1)
+
+
+class MetricsLayer(nn.Module):
+
+    def __init__(self, args):
+        super(MetricsLayer, self).__init__()
+        self.args = args
+        assert args.metrics in ('relation', 'cosine')
+        if args.metrics == 'relation':
+            self.relation_net = RelationModule(args)
+
+    @property
+    def name(self):
+        return self.args.metrics
+
+    def forward(self, query, protos):
+        """ query : [n_query, dim]
+            protos : [n_cls, dim]; the result has shape [n_query, n_cls]
+        """
+        if self.args.metrics == 'cosine':
+            supervised_dists = self.cosine_similarity(query, protos)
+            if self.training:
+                supervised_dists *= 5  # fixed x5 scale in training mode
+        elif self.args.metrics in ('relation', ):
+            supervised_dists = self.relation_net(query, protos)
+        else:
+            raise NotImplementedError
+        return supervised_dists
+
+    def cosine_similarity(self, x, y):
+        # x=[n_query, dim]
+        # y=[n_cls, dim]
+        n_query = x.shape[0]
+        n_cls = y.shape[0]
+        dim = x.shape[-1]
+        x = x.unsqueeze(1).expand([n_query, n_cls, dim])
+        y = y.unsqueeze(0).expand([n_query, n_cls, dim])
+        return F.cosine_similarity(x, y, -1)
+
+
+class AveragePooling(nn.Module):
+
+    def forward(self, x, mask, dim=1):
+        """Mask-weighted mean of `x` along `dim`."""
+        weights = mask.float()
+        return torch.sum(x * weights, dim=dim) / torch.sum(weights, dim=dim)
+
+
+class AttnPooling(nn.Module):
+
+    def __init__(self, input_size, hidden_size=None, output_size=None):
+        super().__init__()
+        self.input_proj = nn.Sequential(
+            LinearProjection(input_size, hidden_size), nn.Tanh(),
+            LinearProjection(hidden_size, 1, bias=False))
+        self.output_proj = LinearProjection(
+            input_size, output_size) if output_size else lambda x: x
+
+    def forward(self, x, mask):
+        score = self.input_proj(x)  # last dim is 1
+        score = score * mask.float() + -1e4 * (1. - mask.float())
+        score = F.softmax(score, dim=1)  # mask==0 entries were set to -1e4
+        features = self.output_proj(x)
+        return torch.matmul(score.transpose(1, 2), features).squeeze(1)
+
+
+class PoolingLayer(nn.Module):
+
+    def __init__(self, args):
+        super().__init__()
+        if args.pooling == 'attn':
+            width = args.proj_hidden_size
+            pooler = AttnPooling(width, width, width)
+        elif args.pooling == 'avg':
+            pooler = AveragePooling()
+        else:
+            raise NotImplementedError(args.pooling)
+        self.pooling = pooler  # the module that forward() delegates to
+
+    def forward(self, x, mask):
+        return self.pooling(x, mask)
diff --git a/modelscope/models/nlp/sentence_embedding.py b/modelscope/models/nlp/sentence_embedding.py
new file mode 100644
index 00000000..340c133f
--- /dev/null
+++ b/modelscope/models/nlp/sentence_embedding.py
@@ -0,0 +1,74 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict
+
+import numpy as np
+
+from modelscope.metainfo import Models
+from modelscope.models import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.structbert import SbertPreTrainedModel
+from modelscope.utils.constant import Tasks
+
+__all__ = ['SentenceEmbedding']
+
+
+@MODELS.register_module(Tasks.sentence_embedding, module_name=Models.bert)
+class SentenceEmbedding(TorchModel, SbertPreTrainedModel):
+    base_model_prefix: str = 'bert'
+    supports_gradient_checkpointing = True
+    _keys_to_ignore_on_load_missing = [r'position_ids']
+
+    def __init__(self, config, model_dir):
+        super().__init__(model_dir)
+        self.config = config
+        setattr(self, self.base_model_prefix, self.build_base_model())
+
+    def build_base_model(self):
+        from .structbert import SbertModel
+        return SbertModel(self.config, add_pooling_layer=False)
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+        """Run `self.base_model` on the preprocessed inputs.
+
+        Args:
+            input (Dict[str, Any]): the preprocessed data, passed to
+                `self.base_model` as keyword arguments
+
+        Returns:
+            The output of `self.base_model`, returned as is; `postprocess`
+            reads its 'last_hidden_state' entry.
+
+            NOTE(review): the `Dict[str, np.ndarray]` annotation looks
+            inaccurate, since `postprocess` calls `.cpu().numpy()` on that
+            entry - confirm and update.
+        """
+        return self.base_model(**input)
+
+    def postprocess(self, inputs: Dict[str, np.ndarray],
+                    **kwargs) -> Dict[str, np.ndarray]:
+        embs = inputs['last_hidden_state'][:, 0].cpu().numpy()
+        num_sent = embs.shape[0]
+        if num_sent >= 2:  # score sentence 0 against the others
+            scores = np.dot(embs[0:1, ], np.transpose(embs[1:, ],
+                                                      (1, 0))).tolist()[0]
+        else:
+            scores = []
+        result = {'text_embedding': embs, 'scores': scores}
+
+        return result
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        """Instantiate the model.
+
+        @param kwargs: Input args.
+                    model_dir: The model dir used to load the checkpoint and the label information.
+        @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained
+        """
+        model_args = {}  # no extra keyword arguments are forwarded
+
+        return super(SbertPreTrainedModel, SentenceEmbedding).from_pretrained(
+            pretrained_model_name_or_path=kwargs.get('model_dir'),
+            model_dir=kwargs.get('model_dir'),
+            **model_args)
diff --git a/modelscope/models/nlp/sequence_classification.py b/modelscope/models/nlp/sequence_classification.py
index e8802dbd..156c615c 100644
--- a/modelscope/models/nlp/sequence_classification.py
+++ b/modelscope/models/nlp/sequence_classification.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from abc import abstractmethod
 
 from torch import nn
@@ -5,6 +7,7 @@ from torch import nn
 from modelscope.metainfo import Models
 from modelscope.models.base import TorchModel
 from modelscope.models.builder import MODELS
+from modelscope.models.nlp.bert import BertPreTrainedModel
 from modelscope.models.nlp.structbert import SbertPreTrainedModel
 from modelscope.models.nlp.veco import \
     VecoForSequenceClassification as VecoForSequenceClassificationTransform
@@ -14,7 +17,10 @@ from modelscope.utils.hub import parse_label_mapping
 from modelscope.utils.tensor_utils import (torch_nested_detach,
                                            torch_nested_numpify)
 
-__all__ = ['SbertForSequenceClassification', 'VecoForSequenceClassification']
+__all__ = [
+    'SbertForSequenceClassification', 'VecoForSequenceClassification',
+    'BertForSequenceClassification'
+]
 
 
 class SequenceClassificationBase(TorchModel):
@@ -130,7 +136,7 @@ class SbertForSequenceClassification(SequenceClassificationBase,
             label2id = parse_label_mapping(model_dir)
             if label2id is not None and len(label2id) > 0:
                 num_labels = len(label2id)
-
+            cls.id2label = {i: name for name, i in (label2id or {}).items()}  # label2id may be None
         model_args = {} if num_labels is None else {'num_labels': num_labels}
         return super(SbertPreTrainedModel,
                      SbertForSequenceClassification).from_pretrained(
@@ -204,3 +210,78 @@ class VecoForSequenceClassification(TorchModel,
                          pretrained_model_name_or_path=kwargs.get('model_dir'),
                          model_dir=kwargs.get('model_dir'),
                          **model_args)
+
+
+@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.bert)
+@MODELS.register_module(
+    Tasks.sentiment_classification, module_name=Models.bert)
+@MODELS.register_module(Tasks.nli, module_name=Models.bert)
+@MODELS.register_module(Tasks.text_classification, module_name=Models.bert)
+class BertForSequenceClassification(SequenceClassificationBase,
+                                    BertPreTrainedModel):
+    """Sequence classification model built on a ``BertModel`` backbone.
+
+    The class is registered under ``Models.bert`` for each task named in the
+    decorators; everything not overridden here comes from the two bases.
+    """
+    base_model_prefix: str = 'bert'
+    supports_gradient_checkpointing = True
+    _keys_to_ignore_on_load_missing = [r'position_ids']
+
+    def __init__(self, config, model_dir):
+        # A prefix found on the config overrides the default above. The
+        # assignment targets the class itself, not just this instance.
+        if hasattr(config, 'base_model_prefix'):
+            BertForSequenceClassification.base_model_prefix = config.base_model_prefix
+        super().__init__(config, model_dir)
+
+    def build_base_model(self):
+        """Return a ``BertModel`` for ``self.config`` with its pooling layer."""
+        from .bert import BertModel
+        return BertModel(self.config, add_pooling_layer=True)
+
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,
+                inputs_embeds=None,
+                labels=None,
+                output_attentions=None,
+                output_hidden_states=None,
+                return_dict=None,
+                **kwargs):
+        """Pass the named arguments to the parent; ``**kwargs`` is dropped."""
+        return super().forward(
+            input_ids=input_ids, attention_mask=attention_mask,
+            token_type_ids=token_type_ids, position_ids=position_ids,
+            head_mask=head_mask, inputs_embeds=inputs_embeds, labels=labels,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict)
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        """Instantiate the model through ``from_pretrained``.
+
+        @param kwargs: Input args.
+                    model_dir: The model dir handed to ``from_pretrained`` and
+                                    to ``parse_label_mapping``.
+                    num_labels: Optional number of classes. When absent, the
+                                    size of a non-empty label mapping parsed from
+                                    model_dir is used; failing that, no
+                                    ``num_labels`` argument is passed at all.
+        @return: Whatever the ``from_pretrained`` call returns.
+        """
+        model_dir = kwargs.get('model_dir')
+        num_labels = kwargs.get('num_labels')
+        if num_labels is None:
+            mapping = parse_label_mapping(model_dir)
+            if mapping is not None and len(mapping) > 0:
+                num_labels = len(mapping)
+        extra = {} if num_labels is None else {'num_labels': num_labels}
+        loader = super(BertPreTrainedModel, BertForSequenceClassification)
+        return loader.from_pretrained(
+            pretrained_model_name_or_path=model_dir,
+            model_dir=model_dir, **extra)
diff --git a/modelscope/models/nlp/space/model/__init__.py b/modelscope/models/nlp/space/model/__init__.py
index 24641f06..bb1d18e4 100644
--- a/modelscope/models/nlp/space/model/__init__.py
+++ b/modelscope/models/nlp/space/model/__init__.py
@@ -1,6 +1,6 @@
 from .configuration_space import SpaceConfig
 from .gen_unified_transformer import GenUnifiedTransformer
-from .generator import Generator as SpaceGenerator
+from .generator import SpaceGenerator
 from .intent_unified_transformer import IntentUnifiedTransformer
 from .model_base import SpaceModelBase
 from .modeling_space import (SpaceForDST, SpaceForMaskedLM,
diff --git a/modelscope/models/nlp/space/model/generator.py b/modelscope/models/nlp/space/model/generator.py
index c1521e3d..0e7833e6 100644
--- a/modelscope/models/nlp/space/model/generator.py
+++ b/modelscope/models/nlp/space/model/generator.py
@@ -38,24 +38,24 @@ def gather(var, idx):
         return var
 
 
-class Generator(object):
+class SpaceGenerator(object):
     """ Genrator class. """
 
     _registry = dict()
 
     @classmethod
     def register(cls, name):
-        Generator._registry[name] = cls
+        SpaceGenerator._registry[name] = cls
         return
 
     @staticmethod
     def by_name(name):
-        return Generator._registry[name]
+        return SpaceGenerator._registry[name]
 
     @staticmethod
     def create(config, *args, **kwargs):
         """ Create generator. """
-        generator_cls = Generator.by_name(config.Generator.generator)
+        generator_cls = SpaceGenerator.by_name(config.Generator.generator)
         return generator_cls(config, *args, **kwargs)
 
     def __init__(self, config, reader):
@@ -83,7 +83,7 @@ class Generator(object):
         raise NotImplementedError
 
 
-class BeamSearch(Generator):
+class BeamSearch(SpaceGenerator):
     """ BeamSearch generator. """
 
     def __init__(self, config, reader):
diff --git a/modelscope/models/nlp/space/space_for_dialog_modeling.py b/modelscope/models/nlp/space/space_for_dialog_modeling.py
index 4c65c7d1..efa9b851 100644
--- a/modelscope/models/nlp/space/space_for_dialog_modeling.py
+++ b/modelscope/models/nlp/space/space_for_dialog_modeling.py
@@ -41,7 +41,7 @@ class SpaceForDialogModeling(TorchModel):
 
         self.text_field = kwargs.pop(
             'text_field',
-            MultiWOZBPETextField(self.model_dir, config=self.config))
+            MultiWOZBPETextField(config=self.config, model_dir=self.model_dir))
         self.generator = SpaceGenerator.create(
             self.config, reader=self.text_field)
         self.model = SpaceModelBase.create(
diff --git a/modelscope/models/nlp/star3/__init__.py b/modelscope/models/nlp/star3/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/nlp/star3/configuration_star3.py b/modelscope/models/nlp/star3/configuration_star3.py
new file mode 100644
index 00000000..4c5ae677
--- /dev/null
+++ b/modelscope/models/nlp/star3/configuration_star3.py
@@ -0,0 +1,115 @@
+# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch BERT configuration."""
+
+from __future__ import absolute_import, division, print_function
+import copy
+import logging
+
+import json
+
+logger = logging.getLogger(__name__)
+
+
+class Star3Config(object):
+    """Plain attribute container holding the configuration of a `Star3Model`;
+    built from explicit values, a JSON file path, or a dictionary (`from_dict`)."""
+
+    def __init__(self,
+                 vocab_size_or_config_json_file,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.02):
+        """Constructs Star3Config.
+
+        Args:
+            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `Star3Model` (int), or the
+                path (str) of a JSON config file; with a path, the other arguments are unused.
+            hidden_size: Size of the encoder layers and the pooler layer.
+            num_hidden_layers: Number of hidden layers in the Transformer encoder.
+            num_attention_heads: Number of attention heads for each attention layer in the Transformer encoder.
+            intermediate_size: The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+            hidden_act: The non-linear activation function (function or string) in the
+                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+            hidden_dropout_prob: The dropout probability for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            attention_probs_dropout_prob: The dropout ratio for the attention probabilities.
+            max_position_embeddings: The maximum sequence length that this model might
+                ever be used with. Typically set this to something large just in case
+                (e.g., 512 or 1024 or 2048).
+            type_vocab_size: The vocabulary size of the `token_type_ids` passed into `Star3Model`.
+            initializer_range: The stddev of the truncated_normal_initializer for
+                initializing all weight matrices.
+
+        Raises:
+            ValueError: If the first argument is neither a str nor an int.
+        """
+        if isinstance(vocab_size_or_config_json_file, str):
+            with open(
+                    vocab_size_or_config_json_file, 'r',
+                    encoding='utf-8') as reader:
+                json_config = json.loads(reader.read())
+            # Every key of the JSON object becomes an attribute as-is.
+            self.__dict__.update(json_config)
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.vocab_size = vocab_size_or_config_json_file
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.hidden_act = hidden_act
+            self.intermediate_size = intermediate_size
+            self.hidden_dropout_prob = hidden_dropout_prob
+            self.attention_probs_dropout_prob = attention_probs_dropout_prob
+            self.max_position_embeddings = max_position_embeddings
+            self.type_vocab_size = type_vocab_size
+            self.initializer_range = initializer_range
+        else:
+            raise ValueError(
+                'First argument must be either a vocabulary size (int) '
+                'or the path to a pretrained model config file (str)')
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Constructs a config of type `cls` from a Python dictionary of parameters."""
+        # -1 is a placeholder vocabulary size; keys of `json_object` override it and the other defaults.
+        config = cls(vocab_size_or_config_json_file=-1)
+        config.__dict__.update(json_object)
+        return config
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Constructs a `Star3Config` from a json file of parameters."""
+        with open(json_file, 'r', encoding='utf-8') as reader:
+            text = reader.read()
+        return cls.from_dict(json.loads(text))
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary (a deep copy of its attributes)."""
+        return copy.deepcopy(self.__dict__)
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + '\n'
diff --git a/modelscope/models/nlp/star3/modeling_star3.py b/modelscope/models/nlp/star3/modeling_star3.py
new file mode 100644
index 00000000..13f7136a
--- /dev/null
+++ b/modelscope/models/nlp/star3/modeling_star3.py
@@ -0,0 +1,1001 @@
+# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch BERT model."""
+
+from __future__ import absolute_import, division, print_function
+import copy
+import math
+import os
+import shutil
+import tarfile
+import tempfile
+
+import numpy as np
+import torch
+from torch import nn
+
+from modelscope.models.nlp.star3.configuration_star3 import Star3Config
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+CONFIG_NAME = ModelFile.CONFIGURATION
+WEIGHTS_NAME = ModelFile.TORCH_MODEL_BIN_FILE
+
+
+def gelu(x):
+    """Exact erf-based GELU (OpenAI GPT's tanh approximation differs slightly)."""
+    half_x = x * 0.5
+    return half_x * (1.0 + torch.erf(x / math.sqrt(2.0)))
+
+
+def swish(x):
+    """Swish activation: ``x`` scaled by its own sigmoid."""
+    return torch.sigmoid(x) * x
+
+
+# Activation lookup by name, for configs that give ``hidden_act`` as a string.
+ACT2FN = {'gelu': gelu, 'relu': torch.nn.functional.relu, 'swish': swish}
+
+
+class BertLayerNorm(nn.Module):
+    """Layer normalization over the last dim, TF style (epsilon inside the square root)."""
+
+    def __init__(self, hidden_size, eps=1e-12):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.bias = nn.Parameter(torch.zeros(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, x):
+        mean = x.mean(-1, keepdim=True)
+        centered = x - mean
+        variance = centered.pow(2).mean(-1, keepdim=True)
+        normed = centered / torch.sqrt(variance + self.variance_epsilon)
+        return self.weight * normed + self.bias
+
+
+class BertEmbeddings(nn.Module):
+    """Sum word, position, token-type and optional match-type/type embeddings;
+    positions listed in col_dict_list are first overwritten by a mean of header token vectors."""
+
+    def __init__(self, config):
+        super(BertEmbeddings, self).__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size,
+                                            config.hidden_size)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
+                                                config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
+                                                  config.hidden_size)
+        self.match_type_embeddings = nn.Embedding(11, config.hidden_size)
+        self.type_embeddings = nn.Embedding(6, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased, to stick with the TensorFlow model variable name and to be able
+        # to load any TensorFlow checkpoint file
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self,
+                input_ids,
+                header_ids,
+                token_type_ids=None,
+                match_type_ids=None,
+                l_hs=None,
+                header_len=None,
+                type_idx=None,
+                col_dict_list=None,
+                ids=None,
+                header_flatten_tokens=None,  # this and all parameters below are not read in this method
+                header_flatten_index=None,
+                header_flatten_output=None,
+                token_column_id=None,
+                token_column_mask=None,
+                column_start_index=None,
+                headers_length=None):
+        seq_length = input_ids.size(1)
+        position_ids = torch.arange(
+            seq_length, dtype=torch.long, device=input_ids.device)
+        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+        words_embeddings = self.word_embeddings(input_ids)
+        header_embeddings = self.word_embeddings(header_ids)  # headers share the word embedding table
+
+        if col_dict_list is not None and l_hs is not None:  # l_hs only acts as a switch here
+            col_dict_list = np.array(col_dict_list)[ids.cpu().numpy()].tolist()
+            header_len = np.array(
+                header_len, dtype=object)[ids.cpu().numpy()].tolist()
+            for bi, col_dict in enumerate(col_dict_list):
+                for ki, vi in col_dict.items():
+                    length = header_len[bi][vi]
+                    if length == 0:
+                        continue  # an empty slice has no mean; keep the word embedding
+                    words_embeddings[bi, ki, :] = torch.mean(
+                        header_embeddings[bi, vi, :length, :], dim=0)  # in-place overwrite of position ki
+
+        position_embeddings = self.position_embeddings(position_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+        embeddings = words_embeddings + position_embeddings + token_type_embeddings
+
+        if match_type_ids is not None:
+            match_type_embeddings = self.match_type_embeddings(match_type_ids)
+            embeddings += match_type_embeddings
+
+        if type_idx is not None:
+            type_embeddings = self.type_embeddings(type_idx)
+            embeddings += type_embeddings
+
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+class BertSelfAttention(nn.Module):
+    """Multi-head scaled dot-product self-attention without relation input."""
+
+    def __init__(self, config):
+        super().__init__()
+        # hidden_size has to split evenly across the attention heads.
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                'The hidden size (%d) is not a multiple of the number of attention '
+                'heads (%d)' %
+                (config.hidden_size, config.num_attention_heads))
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size
+                                       / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        # One projection per role; each maps hidden_size -> all_head_size.
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def transpose_for_scores(self, x):
+        """Split the last dim into (heads, head_size) and move heads to dim 1."""
+        per_head = x.view(*x.size()[:-1], self.num_attention_heads,
+                          self.attention_head_size)
+        return per_head.permute(0, 2, 1, 3)
+
+    def forward(self, hidden_states, attention_mask, schema_link_matrix=None):
+        """Compute multi-head self-attention over `hidden_states`.
+
+        Args:
+            hidden_states: input to the query/key/value projections.
+            attention_mask: tensor added to the scaled scores before softmax.
+            schema_link_matrix: accepted but never read by this variant.
+
+        Returns:
+            The context tensor with the heads merged back into a last
+            dimension of `all_head_size`.
+        """
+        queries = self.transpose_for_scores(self.query(hidden_states))
+        keys = self.transpose_for_scores(self.key(hidden_states))
+        values = self.transpose_for_scores(self.value(hidden_states))
+
+        # Raw query-key dot products, scaled by sqrt(head_size), plus the mask.
+        scores = torch.matmul(queries, keys.transpose(-1, -2))
+        scores = scores / math.sqrt(self.attention_head_size)
+        scores = scores + attention_mask
+
+        # Softmax over the last axis. The dropout drops entire tokens to
+        # attend to, which is taken from the original Transformer paper.
+        probs = self.dropout(nn.functional.softmax(scores, dim=-1))
+
+        # Weighted sum of the values, then merge the two head axes again.
+        context = torch.matmul(probs, values).permute(0, 2, 1, 3).contiguous()
+        return context.view(*context.size()[:-2], self.all_head_size)
+
+
+class BertSelfAttentionWithRelationsRAT(nn.Module):
+    '''Relation-aware self-attention: relation embeddings join the key side and the value side.
+    Adapted from https://github.com/microsoft/rat-sql/blob/master/ratsql/models/transformer.py
+    '''
+
+    def __init__(self, config):
+        super(BertSelfAttentionWithRelationsRAT, self).__init__()
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                'The hidden size (%d) is not a multiple of the number of attention '
+                'heads (%d)' %
+                (config.hidden_size, config.num_attention_heads))
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size
+                                       / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+        self.relation_k_emb = nn.Embedding(
+            7, config.hidden_size // config.num_attention_heads)
+        self.relation_v_emb = nn.Embedding(
+            7, config.hidden_size // config.num_attention_heads)
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads,
+                                       self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def forward(self, hidden_states, attention_mask, relation):
+        '''
+        relation is [batch, seq len, seq len]; its embeddings enter both the scores and the context.
+        '''
+        mixed_query_layer = self.query(
+            hidden_states)  # [batch, seq len, hidden dim]
+        mixed_key_layer = self.key(hidden_states)
+        mixed_value_layer = self.value(hidden_states)
+
+        relation_k = self.relation_k_emb(
+            relation)  # [batch, seq len, seq len, head dim]
+        relation_v = self.relation_v_emb(
+            relation)  # [batch, seq len, seq len, head dim]
+
+        query_layer = self.transpose_for_scores(
+            mixed_query_layer)  # [batch, num attn heads, seq len, head dim]
+        key_layer = self.transpose_for_scores(
+            mixed_key_layer)  # [batch, num attn heads, seq len, head dim]
+        value_layer = self.transpose_for_scores(
+            mixed_value_layer)  # [batch, num attn heads, seq len, head dim]
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(
+            -1, -2))  # [batch, num attn heads, seq len, seq len]
+
+        # relation_k_t is [batch, seq len, head dim, seq len]
+        relation_k_t = relation_k.transpose(-2, -1)
+        # query_layer_t is [batch, seq len, num attn heads, head dim]
+        query_layer_t = query_layer.permute(0, 2, 1, 3)
+        # relation_attention_scores is [batch, seq len, num attn heads, seq len]
+        relation_attention_scores = torch.matmul(query_layer_t, relation_k_t)
+        # relation_attention_scores_t is [batch, num attn heads, seq len, seq len]
+        relation_attention_scores_t = relation_attention_scores.permute(
+            0, 2, 1, 3)
+
+        merged_attention_scores = (attention_scores
+                                   + relation_attention_scores_t) / math.sqrt(
+                                       self.attention_head_size)
+
+        # Apply the attention mask (the original comment says it is precomputed for all layers in BertModel forward())
+        merged_attention_scores = merged_attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.Softmax(dim=-1)(merged_attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        # attention_probs is [batch, num attn heads, seq len, seq len]
+        attention_probs = self.dropout(attention_probs)
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        # attention_probs_t is [batch, seq len, num attn heads, seq len]
+        attention_probs_t = attention_probs.permute(0, 2, 1, 3)
+
+        #   [batch, seq len, num attn heads, seq len]
+        # * [batch, seq len, seq len, head dim]
+        # = [batch, seq len, num attn heads, head dim]
+        context_relation = torch.matmul(attention_probs_t, relation_v)
+
+        # context_relation_t is [batch, num attn heads, seq len, head dim]
+        context_relation_t = context_relation.permute(0, 2, 1, 3)
+
+        merged_context_layer = context_layer + context_relation_t
+        merged_context_layer = merged_context_layer.permute(0, 2, 1,
+                                                            3).contiguous()
+        new_context_layer_shape = merged_context_layer.size()[:-2] + (
+            self.all_head_size, )
+        merged_context_layer = merged_context_layer.view(
+            *new_context_layer_shape)
+        return merged_context_layer
+
+
+class BertSelfAttentionWithRelationsTableformer(nn.Module):
+
+    def __init__(self, config):
+        super(BertSelfAttentionWithRelationsTableformer, self).__init__()
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                'The hidden size (%d) is not a multiple of the number of attention '
+                'heads (%d)' %
+                (config.hidden_size, config.num_attention_heads))
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size
+                                       / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.schema_link_embeddings = nn.Embedding(7, self.num_attention_heads)  # one bias per head and relation id
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads,
+                                       self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def forward(self, hidden_states, attention_mask, relation):
+        '''Self-attention whose scaled scores get a learned per-head bias looked up from `relation`.
+        relation is [batch, seq len, seq len]
+        '''
+        mixed_query_layer = self.query(
+            hidden_states)  # [batch, seq len, hidden dim]
+        mixed_key_layer = self.key(hidden_states)
+        mixed_value_layer = self.value(hidden_states)
+
+        schema_link_embeddings = self.schema_link_embeddings(
+            relation)  # [batch, seq len, seq len, num attn heads]
+        schema_link_embeddings = schema_link_embeddings.permute(0, 3, 1, 2)  # -> [batch, heads, seq len, seq len]
+
+        query_layer = self.transpose_for_scores(
+            mixed_query_layer)  # [batch, num attn heads, seq len, head dim]
+        key_layer = self.transpose_for_scores(
+            mixed_key_layer)  # [batch, num attn heads, seq len, head dim]
+        value_layer = self.transpose_for_scores(
+            mixed_value_layer)  # [batch, num attn heads, seq len, head dim]
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(
+            -1, -2))  # [batch, num attn heads, seq len, seq len]
+        attention_scores = attention_scores / math.sqrt(
+            self.attention_head_size)
+
+        merged_attention_scores = attention_scores + schema_link_embeddings
+
+        # Apply the attention mask (the original comment says it is precomputed for all layers in BertModel forward())
+        merged_attention_scores = merged_attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.Softmax(dim=-1)(merged_attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        # attention_probs is [batch, num attn heads, seq len, seq len]
+        attention_probs = self.dropout(attention_probs)
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (
+            self.all_head_size, )
+        context_layer = context_layer.view(*new_context_layer_shape)
+        return context_layer
+
+
+class BertSelfOutput(nn.Module):
+    """Dense projection and dropout, then a residual add followed by LayerNorm."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        self.dense = nn.Linear(width, width)
+        self.LayerNorm = BertLayerNorm(width, eps=1e-12)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        projected = self.dropout(self.dense(hidden_states))
+        return self.LayerNorm(projected + input_tensor)
+
+
+class BertAttention(nn.Module):
+    """Self-attention ('none', 'rat' or 'add' variant) followed by BertSelfOutput."""
+
+    def __init__(self, config, schema_link_module='none'):
+        super(BertAttention, self).__init__()
+        if schema_link_module == 'none':
+            self.self = BertSelfAttention(config)
+        elif schema_link_module == 'rat':
+            self.self = BertSelfAttentionWithRelationsRAT(config)
+        elif schema_link_module == 'add':
+            self.self = BertSelfAttentionWithRelationsTableformer(config)
+        else:  # fail here instead of leaving `self.self` unset until forward()
+            raise ValueError('unknown schema_link_module: {!r}'.format(schema_link_module))
+        self.output = BertSelfOutput(config)
+
+    def forward(self, input_tensor, attention_mask, schema_link_matrix=None):
+        return self.output(self.self(input_tensor, attention_mask, schema_link_matrix), input_tensor)
+
+
+class BertIntermediate(nn.Module):
+    """Feed-forward expansion: linear hidden_size -> intermediate_size, then the activation."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        act = config.hidden_act  # either a key of ACT2FN or the callable itself
+        self.intermediate_act_fn = ACT2FN[act] if isinstance(act, str) else act
+
+    def forward(self, hidden_states):
+        # Expand to the intermediate width, then apply the configured non-linearity.
+        return self.intermediate_act_fn(self.dense(hidden_states))
+
+
+class BertOutput(nn.Module):
+    """Feed-forward contraction back to hidden_size with dropout, residual add and LayerNorm."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        contracted = self.dropout(self.dense(hidden_states))
+        # `input_tensor` is added back as the residual before normalization.
+        return self.LayerNorm(contracted + input_tensor)
+
+
+class BertLayer(nn.Module):
+    """One encoder layer: attention sub-layer, then the intermediate/output feed-forward pair."""
+
+    def __init__(self, config, schema_link_module='none'):
+        super().__init__()
+        # `schema_link_module` is forwarded to BertAttention, which picks the attention variant.
+        self.attention = BertAttention(config, schema_link_module=schema_link_module)
+        self.intermediate = BertIntermediate(config)
+        self.output = BertOutput(config)
+
+    def forward(self, hidden_states, attention_mask, schema_link_matrix=None):
+        attended = self.attention(hidden_states, attention_mask, schema_link_matrix)
+        expanded = self.intermediate(attended)
+        # The attention output is also passed as the second (residual) input of the output block.
+        return self.output(expanded, attended)
+
+
+class SqlBertEncoder(nn.Module):
+    """Stack of `layers` BertLayer modules built with the default schema-link setting."""
+
+    def __init__(self, layers, config):
+        super().__init__()
+        # Build one template layer and deep-copy it, so every layer starts from identical weights.
+        template = BertLayer(config)
+        self.layer = nn.ModuleList([copy.deepcopy(template) for _ in range(layers)])
+
+    def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
+        """Run the stack.
+
+        Returns a list: all layer outputs if `output_all_encoded_layers`, else just the last.
+        """
+        collected = []
+        for block in self.layer:
+            hidden_states = block(hidden_states, attention_mask)
+            if output_all_encoded_layers:
+                collected.append(hidden_states)
+        return collected if output_all_encoded_layers else [hidden_states]
+
+
+class BertEncoder(nn.Module):
+    """Stack of `config.num_hidden_layers` BertLayer modules sharing one schema-link variant."""
+
+    def __init__(self, config, schema_link_module='none'):
+        super().__init__()
+        # Deep-copy one template layer so that all layers start from identical weights.
+        template = BertLayer(config, schema_link_module=schema_link_module)
+        depth = config.num_hidden_layers
+        self.layer = nn.ModuleList([copy.deepcopy(template) for _ in range(depth)])
+
+    def forward(self, hidden_states, attention_mask, all_schema_link_matrix=None,
+                all_schema_link_mask=None, output_all_encoded_layers=True):
+        """Feed the same `all_schema_link_matrix` to every layer and collect the outputs.
+
+        `all_schema_link_mask` is accepted but never read here. Returns a list with all
+        layer outputs if `output_all_encoded_layers`, otherwise only the last one.
+        """
+        collected = []
+        for block in self.layer:
+            hidden_states = block(hidden_states, attention_mask, all_schema_link_matrix)
+            if output_all_encoded_layers:
+                collected.append(hidden_states)
+        return collected if output_all_encoded_layers else [hidden_states]
+
+
+class BertPooler(nn.Module):
+    """Summarize a sequence by passing its first position through Linear + Tanh."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        self.dense = nn.Linear(width, width)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states):
+        # "Pooling" here is just selecting index 0 along dim 1 (the first token),
+        # not a reduction over the whole sequence.
+        first_token = hidden_states[:, 0]
+        return self.activation(self.dense(first_token))
+
+
+class BertPredictionHeadTransform(nn.Module):
+    """Linear -> activation -> LayerNorm transform, all at hidden_size width."""
+
+    def __init__(self, config):
+        super().__init__()
+        width, act = config.hidden_size, config.hidden_act
+        self.dense = nn.Linear(width, width)
+        # `hidden_act` may be a key of ACT2FN or an already-callable activation.
+        self.transform_act_fn = ACT2FN[act] if isinstance(act, str) else act
+        self.LayerNorm = BertLayerNorm(width, eps=1e-12)
+
+    def forward(self, hidden_states):
+        activated = self.transform_act_fn(self.dense(hidden_states))
+        return self.LayerNorm(activated)
+
+
+class BertLMPredictionHead(nn.Module):
+    """Language-model head whose decoder reuses the given embedding weight matrix."""
+
+    def __init__(self, config, bert_model_embedding_weights):
+        super().__init__()
+        self.transform = BertPredictionHeadTransform(config)
+
+        # size(0) of the embedding matrix becomes the decoder's output width and
+        # size(1) its input width.
+        out_width = bert_model_embedding_weights.size(0)
+        in_width = bert_model_embedding_weights.size(1)
+        self.decoder = nn.Linear(in_width, out_width, bias=False)
+        # Tie the weights: the decoder holds the embedding parameter object itself,
+        # while the per-output bias below is a separate, head-only parameter.
+        self.decoder.weight = bert_model_embedding_weights
+        self.bias = nn.Parameter(torch.zeros(out_width))
+
+    def forward(self, hidden_states):
+        transformed = self.transform(hidden_states)
+        return self.decoder(transformed) + self.bias
+
+
+class BertOnlyMLMHead(nn.Module):
+    """Thin wrapper exposing a BertLMPredictionHead under the attribute `predictions`."""
+
+    def __init__(self, config, bert_model_embedding_weights):
+        super().__init__()
+        lm_head = BertLMPredictionHead(config, bert_model_embedding_weights)
+        self.predictions = lm_head
+
+    def forward(self, sequence_output):
+        return self.predictions(sequence_output)
+
+
+class BertOnlyNSPHead(nn.Module):
+    """Two-way linear classifier applied to the pooled output."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, pooled_output):
+        return self.seq_relationship(pooled_output)
+
+
+class BertPreTrainingHeads(nn.Module):
+    """Combines the LM prediction head with a two-way classifier on the pooled output."""
+
+    def __init__(self, config, bert_model_embedding_weights):
+        super().__init__()
+        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, sequence_output, pooled_output):
+        # Returns (LM scores over `sequence_output`, 2-way scores over `pooled_output`).
+        lm_scores = self.predictions(sequence_output)
+        return lm_scores, self.seq_relationship(pooled_output)
+
+
+class PreTrainedBertModel(nn.Module):
+    """ An abstract class to handle weights initialization and
+        a simple interface for loading pretrained models from a directory or archive.
+    """
+
+    def __init__(self, config, *inputs, **kwargs):
+        super(PreTrainedBertModel, self).__init__()
+        if not isinstance(config, Star3Config):
+            raise ValueError(
+                'Parameter config in `{}(config)` should be an instance of class `Star3Config`. '
+                'To create a model from a Google pretrained model use '
+                '`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`'.format(
+                    self.__class__.__name__, self.__class__.__name__))
+        self.config = config
+
+    def init_bert_weights(self, module):
+        """ Initialize the weights of one module (meant to be used with `nn.Module.apply`).
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(
+                mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, BertLayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None,
+                        *inputs, **kwargs):
+        """
+        Instantiate a model from a local pre-trained model directory or `.tar.gz` archive,
+        optionally taking the weights from an explicit pytorch state dict.
+
+        Params:
+            pretrained_model_name: path to either
+                - a directory, or
+                - a gzip-compressed tar archive (extracted to a temp dir that is always removed),
+                containing:
+                    . the `CONFIG_NAME` configuration file for the model
+                    . the `WEIGHTS_NAME` PyTorch weights dump (only read when `state_dict` is None)
+            state_dict: an optional state dictionary (collections.OrderedDict object)
+                to use instead of the weights file; it is copied, never modified in place
+            cache_dir: accepted for backward compatibility; not used by this implementation
+            *inputs, **kwargs: additional input for the specific Bert class
+                (ex: num_labels for BertForSequenceClassification)
+
+        Returns:
+            the instantiated model; missing/unexpected keys and load errors are only reported
+        """
+        tempdir = None
+        try:
+            if os.path.isdir(pretrained_model_name):
+                serialization_dir = pretrained_model_name
+            else:
+                # Extract archive to temp dir
+                tempdir = tempfile.mkdtemp()
+                logger.info('extracting archive file {} to temp dir {}'.format(
+                    pretrained_model_name, tempdir))
+                # NOTE(review): extractall() trusts member paths; only use trusted archives.
+                with tarfile.open(pretrained_model_name, 'r:gz') as archive:
+                    archive.extractall(tempdir)
+                serialization_dir = tempdir
+            return cls._load_from_dir(serialization_dir, state_dict, inputs, kwargs)
+        finally:
+            # Clean up temp dir, also when extraction or loading raised
+            if tempdir:
+                shutil.rmtree(tempdir)
+
+    @classmethod
+    def _load_from_dir(cls, serialization_dir, state_dict, inputs, kwargs):
+        """Build `cls` from the config in `serialization_dir` and load the weights into it."""
+        config_file = os.path.join(serialization_dir, CONFIG_NAME)
+        config = Star3Config.from_json_file(config_file)
+        logger.info('Model config {}'.format(config))
+        # Instantiate model.
+        model = cls(config, *inputs, **kwargs)
+        if state_dict is None:
+            weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
+            # map_location keeps GPU-saved checkpoints loadable on CPU-only hosts
+            state_dict = torch.load(weights_path, map_location='cpu')
+
+        # copy first, so neither the renaming below nor the loading touches the caller's object
+        metadata = getattr(state_dict, '_metadata', None)
+        state_dict = state_dict.copy()
+        if metadata is not None:
+            state_dict._metadata = metadata
+
+        # Rename keys containing 'gamma' / 'beta' to use 'weight' / 'bias' instead
+        for key in list(state_dict.keys()):
+            new_key = None
+            if 'gamma' in key:
+                new_key = key.replace('gamma', 'weight')
+            if 'beta' in key:
+                new_key = key.replace('beta', 'bias')
+            if new_key:
+                state_dict[new_key] = state_dict.pop(key)
+
+        missing_keys = []
+        unexpected_keys = []
+        error_msgs = []
+
+        def load(module, prefix=''):
+            local_metadata = {} if metadata is None else metadata.get(
+                prefix[:-1], {})
+            module._load_from_state_dict(state_dict, prefix, local_metadata,
+                                         True, missing_keys, unexpected_keys,
+                                         error_msgs)
+            for name, child in module._modules.items():
+                if child is not None:
+                    load(child, prefix + name + '.')
+
+        load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
+        if len(missing_keys) > 0:
+            logger.info(
+                'Weights of {} not initialized from pretrained model: {}'.
+                format(model.__class__.__name__, missing_keys))
+            print()
+            print('*' * 10, 'WARNING missing weights', '*' * 10)
+            print('Weights of {} not initialized from pretrained model: {}'.
+                  format(model.__class__.__name__, missing_keys))
+            print()
+        if len(unexpected_keys) > 0:
+            logger.info(
+                'Weights from pretrained model not used in {}: {}'.format(
+                    model.__class__.__name__, unexpected_keys))
+            print()
+            print('*' * 10, 'WARNING unexpected weights', '*' * 10)
+            print('Weights from pretrained model not used in {}: {}'.format(
+                model.__class__.__name__, unexpected_keys))
+            print()
+        if len(error_msgs) > 0:
+            # these messages used to be collected and then silently dropped
+            logger.warning('Errors while loading state_dict for {}: {}'.format(
+                model.__class__.__name__, error_msgs))
+        return model
+
+
+class Star3Model(PreTrainedBertModel):
+    """BERT-style encoder for STAR3: embeddings, schema-link-aware encoder stack, pooler.
+
+    Params:
+        config: a Star3Config class instance with the configuration to build a new model
+        schema_link_module: name of the attention variant, forwarded to BertEncoder.
+            Default: `'none'`.
+
+    Submodules (all built from `config`, then initialized through `init_bert_weights`):
+        `embeddings`: BertEmbeddings
+        `encoder`: BertEncoder
+        `pooler`: BertPooler
+
+    Call sketch. Only `input_ids` and `header_ids` are required by the signature; note
+    that the third positional parameter of `forward` is `token_order_ids`, so masks and
+    segment ids are best passed by keyword (which of the remaining inputs the embedding
+    layer really needs is not visible from this class):
+    ```python
+    model = Star3Model(config=config, schema_link_module='rat')
+    encoded_layers, pooled_output = model(
+        input_ids, header_ids, token_type_ids=token_type_ids, attention_mask=input_mask)
+    ```
+    """
+
+    def __init__(self, config, schema_link_module='none'):
+        super().__init__(config)
+        self.embeddings = BertEmbeddings(config)
+        self.encoder = BertEncoder(config, schema_link_module=schema_link_module)
+        self.pooler = BertPooler(config)
+        # Applied last so that every submodule created above is visited.
+        self.apply(self.init_bert_weights)
+
+    def forward(self,
+                input_ids,
+                header_ids,
+                token_order_ids=None,
+                token_type_ids=None,
+                attention_mask=None,
+                match_type_ids=None,
+                l_hs=None,
+                header_len=None,
+                type_ids=None,
+                col_dict_list=None,
+                ids=None,
+                header_flatten_tokens=None,
+                header_flatten_index=None,
+                header_flatten_output=None,
+                token_column_id=None,
+                token_column_mask=None,
+                column_start_index=None,
+                headers_length=None,
+                all_schema_link_matrix=None,
+                all_schema_link_mask=None,
+                output_all_encoded_layers=True):
+        """Encode the inputs and pool the last layer.
+
+        Inputs:
+            `input_ids`: token id tensor; `torch.ones_like` / `torch.zeros_like` of it
+                supply the default `attention_mask` / `token_type_ids`
+            `header_ids`, `match_type_ids`, `l_hs`, `header_len`, `type_ids`,
+            `col_dict_list`, `ids`, `header_flatten_tokens`, `header_flatten_index`,
+            `header_flatten_output`, `token_column_id`, `token_column_mask`,
+            `column_start_index`, `headers_length`: handed to the embedding layer
+                as they are, together with `input_ids` and `token_type_ids`
+            `token_order_ids`: accepted but not read by this method
+            `token_type_ids`: optional; all zeros when omitted
+            `attention_mask`: optional; all ones when omitted. Entries equal to 1 mark
+                positions to attend to, entries equal to 0 are masked out
+            `all_schema_link_matrix`, `all_schema_link_mask`: handed to the encoder
+            `output_all_encoded_layers`: boolean which controls the content of the
+                `encoded_layers` output as described below. Default: `True`.
+
+        Outputs: Tuple of (encoded_layers, pooled_output)
+            `encoded_layers`:
+                - `output_all_encoded_layers=True`: the full list returned by the
+                    encoder (one hidden-state tensor per layer)
+                - `output_all_encoded_layers=False`: only the last element of that list
+            `pooled_output`: the pooler applied to the last element of that list
+        """
+        # Default masks: attend to every position, segment id 0 everywhere.
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+
+        # Insert singleton dims at positions 1 and 2 so that a 2D [batch_size, to_seq_length]
+        # mask becomes [batch_size, 1, 1, to_seq_length] and broadcasts against
+        # [batch_size, num_heads, from_seq_length, to_seq_length] attention scores.
+        broadcast_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+        # Turn the 1/0 mask into an additive bias: 0.0 for positions to attend to and
+        # -10000.0 for masked positions. Added to the raw scores before the softmax,
+        # this is effectively the same as removing the masked positions.
+        #
+        # Bowen: the fp16 cast `.to(self.dtype)` that used to precede this line is
+        # left out for Pytorch >= 1.5, see
+        # https://github.com/huggingface/transformers/issues/3936#issuecomment-793764416
+        additive_mask = (1.0 - broadcast_mask) * -10000.0
+
+        embedded = self.embeddings(
+            input_ids, header_ids, token_type_ids, match_type_ids, l_hs,
+            header_len, type_ids, col_dict_list, ids, header_flatten_tokens,
+            header_flatten_index, header_flatten_output, token_column_id,
+            token_column_mask, column_start_index, headers_length)
+        layer_outputs = self.encoder(
+            embedded,
+            additive_mask,
+            all_schema_link_matrix=all_schema_link_matrix,
+            all_schema_link_mask=all_schema_link_mask,
+            output_all_encoded_layers=output_all_encoded_layers)
+        last_layer = layer_outputs[-1]
+        pooled_output = self.pooler(last_layer)
+        if output_all_encoded_layers:
+            return layer_outputs, pooled_output
+        return last_layer, pooled_output
+
+
+class Seq2SQL(nn.Module):
+    """Linear prediction heads applied to encoder vectors gathered at fixed slots.
+
+    Every head is a single `nn.Linear`; `forward` gathers the vectors it feeds them
+    from positions derived from `start_index`, `column_index`, `l_n` and `l_hs`.
+    """
+
+    def __init__(self, iS, hS, lS, dr, n_cond_ops, n_agg_ops, n_action_ops,
+                 max_select_num, max_where_num, device):
+        super().__init__()
+        # iS is the input width of every head; hS, lS (stored as `ls`) and dr are only kept.
+        self.iS = iS
+        self.hS = hS
+        self.ls = lS
+        self.dr = dr
+        self.device = device
+
+        self.n_agg_ops = n_agg_ops
+        self.n_cond_ops = n_cond_ops
+        self.n_action_ops = n_action_ops
+        self.max_select_num = max_select_num
+        self.max_where_num = max_where_num
+
+        # Heads applied to each query-token vector (w_sss, w_sse) or header vector (s_ht, wc_ht).
+        self.w_sss_model = nn.Linear(iS, max_where_num)
+        self.w_sse_model = nn.Linear(iS, max_where_num)
+        self.s_ht_model = nn.Linear(iS, max_select_num)
+        self.wc_ht_model = nn.Linear(iS, max_where_num)
+
+        # Heads applied to the concatenation of all select / where slot vectors.
+        self.select_agg_model = nn.Linear(iS * max_select_num, n_agg_ops * max_select_num)
+        self.w_op_model = nn.Linear(iS * max_where_num, n_cond_ops * max_where_num)
+
+        # Heads applied to one slot vector each.
+        self.conn_model = nn.Linear(iS, 3)
+        self.action_model = nn.Linear(iS, n_action_ops + 1)
+        self.slen_model = nn.Linear(iS, max_select_num + 1)
+        self.wlen_model = nn.Linear(iS, max_where_num + 1)
+
+    def forward(self, wemb_layer, l_n, l_hs, start_index, column_index, tokens,
+                ids):
+        """Run every head on the samples selected by `ids`.
+
+        `l_n`, `l_hs`, `start_index` and `column_index` are per-sample sequences that
+        are sub-selected with `ids`; `tokens` is not used.
+
+        Returns the tuple (s_action, sc_output, sa_output, s_cco, wc_output, wo_output,
+        (wv_ss, wv_se), (s_slen, s_wlen)).
+        """
+        # Padding sizes come from the full lists, i.e. before the `ids` sub-selection.
+        max_l_n = max(l_n)
+        max_l_hs = max(l_hs)
+        # chunk input lists for multi-gpu
+        picked = ids.cpu().numpy()
+        l_n = np.array(l_n)[picked].tolist()
+        l_hs = np.array(l_hs)[picked].tolist()
+        start_index = np.array(start_index)[picked].tolist()
+        column_index = np.array(column_index)[picked].tolist()
+
+        n_where, n_select = self.max_where_num, self.max_select_num
+        conn_index, slen_index, wlen_index, action_index = [], [], [], []
+        where_op_index, select_agg_index = [], []
+        header_pos_index, query_index = [], []
+        for ib, base in enumerate(start_index):
+            # Slot positions are fixed offsets from `base`; layout noted by the authors:
+            # [SEP] conn [SEP] wlen [SEP] (wop [SEP])*wn slen [SEP] (agg [SEP])*sn
+            action_index.append(base + 1)
+            conn_index.append(base + 2)
+            wlen_index.append(base + 3)
+            where_op_index.append([base + 4 + k for k in range(n_where)])
+            slen_index.append(base + 4 + n_where)
+            select_agg_index.append([base + 5 + n_where + k for k in range(n_select)])
+
+            # Positions 0 .. l_n + 1, padded to max_l_n + 2 by repeating the last one.
+            n_tok = l_n[ib]
+            query_index.append(list(range(n_tok + 2)) + [n_tok + 1] * (max_l_n - n_tok))
+
+            # l_hs consecutive positions from column_index, padded by repeating the first.
+            cols = [column_index[ib] + k for k in range(l_hs[ib])]
+            cols += [cols[0] for _ in range(max_l_hs - len(cols))]
+            header_pos_index.append(cols)
+
+        def as_index(rows):
+            return torch.tensor(rows, dtype=torch.long).to(self.device)
+
+        conn_index = as_index(conn_index)
+        slen_index = as_index(slen_index)
+        wlen_index = as_index(wlen_index)
+        action_index = as_index(action_index)
+        where_op_index = as_index(where_op_index)
+        select_agg_index = as_index(select_agg_index)
+        query_index = as_index(query_index)
+        header_index = as_index(header_pos_index)
+
+        bS = len(l_n)
+        conn_emb = torch.zeros([bS, self.iS]).to(self.device)
+        slen_emb = torch.zeros([bS, self.iS]).to(self.device)
+        wlen_emb = torch.zeros([bS, self.iS]).to(self.device)
+        action_emb = torch.zeros([bS, self.iS]).to(self.device)
+        wo_emb = torch.zeros([bS, n_where, self.iS]).to(self.device)
+        sa_emb = torch.zeros([bS, n_select, self.iS]).to(self.device)
+        qv_emb = torch.zeros([bS, max_l_n + 2, self.iS]).to(self.device)
+        ht_emb = torch.zeros([bS, max_l_hs, self.iS]).to(self.device)
+        for i in range(bS):
+            sample = wemb_layer[i]
+            conn_emb[i, :] = sample.index_select(0, conn_index[i])
+            slen_emb[i, :] = sample.index_select(0, slen_index[i])
+            wlen_emb[i, :] = sample.index_select(0, wlen_index[i])
+            action_emb[i, :] = sample.index_select(0, action_index[i])
+            wo_emb[i, :, :] = sample.index_select(0, where_op_index[i, :])
+            sa_emb[i, :, :] = sample.index_select(0, select_agg_index[i, :])
+            qv_emb[i, :, :] = sample.index_select(0, query_index[i, :])
+            ht_emb[i, :, :] = sample.index_select(0, header_index[i, :])
+
+        # Heads over a single slot vector.
+        s_cco = self.conn_model(conn_emb.reshape(-1, self.iS)).reshape(bS, 3)
+        s_slen = self.slen_model(slen_emb.reshape(-1, self.iS)).reshape(bS, n_select + 1)
+        s_wlen = self.wlen_model(wlen_emb.reshape(-1, self.iS)).reshape(bS, n_where + 1)
+        s_action = self.action_model(action_emb.reshape(-1, self.iS)).reshape(
+            bS, self.n_action_ops + 1)
+
+        # Slot-group heads: the slot vectors of one sample are flattened together first.
+        wo_output = self.w_op_model(wo_emb.reshape(-1, self.iS * n_where)).reshape(
+            bS, -1, self.n_cond_ops)
+        sa_output = self.select_agg_model(sa_emb.reshape(-1, self.iS * n_select)).reshape(
+            bS, -1, self.n_agg_ops)
+
+        # Per-header and per-query-position heads, transposed so that dim 1 indexes
+        # the where / select slot.
+        wc_output = self.wc_ht_model(ht_emb.reshape(-1, self.iS)).reshape(
+            bS, -1, n_where).transpose(1, 2)
+        sc_output = self.s_ht_model(ht_emb.reshape(-1, self.iS)).reshape(
+            bS, -1, n_select).transpose(1, 2)
+        wv_ss = self.w_sss_model(qv_emb.reshape(-1, self.iS)).reshape(
+            bS, -1, n_where).transpose(1, 2)
+        wv_se = self.w_sse_model(qv_emb.reshape(-1, self.iS)).reshape(
+            bS, -1, n_where).transpose(1, 2)
+
+        return s_action, sc_output, sa_output, s_cco, wc_output, wo_output, (
+            wv_ss, wv_se), (s_slen, s_wlen)
diff --git a/modelscope/models/nlp/structbert/configuration_sbert.py b/modelscope/models/nlp/structbert/configuration_sbert.py
index 374d4b62..a727a978 100644
--- a/modelscope/models/nlp/structbert/configuration_sbert.py
+++ b/modelscope/models/nlp/structbert/configuration_sbert.py
@@ -85,7 +85,7 @@ class SbertConfig(PretrainedConfig):
             If adv_bound not proveded, 2 * sigma will be used as the adv_bound factor
     """
 
-    model_type = 'sbert'
+    model_type = 'structbert'
 
     def __init__(self,
                  vocab_size=30522,
diff --git a/modelscope/models/nlp/table_question_answering.py b/modelscope/models/nlp/table_question_answering.py
new file mode 100644
index 00000000..3c91a518
--- /dev/null
+++ b/modelscope/models/nlp/table_question_answering.py
@@ -0,0 +1,745 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+from typing import Dict
+
+import numpy
+import torch
+import torch.nn.functional as F
+from transformers import BertTokenizer
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Model, Tensor
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.star3.configuration_star3 import Star3Config
+from modelscope.models.nlp.star3.modeling_star3 import Seq2SQL, Star3Model
+from modelscope.preprocessors.star3.fields.struct import Constant
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.device import verify_device
+
+__all__ = ['TableQuestionAnswering']
+
+
+@MODELS.register_module(
+    Tasks.table_question_answering, module_name=Models.star3)
+class TableQuestionAnswering(Model):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the table-question-answering model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+            device (str, optional): target device, 'gpu' when not given.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        self.tokenizer = BertTokenizer(
+            os.path.join(model_dir, ModelFile.VOCAB_FILE))
+        device_name = kwargs.get('device', 'gpu')
+        verify_device(device_name)
+        # torch has no 'gpu' device: use 'cuda', or 'cpu' when CUDA is missing.
+        self._device_name = device_name.lower().replace('gpu', 'cuda') \
+            if torch.cuda.is_available() else 'cpu'
+        state_dict = torch.load(
+            os.path.join(self.model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
+            map_location='cpu')
+        self.backbone_config = Star3Config.from_json_file(
+            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
+        self.backbone_model = Star3Model(
+            config=self.backbone_config, schema_link_module='rat')
+        self.backbone_model.load_state_dict(state_dict['backbone_model'])
+
+        constant = Constant()
+        self.agg_ops = constant.agg_ops
+        self.cond_ops = constant.cond_ops
+        self.cond_conn_ops = constant.cond_conn_ops
+        self.action_ops = constant.action_ops
+        self.max_select_num = constant.max_select_num
+        self.max_where_num = constant.max_where_num
+        self.col_type_dict = constant.col_type_dict
+        self.schema_link_dict = constant.schema_link_dict
+        n_cond_ops = len(self.cond_ops)
+        n_agg_ops = len(self.agg_ops)
+        n_action_ops = len(self.action_ops)
+        iS = self.backbone_config.hidden_size
+        self.head_model = Seq2SQL(iS, 100, 2, 0.0, n_cond_ops, n_agg_ops,
+                                  n_action_ops, self.max_select_num,
+                                  self.max_where_num, self._device_name)
+        self.head_model.load_state_dict(state_dict['head_model'], strict=False)
+        self.backbone_model.to(self._device_name)
+        self.head_model.to(self._device_name)
+
+    def convert_string(self, pr_wvi, nlu, nlu_tt):
+        convs = []
+        for b, nlu1 in enumerate(nlu):
+            conv_dict = {}
+            nlu_tt1 = nlu_tt[b]
+            idx = 0
+            convflag = True
+            for i, ntok in enumerate(nlu_tt1):
+                if idx >= len(nlu1):
+                    convflag = False
+                    break
+                # Remove the '##' marker before matching the token.
+                if ntok.startswith('##'):
+                    ntok = ntok.replace('##', '')
+                # tok is the single question character at idx, lower-cased.
+                tok = nlu1[idx:idx + 1].lower()
+                if ntok == tok:
+                    conv_dict[i] = [idx, idx + 1]
+                    idx += 1
+                elif ntok == '#':
+                    conv_dict[i] = [idx, idx]
+                elif ntok == '[UNK]':
+                    conv_dict[i] = [idx, idx + 1]
+                    j = i + 1
+                    idx += 1
+                    if idx < len(nlu1) and j < len(
+                            nlu_tt1) and nlu_tt1[j] != '[UNK]':
+                        while idx < len(nlu1):
+                            val = nlu1[idx:idx + 1].lower()
+                            if nlu_tt1[j].startswith(val):
+                                break
+                            idx += 1
+                        conv_dict[i][1] = idx
+                elif tok in ntok:
+                    startid = idx
+                    idx += 1
+                    while idx < len(nlu1):
+                        tok += nlu1[idx:idx + 1].lower()
+                        if ntok == tok:
+                            conv_dict[i] = [startid, idx + 1]
+                            break
+                        idx += 1
+                    idx += 1
+                else:
+                    convflag = False
+            # Aligned: slice the question text; otherwise join the tokens.
+            conv = []
+            if convflag:
+                for pr_wvi1 in pr_wvi[b]:
+                    s1, e1 = conv_dict[pr_wvi1[0]]
+                    s2, e2 = conv_dict[pr_wvi1[1]]
+                    newidx = pr_wvi1[1]
+                    while newidx + 1 < len(
+                            nlu_tt1) and s2 == e2 and nlu_tt1[newidx] == '#':
+                        newidx += 1
+                        s2, e2 = conv_dict[newidx]
+                    if newidx + 1 < len(nlu_tt1) and nlu_tt1[
+                            newidx + 1].startswith('##'):
+                        s2, e2 = conv_dict[newidx + 1]
+                    phrase = nlu1[s1:e2]
+                    conv.append(phrase)
+            else:
+                for pr_wvi1 in pr_wvi[b]:
+                    phrase = ''.join(nlu_tt1[pr_wvi1[0]:pr_wvi1[1]
+                                             + 1]).replace('##', '')
+                    conv.append(phrase)
+            convs.append(conv)
+
+        return convs
+
+    def get_fields_info(self, t1s, tables, train=True):
+        """Split sample dicts into 11 parallel lists (`tables` is unused)."""
+        required = ('question', 'question_tok', 'header_tok',
+                    'bertindex_knowledge', 'header_knowledge', 'types',
+                    'units')
+        cols = {key: [] for key in required}
+        sql_i, action, his_sql, schema_link = [], [], [], []
+        for sample in t1s:
+            for key in required:
+                cols[key].append(sample[key])
+            his_sql.append(sample.get('history_sql'))
+            schema_link.append(sample.get('schema_link', []))
+            if train:  # targets are only read when train is set
+                action.append(sample.get('action', [0]))
+                sql_i.append(sample['sql'])
+        nlu, nlu_t, hs_t, q_know, t_know, types, units = cols.values()
+        return (nlu, nlu_t, sql_i, q_know, t_know, action, hs_t, types, units,
+                his_sql, schema_link)
+
+    def get_history_select_where(self, his_sql, header_len):
+        """Collect the columns used by the previous SQL, shifted by one.
+
+        Args:
+            his_sql: dict with `sel` and `conds` entries, or None.
+            header_len: exclusive upper bound for a shifted index.
+        Returns:
+            (sel, whe): sorted, de-duplicated shifted indices; each list
+            falls back to `[0]` when nothing is kept or `his_sql` is None.
+        """
+        if his_sql is None:
+            return [0], [0]
+
+        def shift_columns(columns):
+            kept = []
+            for column in columns:
+                position = column + 1
+                if position < header_len and position not in kept:
+                    kept.append(position)
+            return sorted(kept) if kept else [0]
+
+        sel = shift_columns(his_sql['sel'])
+        return sel, shift_columns(cond[0] for cond in his_sql['conds'])
+
+    def get_types_ids(self, col_type):
+        """Return ids of the first col_type_dict key found in `col_type`."""
+        key = next(
+            (k for k in self.col_type_dict if k in col_type.lower()), 'null')
+        return self.col_type_dict[key]
+
+    def generate_inputs(self, nlu1_tok, hs_t_1, type_t, unit_t, his_sql,
+                        q_know, t_know, s_link):
+        tokens = []
+        orders = []
+        types = []
+        segment_ids = []
+        matchs = []
+        col_dict = {}
+        schema_tok = []
+
+        tokens.append('[CLS]')
+        orders.append(0)
+        types.append(0)
+        i_st_nlu = len(tokens)
+
+        matchs.append(0)
+        segment_ids.append(0)
+        for idx, token in enumerate(nlu1_tok):
+            if q_know[idx] == 100:
+                break
+            elif q_know[idx] >= 5:
+                matchs.append(1)
+            else:
+                matchs.append(q_know[idx] + 1)
+            tokens.append(token)
+            orders.append(0)
+            types.append(0)
+            segment_ids.append(0)
+
+        i_ed_nlu = len(tokens)
+
+        tokens.append('[SEP]')
+        orders.append(0)
+        types.append(0)
+        matchs.append(0)
+        segment_ids.append(0)
+        # Optional history block: select [PAD]* where [PAD]* [SEP].
+        sel, whe = self.get_history_select_where(his_sql, len(hs_t_1))
+
+        if len(sel) == 1 and sel[0] == 0 \
+                and len(whe) == 1 and whe[0] == 0:
+            pass
+        else:
+            tokens.append('select')
+            orders.append(0)
+            types.append(0)
+            matchs.append(10)
+            segment_ids.append(0)
+
+            for seli in sel:
+                tokens.append('[PAD]')
+                orders.append(0)
+                types.append(0)
+                matchs.append(10)
+                segment_ids.append(0)
+                col_dict[len(tokens) - 1] = seli
+
+            tokens.append('where')
+            orders.append(0)
+            types.append(0)
+            matchs.append(10)
+            segment_ids.append(0)
+
+            for whei in whe:
+                tokens.append('[PAD]')
+                orders.append(0)
+                types.append(0)
+                matchs.append(10)
+                segment_ids.append(0)
+                col_dict[len(tokens) - 1] = whei
+
+            tokens.append('[SEP]')
+            orders.append(0)
+            types.append(0)
+            matchs.append(10)
+            segment_ids.append(0)
+        # One '[PAD]' placeholder per header; header text goes to schema_tok.
+        column_start = len(tokens)
+        i_hds_f = []
+        header_flatten_tokens, header_flatten_index = [], []
+        for i, hds11 in enumerate(hs_t_1):
+            if len(unit_t[i]) == 1 and unit_t[i][0] == 'null':
+                temp_header_tokens = hds11
+            else:
+                temp_header_tokens = hds11 + unit_t[i]
+            schema_tok.append(temp_header_tokens)
+            header_flatten_tokens.extend(temp_header_tokens)
+            header_flatten_index.extend([i + 1] * len(temp_header_tokens))
+            i_st_hd_f = len(tokens)
+            tokens += ['[PAD]']
+            orders.append(0)
+            types.append(self.get_types_ids(type_t[i]))
+            i_ed_hd_f = len(tokens)
+            col_dict[len(tokens) - 1] = i
+            i_hds_f.append((i_st_hd_f, i_ed_hd_f))
+            if i == 0:
+                matchs.append(6)
+            else:
+                matchs.append(t_know[i - 1] + 6)
+            segment_ids.append(1)
+
+        tokens.append('[SEP]')
+        orders.append(0)
+        types.append(0)
+        matchs.append(0)
+        segment_ids.append(1)
+
+        # start_ids: index of the [SEP] appended just above; the task slot
+        # tokens added below come right after it
+        start_ids = len(tokens) - 1
+
+        tokens.append('action')  # action
+        orders.append(1)
+        types.append(0)
+        matchs.append(0)
+        segment_ids.append(1)
+
+        tokens.append('connect')  # column
+        orders.append(1)
+        types.append(0)
+        matchs.append(0)
+        segment_ids.append(1)
+
+        tokens.append('allen')  # select len
+        orders.append(1)
+        types.append(0)
+        matchs.append(0)
+        segment_ids.append(1)
+
+        for x in range(self.max_where_num):
+            tokens.append('act')  # op
+            orders.append(2 + x)
+            types.append(0)
+            matchs.append(0)
+            segment_ids.append(1)
+
+        tokens.append('size')  # where len
+        orders.append(1)
+        types.append(0)
+        matchs.append(0)
+        segment_ids.append(1)
+
+        for x in range(self.max_select_num):
+            tokens.append('focus')  # agg
+            orders.append(2 + x)
+            types.append(0)
+            matchs.append(0)
+            segment_ids.append(1)
+
+        i_nlu = (i_st_nlu, i_ed_nlu)
+        # Fill question-row / header-column cells from the s_link relations.
+        schema_link_matrix = numpy.zeros((len(tokens), len(tokens)),
+                                         dtype='int32')
+        schema_link_mask = numpy.zeros((len(tokens), len(tokens)),
+                                       dtype='float32')
+        for relation in s_link:
+            if relation['label'] in ['col', 'val']:
+                [q_st, q_ed] = relation['question_index']
+                cid = max(0, relation['column_index'])
+                schema_link_matrix[
+                    i_st_nlu + q_st: i_st_nlu + q_ed + 1,
+                    column_start + cid + 1: column_start + cid + 1 + 1] = \
+                    self.schema_link_dict[relation['label'] + '_middle']
+                schema_link_matrix[
+                    i_st_nlu + q_st,
+                    column_start + cid + 1: column_start + cid + 1 + 1] = \
+                    self.schema_link_dict[relation['label'] + '_start']
+                schema_link_matrix[
+                    i_st_nlu + q_ed,
+                    column_start + cid + 1: column_start + cid + 1 + 1] = \
+                    self.schema_link_dict[relation['label'] + '_end']
+                schema_link_mask[i_st_nlu + q_st:i_st_nlu + q_ed + 1,
+                                 column_start + cid + 1:column_start + cid + 1
+                                 + 1] = 1.0
+
+        return tokens, orders, types, segment_ids, matchs, \
+            i_nlu, i_hds_f, start_ids, column_start, col_dict, schema_tok, \
+            header_flatten_tokens, header_flatten_index, schema_link_matrix, schema_link_mask
+
+    def gen_l_hpu(self, i_hds):
+        """Flatten per-sample header spans into one list of span lengths.
+
+        Every (start, end) pair of every sample contributes `end - start`,
+        in order, so the columns of the whole batch form one flat sequence.
+        Example: [[(17, 18), (19, 21)], [(22, 25)]] -> [1, 2, 3]
+        """
+        return [
+            span[1] - span[0]  # end - start
+            for sample_spans in i_hds
+            for span in sample_spans
+        ]
+
+    def get_bert_output(self, model_bert, tokenizer, nlu_t, hs_t, col_types,
+                        units, his_sql, q_know, t_know, schema_link):
+        """
+        Build the batched model inputs and run them through `model_bert`.
+
+        INPUT
+        :param model_bert: callable backbone model, see the call at the end
+        :param tokenizer: provides `convert_tokens_to_ids`
+        :param nlu_t: per-sample question tokens
+        :param hs_t: per-sample header tokens; the last header is moved first
+        :param col_types, units: per-sample lists aligned with `hs_t`
+        :param his_sql, q_know, t_know, schema_link: per-sample values that
+            are passed on to `generate_inputs`
+
+        OUTPUT
+        all_encoder_layer, pooled_output: the two values `model_bert` returns
+        tokens, i_nlu, i_hds: per-sample token lists and question/header spans
+        l_n, l_hpu, l_hs: question lengths, header span lengths, header counts
+        start_index, column_index, all_ids: per-sample offsets and batch ids
+
+        """
+
+        l_n = []
+        l_hs = []  # number of headers of each sample
+
+        input_ids = []
+        order_ids = []
+        type_ids = []
+        segment_ids = []
+        match_ids = []
+        input_mask = []
+
+        i_nlu = [
+        ]  # index to retrieve the position of contextual vector later.
+        i_hds = []
+        tokens = []
+        orders = []
+        types = []
+        matchs = []
+        segments = []
+        schema_link_matrix_list = []
+        schema_link_mask_list = []
+        start_index = []
+        column_index = []
+        col_dict_list = []
+        header_list = []
+        header_flatten_token_list = []
+        header_flatten_tokenid_list = []
+        header_flatten_index_list = []
+
+        header_tok_max_len = 0
+        cur_max_length = 0
+
+        for b, nlu_t1 in enumerate(nlu_t):
+            hs_t1 = [hs_t[b][-1]] + hs_t[b][:-1]
+            type_t1 = [col_types[b][-1]] + col_types[b][:-1]
+            unit_t1 = [units[b][-1]] + units[b][:-1]
+            l_hs.append(len(hs_t1))
+
+            # [CLS] nlu [SEP] (history [SEP]) [PAD] per header [SEP] slot tokens
+            # Generate BERT inputs & indices.
+            tokens1, orders1, types1, segment1, match1, i_nlu1, i_hds_1, \
+                start_idx, column_start, col_dict, schema_tok, \
+                header_flatten_tokens, header_flatten_index, schema_link_matrix, schema_link_mask = \
+                self.generate_inputs(
+                    nlu_t1, hs_t1, type_t1, unit_t1, his_sql[b],
+                    q_know[b], t_know[b], schema_link[b])
+
+            l_n.append(i_nlu1[1] - i_nlu1[0])
+            start_index.append(start_idx)
+            column_index.append(column_start)
+            col_dict_list.append(col_dict)
+            tokens.append(tokens1)
+            orders.append(orders1)
+            types.append(types1)
+            segments.append(segment1)
+            matchs.append(match1)
+            i_nlu.append(i_nlu1)
+            i_hds.append(i_hds_1)
+            schema_link_matrix_list.append(schema_link_matrix)
+            schema_link_mask_list.append(schema_link_mask)
+            header_flatten_token_list.append(header_flatten_tokens)
+            header_flatten_index_list.append(header_flatten_index)
+            header_list.append(schema_tok)
+            header_max = max([len(schema_tok1) for schema_tok1 in schema_tok])
+            if header_max > header_tok_max_len:
+                header_tok_max_len = header_max
+
+            if len(tokens1) > cur_max_length:
+                cur_max_length = len(tokens1)
+
+            if len(tokens1) > 512:
+                print('input too long!!! total_num:%d\t question:%s' %
+                      (len(tokens1), ''.join(nlu_t1)))
+
+        assert cur_max_length <= 512
+
+        for i, tokens1 in enumerate(tokens):
+            segment_ids1 = segments[i]
+            order_ids1 = orders[i]
+            type_ids1 = types[i]
+            match_ids1 = matchs[i]
+            input_ids1 = tokenizer.convert_tokens_to_ids(tokens1)
+            input_mask1 = [1] * len(input_ids1)
+
+            while len(input_ids1) < cur_max_length:
+                input_ids1.append(0)
+                input_mask1.append(0)
+                segment_ids1.append(0)
+                order_ids1.append(0)
+                type_ids1.append(0)
+                match_ids1.append(0)
+
+            if len(input_ids1) != cur_max_length:
+                print('Error: ', nlu_t1, tokens1, len(input_ids1),
+                      cur_max_length)
+
+            assert len(input_ids1) == cur_max_length
+            assert len(input_mask1) == cur_max_length
+            assert len(order_ids1) == cur_max_length
+            assert len(segment_ids1) == cur_max_length
+            assert len(match_ids1) == cur_max_length
+            assert len(type_ids1) == cur_max_length
+
+            input_ids.append(input_ids1)
+            order_ids.append(order_ids1)
+            type_ids.append(type_ids1)
+            segment_ids.append(segment_ids1)
+            input_mask.append(input_mask1)
+            match_ids.append(match_ids1)
+
+        header_len = []
+        header_ids = []
+        header_max_len = max(
+            [len(header_list1) for header_list1 in header_list])
+        for header1 in header_list:
+            header_len1 = []
+            header_ids1 = []
+            for header_tok in header1:
+                header_len1.append(len(header_tok))
+                header_tok_ids1 = tokenizer.convert_tokens_to_ids(header_tok)
+                while len(header_tok_ids1) < header_tok_max_len:
+                    header_tok_ids1.append(0)
+                header_ids1.append(header_tok_ids1)
+            while len(header_ids1) < header_max_len:
+                header_ids1.append([0] * header_tok_max_len)
+            header_len.append(header_len1)
+            header_ids.append(header_ids1)
+
+        for i, header_flatten_token in enumerate(header_flatten_token_list):
+            header_flatten_tokenid = tokenizer.convert_tokens_to_ids(
+                header_flatten_token)
+            header_flatten_tokenid_list.append(header_flatten_tokenid)
+
+        # Convert to tensor
+        all_input_ids = torch.tensor(
+            input_ids, dtype=torch.long).to(self._device_name)
+        all_order_ids = torch.tensor(
+            order_ids, dtype=torch.long).to(self._device_name)
+        all_type_ids = torch.tensor(
+            type_ids, dtype=torch.long).to(self._device_name)
+        all_input_mask = torch.tensor(
+            input_mask, dtype=torch.long).to(self._device_name)
+        all_segment_ids = torch.tensor(
+            segment_ids, dtype=torch.long).to(self._device_name)
+        all_match_ids = torch.tensor(
+            match_ids, dtype=torch.long).to(self._device_name)
+        all_header_ids = torch.tensor(
+            header_ids, dtype=torch.long).to(self._device_name)
+        all_ids = torch.arange(
+            all_input_ids.shape[0], dtype=torch.long).to(self._device_name)
+
+        bS = len(header_flatten_tokenid_list)
+        max_header_flatten_token_length = max(
+            [len(x) for x in header_flatten_tokenid_list])
+        all_header_flatten_tokens = numpy.zeros(
+            (bS, max_header_flatten_token_length), dtype='int32')
+        all_header_flatten_index = numpy.zeros(
+            (bS, max_header_flatten_token_length), dtype='int32')
+        for i, header_flatten_tokenid in enumerate(
+                header_flatten_tokenid_list):
+            for j, tokenid in enumerate(header_flatten_tokenid):
+                all_header_flatten_tokens[i, j] = tokenid
+            for j, hdindex in enumerate(header_flatten_index_list[i]):
+                all_header_flatten_index[i, j] = hdindex
+        all_header_flatten_output = numpy.zeros((bS, header_max_len + 1),
+                                                dtype='int32')
+        all_header_flatten_tokens = torch.tensor(
+            all_header_flatten_tokens, dtype=torch.long).to(self._device_name)
+        all_header_flatten_index = torch.tensor(
+            all_header_flatten_index, dtype=torch.long).to(self._device_name)
+        all_header_flatten_output = torch.tensor(
+            all_header_flatten_output,
+            dtype=torch.float32).to(self._device_name)
+
+        all_token_column_id = numpy.zeros((bS, cur_max_length), dtype='int32')
+        all_token_column_mask = numpy.zeros((bS, cur_max_length),
+                                            dtype='float32')
+        for bi, col_dict in enumerate(col_dict_list):
+            for ki, vi in col_dict.items():
+                all_token_column_id[bi, ki] = vi + 1
+                all_token_column_mask[bi, ki] = 1.0
+        all_token_column_id = torch.tensor(
+            all_token_column_id, dtype=torch.long).to(self._device_name)
+        all_token_column_mask = torch.tensor(
+            all_token_column_mask, dtype=torch.float32).to(self._device_name)
+
+        all_schema_link_matrix = numpy.zeros(
+            (bS, cur_max_length, cur_max_length), dtype='int32')
+        all_schema_link_mask = numpy.zeros(
+            (bS, cur_max_length, cur_max_length), dtype='float32')
+        for i, schema_link_matrix in enumerate(schema_link_matrix_list):
+            temp_len = schema_link_matrix.shape[0]
+            all_schema_link_matrix[i, 0:temp_len,
+                                   0:temp_len] = schema_link_matrix
+            all_schema_link_mask[i, 0:temp_len,
+                                 0:temp_len] = schema_link_mask_list[i]
+        all_schema_link_matrix = torch.tensor(
+            all_schema_link_matrix, dtype=torch.long).to(self._device_name)
+        all_schema_link_mask = torch.tensor(
+            all_schema_link_mask, dtype=torch.long).to(self._device_name)
+
+        # Generate l_hpu (header span lengths) from i_hds
+        l_hpu = self.gen_l_hpu(i_hds)
+
+        # Generate BERT output.
+        all_encoder_layer, pooled_output = model_bert(
+            all_input_ids,
+            all_header_ids,
+            token_order_ids=all_order_ids,
+            token_type_ids=all_segment_ids,
+            attention_mask=all_input_mask,
+            match_type_ids=all_match_ids,
+            l_hs=l_hs,
+            header_len=header_len,
+            type_ids=all_type_ids,
+            col_dict_list=col_dict_list,
+            ids=all_ids,
+            header_flatten_tokens=all_header_flatten_tokens,
+            header_flatten_index=all_header_flatten_index,
+            header_flatten_output=all_header_flatten_output,
+            token_column_id=all_token_column_id,
+            token_column_mask=all_token_column_mask,
+            column_start_index=column_index,
+            headers_length=l_hs,
+            all_schema_link_matrix=all_schema_link_matrix,
+            all_schema_link_mask=all_schema_link_mask,
+            output_all_encoded_layers=False)
+
+        return all_encoder_layer, pooled_output, tokens, i_nlu, i_hds, \
+            l_n, l_hpu, l_hs, start_index, column_index, all_ids
+
+    def predict(self, querys):
+        self.head_model.eval()
+        self.backbone_model.eval()
+
+        nlu, nlu_t, sql_i, q_know, t_know, tb, hs_t, types, units, his_sql, schema_link = \
+            self.get_fields_info(querys, None, train=False)
+
+        with torch.no_grad():
+            all_encoder_layer, _, tokens, i_nlu, i_hds, l_n, l_hpu, l_hs, start_index, column_index, ids = \
+                self.get_bert_output(
+                    self.backbone_model, self.tokenizer,
+                    nlu_t, hs_t, types, units, his_sql, q_know, t_know, schema_link)
+
+            s_action, s_sc, s_sa, s_cco, s_wc, s_wo, s_wvs, s_len = self.head_model(
+                all_encoder_layer, l_n, l_hs, start_index, column_index,
+                tokens, ids)
+        # Per-head class choice; softmax does not change the argmax.
+        action_batch = torch.argmax(F.softmax(s_action, -1), -1).cpu().tolist()
+        scco_batch = torch.argmax(F.softmax(s_cco, -1), -1).cpu().tolist()
+        sc_batch = torch.argmax(F.softmax(s_sc, -1), -1).cpu().tolist()
+        sa_batch = torch.argmax(F.softmax(s_sa, -1), -1).cpu().tolist()
+        wc_batch = torch.argmax(F.softmax(s_wc, -1), -1).cpu().tolist()
+        wo_batch = torch.argmax(F.softmax(s_wo, -1), -1).cpu().tolist()
+        s_wvs_s, s_wvs_e = s_wvs
+        wvss_batch = torch.argmax(F.softmax(s_wvs_s, -1), -1).cpu().tolist()
+        wvse_batch = torch.argmax(F.softmax(s_wvs_e, -1), -1).cpu().tolist()
+        s_slen, s_wlen = s_len
+        slen_batch = torch.argmax(F.softmax(s_slen, -1), -1).cpu().tolist()
+        wlen_batch = torch.argmax(F.softmax(s_wlen, -1), -1).cpu().tolist()
+        # Predicted value spans: start/end shifted down by one, floored at 0.
+        pr_wvi = []
+        for i in range(len(querys)):
+            wvi = []
+            for j in range(wlen_batch[i]):
+                wvi.append([
+                    max(0, wvss_batch[i][j] - 1),
+                    max(0, wvse_batch[i][j] - 1)
+                ])
+            pr_wvi.append(wvi)
+        pr_wvi_str = self.convert_string(pr_wvi, nlu, nlu_t)
+
+        pre_results = []
+        for ib in range(len(querys)):
+            res_one = {}
+            sql = {}
+            sql['cond_conn_op'] = scco_batch[ib]
+            sl = slen_batch[ib]
+            sql['sel'] = list(
+                numpy.array(sc_batch[ib][:sl]).astype(numpy.int32) - 1)
+            sql['agg'] = list(
+                numpy.array(sa_batch[ib][:sl]).astype(numpy.int32))
+            sels = []
+            aggs = []
+            for ia, sel in enumerate(sql['sel']):
+                if sel == -1:
+                    if sql['agg'][ia] > 0:
+                        sels.append(l_hs[ib] - 1)
+                        aggs.append(sql['agg'][ia])
+                    continue
+                sels.append(sel)
+                if sql['agg'][ia] == -1:
+                    aggs.append(0)
+                else:
+                    aggs.append(sql['agg'][ia])
+            if len(sels) == 0:
+                sels.append(l_hs[ib] - 1)
+                aggs.append(0)
+            assert len(sels) == len(aggs)
+            sql['sel'] = sels
+            sql['agg'] = aggs
+
+            conds = []
+            wl = wlen_batch[ib]
+            wc_os = list(
+                numpy.array(wc_batch[ib][:wl]).astype(numpy.int32) - 1)
+            wo_os = list(numpy.array(wo_batch[ib][:wl]).astype(numpy.int32))
+            res_one['question_tok'] = querys[ib]['question_tok']
+            for i in range(wl):
+                if wc_os[i] == -1:
+                    continue
+                conds.append([wc_os[i], wo_os[i], pr_wvi_str[ib][i]])
+            if len(conds) == 0:
+                conds.append([l_hs[ib] - 1, 2, 'Nulll'])
+            sql['conds'] = conds
+            res_one['question'] = querys[ib]['question']
+            res_one['table_id'] = querys[ib]['table_id']
+            res_one['sql'] = sql
+            res_one['action'] = action_batch[ib]
+            res_one['model_out'] = [
+                sc_batch[ib], sa_batch[ib], wc_batch[ib], wo_batch[ib],
+                wvss_batch[ib], wvse_batch[ib]
+            ]
+            pre_results.append(res_one)
+
+        return pre_results
+
+    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        """Predict the SQL for the first sample of `input['datas']`.
+
+        Args:
+            input (Dict[str, Tensor]): the preprocessed data; `datas` holds
+                the sample dicts consumed by `predict`.
+        Returns:
+            Dict[str, Tensor]: `result` is the first prediction, `history_sql`
+                the sample's previous SQL (None when the sample has none).
+        """
+        result = self.predict(input['datas'])[0]
+
+        return {
+            'result': result,
+            'history_sql': input['datas'][0].get('history_sql')
+        }
diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py
index e69de29b..90f22aa1 100644
--- a/modelscope/models/nlp/task_models/__init__.py
+++ b/modelscope/models/nlp/task_models/__init__.py
@@ -0,0 +1,32 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .information_extraction import InformationExtractionModel
+    from .feature_extraction import FeatureExtractionModel
+    from .fill_mask import FillMaskModel
+    from .sequence_classification import SequenceClassificationModel
+    from .task_model import SingleBackboneTaskModelBase
+    from .token_classification import TokenClassificationModel
+    # The imports above run only under static type checkers.
+else:
+    _import_structure = {
+        'information_extraction': ['InformationExtractionModel'],
+        'feature_extraction': ['FeatureExtractionModel'],
+        'fill_mask': ['FillMaskModel'],
+        'sequence_classification': ['SequenceClassificationModel'],
+        'task_model': ['SingleBackboneTaskModelBase'],
+        'token_classification': ['TokenClassificationModel'],
+    }
+    # Swap this package in sys.modules for a LazyImportModule over the map.
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/nlp/task_models/feature_extraction.py b/modelscope/models/nlp/task_models/feature_extraction.py
new file mode 100644
index 00000000..069c37aa
--- /dev/null
+++ b/modelscope/models/nlp/task_models/feature_extraction.py
@@ -0,0 +1,43 @@
+from typing import Any, Dict
+
+import numpy as np
+
+from modelscope.metainfo import TaskModels
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.bert import BertConfig
+from modelscope.models.nlp.task_models.task_model import \
+    SingleBackboneTaskModelBase
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import parse_label_mapping
+
+__all__ = ['FeatureExtractionModel']
+
+
+@MODELS.register_module(
+    Tasks.feature_extraction, module_name=TaskModels.feature_extraction)
+class FeatureExtractionModel(SingleBackboneTaskModelBase):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the feature extraction model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        if 'base_model_prefix' in kwargs:
+            self._base_model_prefix = kwargs['base_model_prefix']
+
+        self.build_backbone(self.backbone_cfg)
+
+    def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+
+        # labels are set aside for the backbone call and put back afterwards
+        labels = input.pop(OutputKeys.LABELS, None)
+
+        outputs = super().forward(input)
+        sequence_output, pooled_output = self.extract_backbone_outputs(outputs)
+        if labels is not None:
+            input[OutputKeys.LABELS] = labels
+
+        return {OutputKeys.TEXT_EMBEDDING: sequence_output}
diff --git a/modelscope/models/nlp/task_models/fill_mask.py b/modelscope/models/nlp/task_models/fill_mask.py
new file mode 100644
index 00000000..f7ef1cc2
--- /dev/null
+++ b/modelscope/models/nlp/task_models/fill_mask.py
@@ -0,0 +1,47 @@
+from typing import Any, Dict
+
+import numpy as np
+
+from modelscope.metainfo import TaskModels
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.bert import BertConfig
+from modelscope.models.nlp.task_models.task_model import \
+    SingleBackboneTaskModelBase
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import parse_label_mapping
+
+__all__ = ['FillMaskModel']
+
+
+@MODELS.register_module(Tasks.fill_mask, module_name=TaskModels.fill_mask)
+class FillMaskModel(SingleBackboneTaskModelBase):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """Initialize the fill mask model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        if 'base_model_prefix' in kwargs:
+            self._base_model_prefix = kwargs['base_model_prefix']
+
+        self.build_backbone(self.backbone_cfg)
+        self.build_head(self.head_cfg)
+
+    def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+        """Run backbone and head; add loss if labels given; echo INPUT_IDS."""
+        # labels are not passed to the backbone; only the loss step uses them
+        labels = input.pop(OutputKeys.LABELS, None)
+
+        outputs = super().forward(input)
+        sequence_output, pooled_output = self.extract_backbone_outputs(outputs)
+        outputs = self.head.forward(sequence_output)
+
+        if labels is not None:
+            input[OutputKeys.LABELS] = labels
+            loss = self.compute_loss(outputs, labels)
+            outputs.update(loss)
+        outputs[OutputKeys.INPUT_IDS] = input[OutputKeys.INPUT_IDS]
+        return outputs
diff --git a/modelscope/models/nlp/task_models/information_extraction.py b/modelscope/models/nlp/task_models/information_extraction.py
new file mode 100644
index 00000000..0a7d5a47
--- /dev/null
+++ b/modelscope/models/nlp/task_models/information_extraction.py
@@ -0,0 +1,37 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict
+
+import numpy as np
+
+from modelscope.metainfo import TaskModels
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.task_models.task_model import \
+    SingleBackboneTaskModelBase
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+
+__all__ = ['InformationExtractionModel']
+
+
+@MODELS.register_module(
+    Tasks.information_extraction,
+    module_name=TaskModels.information_extraction)
+class InformationExtractionModel(SingleBackboneTaskModelBase):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """Initialize the information extraction model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+
+        self.build_backbone(self.backbone_cfg)
+        self.build_head(self.head_cfg)
+
+    def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+        outputs = super().forward(input)  # whole kwargs dict goes to the base
+        sequence_output, pooled_output = self.extract_backbone_outputs(outputs)
+        outputs = self.head.forward(sequence_output, input['text'],
+                                    input['offsets'])  # both keys required
+        return {OutputKeys.SPO_LIST: outputs}  # head result under SPO_LIST
diff --git a/modelscope/models/nlp/task_models/sequence_classification.py b/modelscope/models/nlp/task_models/sequence_classification.py
index 988f2917..1f5e46c3 100644
--- a/modelscope/models/nlp/task_models/sequence_classification.py
+++ b/modelscope/models/nlp/task_models/sequence_classification.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os
 from typing import Any, Dict
 
@@ -10,10 +11,14 @@ from modelscope.models.nlp.task_models.task_model import \
     SingleBackboneTaskModelBase
 from modelscope.outputs import OutputKeys
 from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import parse_label_mapping
 
 __all__ = ['SequenceClassificationModel']
 
 
+@MODELS.register_module(
+    Tasks.sentence_similarity, module_name=TaskModels.text_classification)
+@MODELS.register_module(Tasks.nli, module_name=TaskModels.text_classification)
 @MODELS.register_module(
     Tasks.sentiment_classification, module_name=TaskModels.text_classification)
 @MODELS.register_module(
@@ -30,49 +35,36 @@ class SequenceClassificationModel(SingleBackboneTaskModelBase):
         if 'base_model_prefix' in kwargs:
             self._base_model_prefix = kwargs['base_model_prefix']
 
-        backbone_cfg = self.cfg.backbone
-        head_cfg = self.cfg.head
-
         # get the num_labels from label_mapping.json
         self.id2label = {}
-        self.label_path = os.path.join(model_dir, 'label_mapping.json')
-        if os.path.exists(self.label_path):
-            with open(self.label_path) as f:
-                self.label_mapping = json.load(f)
-            self.id2label = {
-                idx: name
-                for name, idx in self.label_mapping.items()
-            }
-        head_cfg['num_labels'] = len(self.label_mapping)
+        # explicit num_labels wins; else use the label mapping (may be None)
+        num_labels = kwargs.get('num_labels')
+        if num_labels is None:
+            label2id = parse_label_mapping(model_dir) or {}
+            if len(label2id) > 0:
+                num_labels = len(label2id)
+            self.id2label = {id: label for label, id in label2id.items()}
+        self.head_cfg['num_labels'] = num_labels
 
-        self.build_backbone(backbone_cfg)
-        self.build_head(head_cfg)
+        self.build_backbone(self.backbone_cfg)
+        self.build_head(self.head_cfg)
+
+    def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+        # backbone do not need labels, only head need for loss compute
+        labels = input.pop(OutputKeys.LABELS, None)
 
-    def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
         outputs = super().forward(input)
         sequence_output, pooled_output = self.extract_backbone_outputs(outputs)
         outputs = self.head.forward(pooled_output)
-        if 'labels' in input:
-            loss = self.compute_loss(outputs, input['labels'])
+        if labels is not None:
+            input[OutputKeys.LABELS] = labels
+            loss = self.compute_loss(outputs, labels)
             outputs.update(loss)
         return outputs
 
     def extract_logits(self, outputs):
         return outputs[OutputKeys.LOGITS].cpu().detach()
 
-    def extract_backbone_outputs(self, outputs):
-        sequence_output = None
-        pooled_output = None
-        if hasattr(self.backbone, 'extract_sequence_outputs'):
-            sequence_output = self.backbone.extract_sequence_outputs(outputs)
-        if hasattr(self.backbone, 'extract_pooled_outputs'):
-            pooled_output = self.backbone.extract_pooled_outputs(outputs)
-        return sequence_output, pooled_output
-
-    def compute_loss(self, outputs, labels):
-        loss = self.head.compute_loss(outputs, labels)
-        return loss
-
     def postprocess(self, input, **kwargs):
         logits = self.extract_logits(input)
         probs = logits.softmax(-1).numpy()
diff --git a/modelscope/models/nlp/task_models/task_model.py b/modelscope/models/nlp/task_models/task_model.py
index 104b4c32..0b43044f 100644
--- a/modelscope/models/nlp/task_models/task_model.py
+++ b/modelscope/models/nlp/task_models/task_model.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path
 import re
 from abc import ABC
@@ -73,7 +74,7 @@ class BaseTaskModel(TorchModel, ABC):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         super().__init__(model_dir, *args, **kwargs)
-        self.cfg = ConfigDict(kwargs)
+        self.config = ConfigDict(kwargs)
 
     def __repr__(self):
         # only log backbone and head name
@@ -396,6 +397,9 @@ class SingleBackboneTaskModelBase(BaseTaskModel):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         super().__init__(model_dir, *args, **kwargs)
+        self.backbone_cfg = self.config.get('backbone', None)
+        assert self.backbone_cfg is not None
+        self.head_cfg = self.config.get('head', None)
 
     def build_backbone(self, cfg):
         if 'prefix' in cfg:
@@ -404,9 +408,13 @@ class SingleBackboneTaskModelBase(BaseTaskModel):
         setattr(self, cfg['prefix'], backbone)
 
     def build_head(self, cfg):
+        if cfg is None:
+            raise ValueError(
+                'Head config is missing, check if this was a backbone-only model'
+            )
         if 'prefix' in cfg:
             self._head_prefix = cfg['prefix']
-        head = build_head(cfg)
+        head = build_head(cfg, group_key=self.group_key)
         setattr(self, self._head_prefix, head)
         return head
 
@@ -430,8 +438,18 @@ class SingleBackboneTaskModelBase(BaseTaskModel):
             outputs = self.backbone.forward(**input)
         return outputs
 
-    def compute_loss(self, outputs: Dict[str, Any], labels):
-        raise NotImplementedError()
+    def compute_loss(self, outputs, labels):
+        """Compute the loss by delegating to the head's `compute_loss`."""
+        return self.head.compute_loss(outputs, labels)
+
+    def extract_backbone_outputs(self, outputs):
+        """Return (sequence_output, pooled_output); None if unsupported."""
+        seq_out, pool_out = None, None
+        if hasattr(self.backbone, 'extract_sequence_outputs'):
+            seq_out = self.backbone.extract_sequence_outputs(outputs)
+        if hasattr(self.backbone, 'extract_pooled_outputs'):
+            pool_out = self.backbone.extract_pooled_outputs(outputs)
+        return seq_out, pool_out
 
 
 class EncoderDecoderTaskModelBase(BaseTaskModel):
@@ -452,7 +470,7 @@ class EncoderDecoderTaskModelBase(BaseTaskModel):
 
     def build_encoder(self):
         encoder = build_backbone(
-            self.cfg,
+            self.config,
             type_name=self._encoder_key_in_cfg,
             task_name=Tasks.backbone)
         setattr(self, self._encoder_prefix, encoder)
@@ -460,7 +478,7 @@ class EncoderDecoderTaskModelBase(BaseTaskModel):
 
     def build_decoder(self):
         decoder = build_backbone(
-            self.cfg,
+            self.config,
             type_name=self._decoder_key_in_cfg,
             task_name=Tasks.backbone)
         setattr(self, self._decoder_prefix, decoder)
diff --git a/modelscope/models/nlp/task_models/token_classification.py b/modelscope/models/nlp/task_models/token_classification.py
new file mode 100644
index 00000000..f3930182
--- /dev/null
+++ b/modelscope/models/nlp/task_models/token_classification.py
@@ -0,0 +1,77 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict
+
+import numpy as np
+import torch
+
+from modelscope.metainfo import TaskModels
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.task_models.task_model import \
+    SingleBackboneTaskModelBase
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import parse_label_mapping
+from modelscope.utils.tensor_utils import (torch_nested_detach,
+                                           torch_nested_numpify)
+
+__all__ = ['TokenClassificationModel']
+
+
+@MODELS.register_module(
+    Tasks.token_classification, module_name=TaskModels.token_classification)
+class TokenClassificationModel(SingleBackboneTaskModelBase):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """Initialize the token classification model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        if 'base_model_prefix' in kwargs:
+            self._base_model_prefix = kwargs['base_model_prefix']
+
+        # explicit num_labels wins; else use the label mapping (may be None)
+        num_labels = kwargs.get('num_labels')
+        self.id2label = {}
+        if num_labels is None:
+            label2id = parse_label_mapping(model_dir) or {}
+            num_labels = len(label2id) or None  # stays None when empty
+            self.id2label = {id: label for label, id in label2id.items()}
+        self.head_cfg['num_labels'] = num_labels
+
+        self.build_backbone(self.backbone_cfg)
+        self.build_head(self.head_cfg)
+
+    def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+        """Run backbone and head; add the loss when labels were passed."""
+        labels = None
+        if OutputKeys.LABEL in input:
+            labels = input.pop(OutputKeys.LABEL)
+        elif OutputKeys.LABELS in input:
+            labels = input.pop(OutputKeys.LABELS)
+
+        outputs = super().forward(input)
+        sequence_output, pooled_output = self.extract_backbone_outputs(outputs)
+        outputs = self.head.forward(sequence_output)
+        if labels is not None:  # labels were popped, so test the value
+            loss = self.compute_loss(outputs, labels)
+            outputs.update(loss)
+        return outputs
+
+    def extract_logits(self, outputs):
+        return outputs[OutputKeys.LOGITS].cpu().detach()
+
+    def extract_backbone_outputs(self, outputs):
+        sequence_output = None
+        pooled_output = None  # never extracted here; always None
+        if hasattr(self.backbone, 'extract_sequence_outputs'):
+            sequence_output = self.backbone.extract_sequence_outputs(outputs)
+        return sequence_output, pooled_output
+
+    def postprocess(self, input, **kwargs):
+        logits = self.extract_logits(input)
+        pred = torch.argmax(logits[0], dim=-1)  # first batch item only
+        pred = torch_nested_numpify(torch_nested_detach(pred))
+        logits = torch_nested_numpify(torch_nested_detach(logits))
+        return {OutputKeys.PREDICTIONS: pred, OutputKeys.LOGITS: logits}
diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py
index 59d7d0cf..c63e8037 100644
--- a/modelscope/models/nlp/token_classification.py
+++ b/modelscope/models/nlp/token_classification.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from abc import abstractmethod
 from typing import Dict
 
@@ -8,12 +10,13 @@ from torch import nn
 from modelscope.metainfo import Models
 from modelscope.models.base import TorchModel
 from modelscope.models.builder import MODELS
+from modelscope.models.nlp.bert import BertPreTrainedModel
+from modelscope.models.nlp.structbert import SbertPreTrainedModel
 from modelscope.outputs import OutputKeys
 from modelscope.utils.constant import Tasks
 from modelscope.utils.hub import parse_label_mapping
 from modelscope.utils.tensor_utils import (torch_nested_detach,
                                            torch_nested_numpify)
-from .structbert import SbertPreTrainedModel
 
 __all__ = ['SbertForTokenClassification']
 
@@ -91,6 +94,7 @@ class TokenClassification(TorchModel):
 
 
 @MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert)
+@MODELS.register_module(Tasks.part_of_speech, module_name=Models.structbert)
 @MODELS.register_module(
     Tasks.token_classification, module_name=Models.structbert)
 class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel):
@@ -168,3 +172,49 @@ class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel):
                          pretrained_model_name_or_path=kwargs.get('model_dir'),
                          model_dir=kwargs.get('model_dir'),
                          **model_args)
+
+
+@MODELS.register_module(Tasks.word_segmentation, module_name=Models.bert)
+@MODELS.register_module(Tasks.token_classification, module_name=Models.bert)
+class BertForSequenceClassification(TokenClassification, BertPreTrainedModel):
+    """Bert token classification model (inherits from TokenClassification).
+
+        NOTE(review): name says "SequenceClassification" -- confirm/rename.
+    """
+    base_model_prefix: str = 'bert'
+    supports_gradient_checkpointing = True
+    _keys_to_ignore_on_load_missing = [r'position_ids']
+
+    def __init__(self, config, model_dir):
+        if hasattr(config, 'base_model_prefix'):  # sets the class-level prefix
+            BertForSequenceClassification.base_model_prefix = config.base_model_prefix
+        super().__init__(config, model_dir)
+
+    def build_base_model(self):
+        from .bert import BertModel
+        return BertModel(self.config, add_pooling_layer=True)
+
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,
+                inputs_embeds=None,
+                labels=None,
+                output_attentions=None,
+                output_hidden_states=None,
+                return_dict=None,
+                **kwargs):
+        return super().forward(  # pure pass-through to the parent forward
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            **kwargs)
diff --git a/modelscope/msdatasets/__init__.py b/modelscope/msdatasets/__init__.py
index 8e0647bb..073f9396 100644
--- a/modelscope/msdatasets/__init__.py
+++ b/modelscope/msdatasets/__init__.py
@@ -1 +1,3 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from . import cv
 from .ms_dataset import MsDataset
diff --git a/modelscope/msdatasets/cv/__init__.py b/modelscope/msdatasets/cv/__init__.py
new file mode 100644
index 00000000..fad91bcf
--- /dev/null
+++ b/modelscope/msdatasets/cv/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from . import (image_classification, image_semantic_segmentation,
+               object_detection)
diff --git a/modelscope/msdatasets/cv/easycv_base.py b/modelscope/msdatasets/cv/easycv_base.py
new file mode 100644
index 00000000..a45827a3
--- /dev/null
+++ b/modelscope/msdatasets/cv/easycv_base.py
@@ -0,0 +1,36 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+
+
+class EasyCVBaseDataset(object):
+    """Adapt to MSDataset.
+
+    Args:
+        split_config (dict): Dataset root path from MSDataset, e.g.
+            {"train":"local cache path"} or {"evaluation":"local cache path"}.
+        preprocessor (Preprocessor): Optional preprocessor; must fit the model. Not supported yet.
+        mode: Training or Evaluation.
+        kwargs (dict): kwargs['data_source'] is rewritten in place when split_config is set.
+    """
+    DATA_ROOT_PATTERN = '${data_root}'
+
+    def __init__(self,
+                 split_config=None,
+                 preprocessor=None,
+                 mode=None,
+                 args=(),
+                 kwargs={}) -> None:
+        self.split_config = split_config
+        self.preprocessor = preprocessor
+        self.mode = mode
+        if self.split_config is not None:
+            self._update_data_source(kwargs['data_source'])
+
+    def _update_data_source(self, data_source):
+        data_root = next(iter(self.split_config.values()))  # first value only
+        data_root = data_root.rstrip(osp.sep)
+        # substitute DATA_ROOT_PATTERN inside every string value, in place
+        for k, v in data_source.items():
+            if isinstance(v, str) and self.DATA_ROOT_PATTERN in v:
+                data_source.update(
+                    {k: v.replace(self.DATA_ROOT_PATTERN, data_root)})
diff --git a/modelscope/msdatasets/cv/face_2d_keypoins/__init__.py b/modelscope/msdatasets/cv/face_2d_keypoins/__init__.py
new file mode 100644
index 00000000..e9d76b7e
--- /dev/null
+++ b/modelscope/msdatasets/cv/face_2d_keypoins/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .face_2d_keypoints_dataset import FaceKeypointDataset
+
+else:
+    _import_structure = {'face_2d_keypoints_dataset': ['FaceKeypointDataset']}
+
+    import sys
+    # replace this module's sys.modules entry with a LazyImportModule
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py b/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py
new file mode 100644
index 00000000..2f2e03ef
--- /dev/null
+++ b/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py
@@ -0,0 +1,37 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.datasets.face import FaceKeypointDataset as _FaceKeypointDataset
+
+from modelscope.metainfo import Datasets
+from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset
+from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
+from modelscope.utils.constant import Tasks
+
+
+@TASK_DATASETS.register_module(
+    group_key=Tasks.face_2d_keypoints,
+    module_name=Datasets.Face2dKeypointsDataset)
+class FaceKeypointDataset(EasyCVBaseDataset, _FaceKeypointDataset):
+    """EasyCV dataset for face 2d keypoints.
+
+    Args:
+        split_config (dict): Dataset root path from MSDataset, e.g.
+            {"train":"local cache path"} or {"evaluation":"local cache path"}.
+        preprocessor (Preprocessor): An optional preprocessor instance; make sure it fits
+            the model if supplied. Not supported yet.
+        mode: Training or Evaluation.
+    """
+
+    def __init__(self,
+                 split_config=None,
+                 preprocessor=None,
+                 mode=None,
+                 *args,
+                 **kwargs) -> None:
+        EasyCVBaseDataset.__init__(
+            self,
+            split_config=split_config,
+            preprocessor=preprocessor,
+            mode=mode,
+            args=args,
+            kwargs=kwargs)  # may rewrite kwargs['data_source'] in place
+        _FaceKeypointDataset.__init__(self, *args, **kwargs)
diff --git a/modelscope/msdatasets/cv/image_classification/__init__.py b/modelscope/msdatasets/cv/image_classification/__init__.py
new file mode 100644
index 00000000..95e8d7a1
--- /dev/null
+++ b/modelscope/msdatasets/cv/image_classification/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .classification_dataset import ClsDataset
+
+else:
+    _import_structure = {'classification_dataset': ['ClsDataset']}
+
+    import sys
+    # replace this module's sys.modules entry with a LazyImportModule
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/msdatasets/cv/image_classification/classification_dataset.py b/modelscope/msdatasets/cv/image_classification/classification_dataset.py
new file mode 100644
index 00000000..ba73e472
--- /dev/null
+++ b/modelscope/msdatasets/cv/image_classification/classification_dataset.py
@@ -0,0 +1,36 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.datasets.classification import ClsDataset as _ClsDataset
+
+from modelscope.metainfo import Datasets
+from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset
+from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
+from modelscope.utils.constant import Tasks
+
+
+@TASK_DATASETS.register_module(
+    group_key=Tasks.image_classification, module_name=Datasets.ClsDataset)
+class ClsDataset(EasyCVBaseDataset, _ClsDataset):
+    """EasyCV dataset for classification.
+
+    Args:
+        split_config (dict): Dataset root path from MSDataset, e.g.
+            {"train":"local cache path"} or {"evaluation":"local cache path"}.
+        preprocessor (Preprocessor): An optional preprocessor instance; make sure it fits
+            the model if supplied. Not supported yet.
+        mode: Training or Evaluation.
+    """
+
+    def __init__(self,
+                 split_config=None,
+                 preprocessor=None,
+                 mode=None,
+                 *args,
+                 **kwargs) -> None:
+        EasyCVBaseDataset.__init__(  # needs the mixin base declared above
+            self,
+            split_config=split_config,
+            preprocessor=preprocessor,
+            mode=mode,
+            args=args,
+            kwargs=kwargs)  # may rewrite kwargs['data_source'] in place
+        _ClsDataset.__init__(self, *args, **kwargs)
diff --git a/modelscope/msdatasets/cv/image_semantic_segmentation/__init__.py b/modelscope/msdatasets/cv/image_semantic_segmentation/__init__.py
new file mode 100644
index 00000000..26121bdb
--- /dev/null
+++ b/modelscope/msdatasets/cv/image_semantic_segmentation/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .segmentation_dataset import SegDataset
+
+else:
+    _import_structure = {'segmentation_dataset': ['SegDataset']}
+
+    import sys
+    # replace this module's sys.modules entry with a LazyImportModule
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py b/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py
new file mode 100644
index 00000000..b1316e2e
--- /dev/null
+++ b/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py
@@ -0,0 +1,42 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.datasets.segmentation import SegDataset as _SegDataset
+
+from modelscope.metainfo import Datasets
+from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset
+from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
+from modelscope.utils.constant import Tasks
+
+
+@TASK_DATASETS.register_module(
+    group_key=Tasks.image_segmentation, module_name=Datasets.SegDataset)
+class SegDataset(EasyCVBaseDataset, _SegDataset):
+    """EasyCV dataset for semantic segmentation.
+    For more details, please refer to :
+    https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/segmentation/raw.py .
+
+    Args:
+        split_config (dict): Dataset root path from MSDataset, e.g.
+            {"train":"local cache path"} or {"evaluation":"local cache path"}.
+        preprocessor (Preprocessor): An optional preprocessor instance; make sure it fits
+            the model if supplied. Not supported yet.
+        mode: Training or Evaluation.
+        data_source: Data source config to parse input data.
+        pipeline: Sequence of transform object or config dict to be composed.
+        ignore_index (int): Label index to be ignored.
+        profiling: If set True, will print transform time.
+    """
+
+    def __init__(self,
+                 split_config=None,
+                 preprocessor=None,
+                 mode=None,
+                 *args,
+                 **kwargs) -> None:
+        EasyCVBaseDataset.__init__(
+            self,
+            split_config=split_config,
+            preprocessor=preprocessor,
+            mode=mode,
+            args=args,
+            kwargs=kwargs)  # may rewrite kwargs['data_source'] in place
+        _SegDataset.__init__(self, *args, **kwargs)
diff --git a/modelscope/msdatasets/cv/object_detection/__init__.py b/modelscope/msdatasets/cv/object_detection/__init__.py
new file mode 100644
index 00000000..30af2d9b
--- /dev/null
+++ b/modelscope/msdatasets/cv/object_detection/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .detection_dataset import DetDataset, DetImagesMixDataset
+
+else:
+    _import_structure = {
+        'detection_dataset': ['DetDataset', 'DetImagesMixDataset']
+    }
+
+    import sys
+    # replace this module's sys.modules entry with a LazyImportModule
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/msdatasets/cv/object_detection/detection_dataset.py b/modelscope/msdatasets/cv/object_detection/detection_dataset.py
new file mode 100644
index 00000000..2f6ad7d3
--- /dev/null
+++ b/modelscope/msdatasets/cv/object_detection/detection_dataset.py
@@ -0,0 +1,92 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+
+from easycv.datasets.detection import DetDataset as _DetDataset
+from easycv.datasets.detection import \
+    DetImagesMixDataset as _DetImagesMixDataset
+
+from modelscope.metainfo import Datasets
+from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset
+from modelscope.msdatasets.task_datasets import TASK_DATASETS
+from modelscope.utils.constant import Tasks
+
+
+@TASK_DATASETS.register_module(
+    group_key=Tasks.image_object_detection, module_name=Datasets.DetDataset)
+class DetDataset(EasyCVBaseDataset, _DetDataset):
+    """EasyCV dataset for object detection.
+    For more details, please refer to https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/detection/raw.py .
+
+    Args:
+        split_config (dict): Dataset root path from MSDataset, e.g.
+            {"train":"local cache path"} or {"evaluation":"local cache path"}.
+        preprocessor (Preprocessor): An optional preprocessor instance; make sure it fits
+            the model if supplied. Not supported yet.
+        mode: Training or Evaluation.
+        data_source: Data source config to parse input data.
+        pipeline: Transform config list
+        profiling: If set True, will print pipeline time
+        classes: A list of class names, used in evaluation for result and groundtruth visualization
+    """
+
+    def __init__(self,
+                 split_config=None,
+                 preprocessor=None,
+                 mode=None,
+                 *args,
+                 **kwargs) -> None:
+        EasyCVBaseDataset.__init__(
+            self,
+            split_config=split_config,
+            preprocessor=preprocessor,
+            mode=mode,
+            args=args,
+            kwargs=kwargs)  # may rewrite kwargs['data_source'] in place
+        _DetDataset.__init__(self, *args, **kwargs)
+
+
+@TASK_DATASETS.register_module(
+    group_key=Tasks.image_object_detection,
+    module_name=Datasets.DetImagesMixDataset)
+class DetImagesMixDataset(EasyCVBaseDataset, _DetImagesMixDataset):
+    """EasyCV dataset for object detection, a wrapper of multiple images mixed dataset.
+    Suitable for training on multiple images mixed data augmentation like
+    mosaic and mixup. For the augmentation pipeline of mixed image data,
+    the `get_indexes` method needs to be provided to obtain the image
+    indexes, and you can set `skip_flags` to change the pipeline running
+    process. At the same time, we provide the `dynamic_scale` parameter
+    to dynamically change the output image size.
+    output boxes format: cx, cy, w, h
+
+    For more details, please refer to https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/detection/mix.py .
+
+    Args:
+        split_config (dict): Dataset root path from MSDataset, e.g.
+            {"train":"local cache path"} or {"evaluation":"local cache path"}.
+        preprocessor (Preprocessor): An optional preprocessor instance; make sure it fits
+            the model if supplied. Not supported yet.
+        mode: Training or Evaluation.
+        data_source (:obj:`DetSourceCoco`): Data source config to parse input data.
+        pipeline (Sequence[dict]): Sequence of transform object or
+            config dict to be composed.
+        dynamic_scale (tuple[int], optional): The image scale can be changed
+            dynamically. Default to None.
+        skip_type_keys (list[str], optional): Sequence of type strings to
+            be skipped in the pipeline. Default to None.
+        label_padding: out labeling padding [N, 120, 5]
+    """
+
+    def __init__(self,
+                 split_config=None,
+                 preprocessor=None,
+                 mode=None,
+                 *args,
+                 **kwargs) -> None:
+        EasyCVBaseDataset.__init__(
+            self,
+            split_config=split_config,
+            preprocessor=preprocessor,
+            mode=mode,
+            args=args,
+            kwargs=kwargs)  # may rewrite kwargs['data_source'] in place
+        _DetImagesMixDataset.__init__(self, *args, **kwargs)
diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index 6e4486dd..361b8ae0 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -1,9 +1,13 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import math
 import os
 from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional,
                     Sequence, Union)
 
 import json
 import numpy as np
+import torch
 from datasets import Dataset, DatasetDict
 from datasets import load_dataset as hf_load_dataset
 from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE
@@ -12,9 +16,11 @@ from datasets.utils.download_manager import DownloadConfig
 from datasets.utils.file_utils import (is_relative_path,
                                        relative_to_absolute_path)
 
-from modelscope.msdatasets.config import MS_DATASETS_CACHE
+from modelscope.hub.repository import DatasetRepository
 from modelscope.utils.config import ConfigDict
-from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
+from modelscope.utils.config_ds import MS_DATASETS_CACHE
+from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE,
+                                       DEFAULT_DATASET_REVISION,
                                        DatasetFormations, DownloadMode, Hubs)
 from modelscope.utils.logger import get_logger
 from .task_datasets.builder import build_task_dataset
@@ -23,6 +29,7 @@ from .utils.dataset_utils import (get_dataset_files,
                                   get_target_dataset_structure,
                                   load_dataset_builder)
 from .utils.download_utils import DatasetDownloadManager
+from .utils.upload_utils import DatasetUploadManager
 
 logger = get_logger()
 
@@ -37,6 +44,42 @@ def format_list(para) -> List:
     return para
 
 
+class MsMapDataset(torch.utils.data.Dataset):
+    """Map-style torch dataset that applies preprocessors per item."""
+
+    def __init__(self, dataset: Iterable, preprocessor_list, retained_columns,
+                 columns, to_tensor):
+        # Zero-arg super binds to this class; MsDataset is not a base class.
+        super().__init__()
+        self.dataset = dataset
+        self.preprocessor_list = preprocessor_list
+        self.to_tensor = to_tensor
+        self.retained_columns = retained_columns
+        self.columns = columns
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def type_converter(self, x):
+        # Values pass through unchanged when tensor conversion is disabled.
+        return torch.tensor(x) if self.to_tensor else x
+
+    def __getitem__(self, index):
+        item_dict = self.dataset[index]
+        res = {
+            k: self.type_converter(item_dict[k])
+            for k in self.columns
+            if (not self.to_tensor) or k in self.retained_columns
+        }
+        for preprocessor in self.preprocessor_list:
+            res.update({
+                k: self.type_converter(v)
+                for k, v in preprocessor(item_dict).items()
+                if (not self.to_tensor) or k in self.retained_columns
+            })
+        return res
+
+
 class MsDataset:
     """
     ModelScope Dataset (aka, MsDataset) is backed by a huggingface Dataset to
@@ -97,7 +140,7 @@ class MsDataset:
     @staticmethod
     def load(
         dataset_name: Union[str, list],
-        namespace: Optional[str] = 'modelscope',
+        namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE,
         target: Optional[str] = None,
         version: Optional[str] = DEFAULT_DATASET_REVISION,
         hub: Optional[Hubs] = Hubs.modelscope,
@@ -171,20 +214,27 @@ class MsDataset:
                              Mapping[str, Union[str, Sequence[str]]]]] = None,
                          download_mode: Optional[DownloadMode] = None,
                          **config_kwargs) -> Union[dict, 'MsDataset']:
+        from modelscope.hub.api import HubApi
+        api = HubApi()
+        download_dataset = ''
         if isinstance(dataset_name, str):
             dataset_formation = DatasetFormations.native
-            if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
-                    (os.path.isfile(dataset_name) and dataset_name.endswith('.py')):
+            if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(
+                    dataset_name):
                 dataset_formation = DatasetFormations.hf_compatible
+            elif os.path.isfile(dataset_name) and dataset_name.endswith('.py'):
+                dataset_formation = DatasetFormations.hf_compatible
+                file_name = os.path.basename(dataset_name)
+                download_dataset = os.path.splitext(file_name)[0]
             elif is_relative_path(dataset_name) and dataset_name.count(
                     '/') == 0:
-                from modelscope.hub.api import HubApi
-                api = HubApi()
+                download_dataset = dataset_name
                 dataset_scripts, dataset_formation, download_dir = api.fetch_dataset_scripts(
                     dataset_name, namespace, download_mode, version)
                 # dataset organized to be compatible with hf format
                 if dataset_formation == DatasetFormations.hf_compatible:
                     dataset_name = dataset_scripts['.py'][0]
+                    download_dataset = dataset_name
             else:
                 raise FileNotFoundError(
                     f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} "
@@ -219,6 +269,14 @@ class MsDataset:
         else:
             raise TypeError('path must be a str or a list, but got'
                             f' {type(dataset_name)}')
+
+        if download_dataset:
+            try:
+                api.on_dataset_download(
+                    dataset_name=download_dataset, namespace=namespace)
+            except Exception as e:
+                logger.error(e)
+
         return MsDataset.from_hf_dataset(dataset, target=target)
 
     @staticmethod
@@ -238,15 +296,15 @@ class MsDataset:
                 break
         target_subset_name, target_dataset_structure = get_target_dataset_structure(
             dataset_json, subset_name, split)
-        meta_map, file_map = get_dataset_files(target_dataset_structure,
-                                               dataset_name, namespace,
-                                               version)
+        meta_map, file_map, args_map = get_dataset_files(
+            target_dataset_structure, dataset_name, namespace, version)
         builder = load_dataset_builder(
             dataset_name,
             subset_name,
             namespace,
             meta_data_files=meta_map,
             zip_data_files=file_map,
+            args_map=args_map,
             cache_dir=MS_DATASETS_CACHE,
             version=version,
             split=list(target_dataset_structure.keys()),
@@ -279,6 +337,7 @@ class MsDataset:
         self,
         preprocessors: Union[Callable, List[Callable]],
         columns: Union[str, List[str]] = None,
+        to_tensor: bool = True,
     ):
         preprocessor_list = preprocessors if isinstance(
             preprocessors, list) else [preprocessors]
@@ -288,65 +347,29 @@ class MsDataset:
         columns = [
             key for key in self._hf_ds.features.keys() if key in columns
         ]
-        sample = next(iter(self._hf_ds))
-
-        sample_res = {k: np.array(sample[k]) for k in columns}
-        for processor in preprocessor_list:
-            sample_res.update(
-                {k: np.array(v)
-                 for k, v in processor(sample).items()})
-
-        def is_numpy_number(value):
-            return np.issubdtype(value.dtype, np.integer) or np.issubdtype(
-                value.dtype, np.floating)
-
         retained_columns = []
-        for k in sample_res.keys():
-            if not is_numpy_number(sample_res[k]):
-                logger.warning(
-                    f'Data of column {k} is non-numeric, will be removed')
-                continue
-            retained_columns.append(k)
+        if to_tensor:
+            sample = next(iter(self._hf_ds))
 
-        import math
-        import torch
+            sample_res = {k: np.array(sample[k]) for k in columns}
+            for processor in preprocessor_list:
+                sample_res.update(
+                    {k: np.array(v)
+                     for k, v in processor(sample).items()})
 
-        class MsIterableDataset(torch.utils.data.IterableDataset):
+            def is_numpy_number(value):
+                return np.issubdtype(value.dtype, np.integer) or np.issubdtype(
+                    value.dtype, np.floating)
 
-            def __init__(self, dataset: Iterable):
-                super(MsIterableDataset).__init__()
-                self.dataset = dataset
+            for k in sample_res.keys():
+                if not is_numpy_number(sample_res[k]):
+                    logger.warning(
+                        f'Data of column {k} is non-numeric, will be removed')
+                    continue
+                retained_columns.append(k)
 
-            def __len__(self):
-                return len(self.dataset)
-
-            def __iter__(self):
-                worker_info = torch.utils.data.get_worker_info()
-                if worker_info is None:  # single-process data loading
-                    iter_start = 0
-                    iter_end = len(self.dataset)
-                else:  # in a worker process
-                    per_worker = math.ceil(
-                        len(self.dataset) / float(worker_info.num_workers))
-                    worker_id = worker_info.id
-                    iter_start = worker_id * per_worker
-                    iter_end = min(iter_start + per_worker, len(self.dataset))
-
-                for idx in range(iter_start, iter_end):
-                    item_dict = self.dataset[idx]
-                    res = {
-                        k: np.array(item_dict[k])
-                        for k in columns if k in retained_columns
-                    }
-                    for preprocessor in preprocessor_list:
-                        res.update({
-                            k: np.array(v)
-                            for k, v in preprocessor(item_dict).items()
-                            if k in retained_columns
-                        })
-                    yield res
-
-        return MsIterableDataset(self._hf_ds)
+        return MsMapDataset(self._hf_ds, preprocessor_list, retained_columns,
+                            columns, to_tensor)
 
     def to_torch_dataset(
         self,
@@ -354,6 +377,7 @@ class MsDataset:
         preprocessors: Union[Callable, List[Callable]] = None,
         task_name: str = None,
         task_data_config: ConfigDict = None,
+        to_tensor: bool = True,
         **format_kwargs,
     ):
         """Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to
@@ -361,13 +385,14 @@ class MsDataset:
 
         Args:
             preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process
-                every sample of the dataset. The output type of processors is dict, and each numeric field of the dict
+                every sample of the dataset. The output type of processors is dict, and each (numeric) field of the dict
                 will be used as a field of torch.utils.data.Dataset.
-            columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the
-                preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None,
-                the output fields of processors will also be added.
+            columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only if
+                `to_tensor` is True). If the preprocessor is None, the arg columns must have at least one column.
+                If the `preprocessors` is not None, the output fields of processors will also be added.
             task_name (str, default None):  task name, refer to :obj:`Tasks` for more details
             task_data_config (ConfigDict, default None): config dict for model object.
+            to_tensor (bool, default True): whether to convert the data types of dataset column(s) to torch.tensor.
             format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`.
 
         Returns:
@@ -380,11 +405,11 @@ class MsDataset:
             )
         if isinstance(self._hf_ds, ExternalDataset):
             task_data_config.update({'preprocessor': preprocessors})
-            return build_task_dataset(task_data_config, task_name,
-                                      self._hf_ds.config_kwargs)
+            task_data_config.update(self._hf_ds.config_kwargs)
+            return build_task_dataset(task_data_config, task_name)
         if preprocessors is not None:
             return self.to_torch_dataset_with_processors(
-                preprocessors, columns=columns)
+                preprocessors, columns=columns, to_tensor=to_tensor)
         else:
             self._hf_ds.reset_format()
             self._hf_ds.set_format(
@@ -539,3 +564,93 @@ class MsDataset:
     def to_hf_dataset(self) -> Dataset:
         self._hf_ds.reset_format()
         return self._hf_ds
+
+    @staticmethod
+    def upload(object_name: str,
+               local_file_path: str,
+               dataset_name: str,
+               namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE,
+               version: Optional[str] = DEFAULT_DATASET_REVISION) -> None:
+        """Upload dataset file to the ModelScope Hub. Please login to the ModelScope Hub first.
+
+        Args:
+            object_name (str): The object name on ModelScope, in the form of your-dataset-name.zip
+            local_file_path (str): Local file to upload
+            dataset_name (str): Name of the dataset
+            namespace (str, optional): Namespace of the dataset
+            version (str, optional): Version of the dataset
+
+        Returns:
+            None
+
+        """
+        _upload_manager = DatasetUploadManager(
+            dataset_name=dataset_name, namespace=namespace, version=version)
+        _upload_manager.upload(object_name, local_file_path)
+
+    @staticmethod
+    def clone_meta(dataset_work_dir: str,
+                   dataset_id: str,
+                   revision: Optional[str] = DEFAULT_DATASET_REVISION,
+                   auth_token: Optional[str] = None,
+                   git_path: Optional[str] = None) -> None:
+        """Clone meta-file of dataset from the ModelScope Hub.
+        Args:
+            dataset_work_dir (str): Current git working directory.
+            dataset_id (str): Dataset id, in the form of your-namespace/your-dataset-name .
+            revision(`Optional[str]`):
+                revision of the dataset you want to clone from. Can be any of a branch, tag or commit hash
+            auth_token(`Optional[str]`):
+                token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
+                as the token is already saved when you login the first time, if None, we will use saved token.
+            git_path:(`Optional[str]`):
+                The git command line path, if None, we use 'git'
+        Returns:
+            None
+        """
+        _repo = DatasetRepository(
+            repo_work_dir=dataset_work_dir,
+            dataset_id=dataset_id,
+            revision=revision,
+            auth_token=auth_token,
+            git_path=git_path)
+        clone_work_dir = _repo.clone()
+        if clone_work_dir:
+            logger.info('Already cloned repo to: {}'.format(clone_work_dir))
+        else:
+            # clone() returned nothing here, so report the requested work dir.
+            logger.warning(
+                'Repo dir already exists: {}'.format(dataset_work_dir))
+
+    @staticmethod
+    def upload_meta(dataset_work_dir: str,
+                    commit_message: str,
+                    revision: Optional[str] = DEFAULT_DATASET_REVISION,
+                    auth_token: Optional[str] = None,
+                    git_path: Optional[str] = None,
+                    force: bool = False) -> None:
+        """Upload meta-file of dataset to the ModelScope Hub. Please clone the meta-data from the ModelScope Hub first.
+
+        Args:
+            dataset_work_dir (str): Current working directory.
+            commit_message (str): Commit message.
+            revision(`Optional[str]`):
+                branch that the commit is pushed to (it is passed to the repository push as `branch`).
+            auth_token(`Optional[str]`):
+                token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
+                as the token is already saved when you login the first time, if None, we will use saved token.
+            git_path:(`Optional[str]`):
+                The git command line path, if None, we use 'git'
+            force (Optional[bool]): whether to use forced-push.
+
+        Returns:
+            None
+
+        """
+        _repo = DatasetRepository(
+            repo_work_dir=dataset_work_dir,
+            dataset_id='',
+            revision=revision,
+            auth_token=auth_token,
+            git_path=git_path)
+        _repo.push(commit_message=commit_message, branch=revision, force=force)
diff --git a/modelscope/msdatasets/task_datasets/__init__.py b/modelscope/msdatasets/task_datasets/__init__.py
index 1905bf39..e2bf5bc1 100644
--- a/modelscope/msdatasets/task_datasets/__init__.py
+++ b/modelscope/msdatasets/task_datasets/__init__.py
@@ -9,16 +9,21 @@ if TYPE_CHECKING:
     from .torch_base_dataset import TorchTaskDataset
     from .veco_dataset import VecoDataset
     from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset
+    from .movie_scene_segmentation import MovieSceneSegmentationDataset
     from .video_summarization_dataset import VideoSummarizationDataset
+    from .passage_ranking_dataset import PassageRankingDataset
+
 else:
     _import_structure = {
         'base': ['TaskDataset'],
         'builder': ['TASK_DATASETS', 'build_task_dataset'],
         'torch_base_dataset': ['TorchTaskDataset'],
+        'passage_ranking_dataset': ['PassageRankingDataset'],
         'veco_dataset': ['VecoDataset'],
         'image_instance_segmentation_coco_dataset':
         ['ImageInstanceSegmentationCocoDataset'],
         'video_summarization_dataset': ['VideoSummarizationDataset'],
+        'movie_scene_segmentation': ['MovieSceneSegmentationDataset'],
     }
     import sys
 
diff --git a/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
index 04c8e142..1c7bc249 100644
--- a/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
+++ b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
@@ -1,3 +1,5 @@
+# Part of the implementation is borrowed and modified from MMDetection, publicly available at
+# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/coco.py
 import os.path as osp
 
 import numpy as np
@@ -59,18 +61,21 @@ class ImageInstanceSegmentationCocoDataset(TorchTaskDataset):
                  preprocessor=None,
                  classes=None,
                  seg_prefix=None,
+                 folder_name=None,
+                 ann_file=None,
+                 img_prefix=None,
                  test_mode=False,
                  filter_empty_gt=True,
                  **kwargs):
-        self.data_root = next(iter(split_config.values()))
+        data_root = next(iter(split_config.values()))
+        self.data_root = osp.join(data_root,
+                                  folder_name) if folder_name else data_root
         self.split = next(iter(split_config.keys()))
         self.preprocessor = preprocessor
 
-        self.ann_file = osp.join(self.data_root,
-                                 DATASET_STRUCTURE[self.split]['annotation'])
+        self.ann_file = osp.join(self.data_root, ann_file)
 
-        self.img_prefix = osp.join(self.data_root,
-                                   DATASET_STRUCTURE[self.split]['images'])
+        self.img_prefix = osp.join(self.data_root, img_prefix)
         self.seg_prefix = seg_prefix
         self.test_mode = test_mode
         self.filter_empty_gt = filter_empty_gt
diff --git a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py
new file mode 100644
index 00000000..b1bc40f8
--- /dev/null
+++ b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .movie_scene_segmentation_dataset import MovieSceneSegmentationDataset
diff --git a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py
new file mode 100644
index 00000000..68cbf918
--- /dev/null
+++ b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py
@@ -0,0 +1,172 @@
+# The implementation here is modified based on BaSSL,
+# originally Apache 2.0 License and publicly available at https://github.com/kakaobrain/bassl
+import copy
+import os
+import os.path as osp
+import random
+
+import json
+import torch
+from torchvision.datasets.folder import pil_loader
+
+from modelscope.metainfo import Models
+from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
+from modelscope.msdatasets.task_datasets.torch_base_dataset import \
+    TorchTaskDataset
+from modelscope.utils.constant import Tasks
+from . import sampler
+
+DATASET_STRUCTURE = {
+    'train': {
+        'annotation': 'anno/train.json',
+        'images': 'keyf_240p',
+        'feat': 'feat'
+    },
+    'test': {
+        'annotation': 'anno/test.json',
+        'images': 'keyf_240p',
+        'feat': 'feat'
+    }
+}
+
+
+@TASK_DATASETS.register_module(
+    Tasks.movie_scene_segmentation, module_name=Models.resnet50_bert)
+class MovieSceneSegmentationDataset(TorchTaskDataset):
+    """dataset for movie scene segmentation.
+
+    Args:
+        split_config (dict): Annotation file path. {"train":"xxxxx"}
+        data_root (str, optional): Data root for ``ann_file``,
+            ``img_prefix``, ``seg_prefix``, ``proposal_file`` if specified.
+        test_mode (bool, optional): If set True, annotation will not be loaded.
+    """
+
+    def __init__(self, **kwargs):
+        split_config = kwargs['split_config']
+
+        self.data_root = next(iter(split_config.values()))
+        if not osp.exists(self.data_root):
+            self.data_root = osp.dirname(self.data_root)
+            assert osp.exists(self.data_root)
+
+        self.split = next(iter(split_config.keys()))
+        self.preprocessor = kwargs['preprocessor']
+
+        self.ann_file = osp.join(self.data_root,
+                                 DATASET_STRUCTURE[self.split]['annotation'])
+        self.img_prefix = osp.join(self.data_root,
+                                   DATASET_STRUCTURE[self.split]['images'])
+        self.feat_prefix = osp.join(self.data_root,
+                                    DATASET_STRUCTURE[self.split]['feat'])
+
+        self.test_mode = kwargs['test_mode']
+        if self.test_mode:
+            self.preprocessor.eval()
+        else:
+            self.preprocessor.train()
+
+        self.cfg = kwargs.pop('cfg', None)
+
+        self.num_keyframe = self.cfg.num_keyframe if self.cfg is not None else 3
+        self.use_single_keyframe = self.cfg.use_single_keyframe if self.cfg is not None else False
+
+        self.load_data()
+        self.init_sampler(self.cfg)
+
+    def __len__(self):
+        """Total number of samples of data."""
+        return len(self.anno_data)
+
+    def __getitem__(self, idx: int):
+        data = self.anno_data[
+            idx]  # {"video_id", "shot_id", "num_shot", "boundary_label"}
+        vid, sid = data['video_id'], data['shot_id']
+        num_shot = data['num_shot']
+
+        shot_idx = self.shot_sampler(int(sid), num_shot)
+
+        video = self.load_shot_list(vid, shot_idx)
+        if self.preprocessor is None:
+            video = torch.stack(video, dim=0)
+            video = video.view(-1, self.num_keyframe, 3, 224, 224)
+        else:
+            video = self.preprocessor(video)
+
+        payload = {
+            'idx': idx,
+            'vid': vid,
+            'sid': sid,
+            'video': video,
+            'label': abs(data['boundary_label']),  # ignore -1 label.
+        }
+        return payload
+
+    def load_data(self):
+        """Load annotations; in training mode also index boundary labels."""
+        self.tmpl = '{}/shot_{}_img_{}.jpg'  # video_id, shot_id, shot_num
+
+        # The annotation file is read the same way in both modes.
+        with open(self.ann_file) as f:
+            self.anno_data = json.load(f)
+
+        if not self.test_mode:
+            self.vidsid2label = {
+                f"{it['video_id']}_{it['shot_id']}": it['boundary_label']
+                for it in self.anno_data
+            }
+
+    def init_sampler(self, cfg):
+        # shot sampler
+        if cfg is not None:
+            self.sampling_method = cfg.sampling_method.name
+            sampler_args = copy.deepcopy(
+                cfg.sampling_method.params.get(self.sampling_method, {}))
+            if self.sampling_method == 'instance':
+                self.shot_sampler = sampler.InstanceShotSampler()
+            elif self.sampling_method == 'temporal':
+                self.shot_sampler = sampler.TemporalShotSampler(**sampler_args)
+            elif self.sampling_method == 'shotcol':
+                self.shot_sampler = sampler.SequenceShotSampler(**sampler_args)
+            elif self.sampling_method == 'bassl':
+                self.shot_sampler = sampler.SequenceShotSampler(**sampler_args)
+            elif self.sampling_method == 'bassl+shotcol':
+                self.shot_sampler = sampler.SequenceShotSampler(**sampler_args)
+            elif self.sampling_method == 'sbd':
+                self.shot_sampler = sampler.NeighborShotSampler(**sampler_args)
+            else:
+                raise NotImplementedError
+        else:
+            self.shot_sampler = sampler.NeighborShotSampler()
+
+    def load_shot_list(self, vid, shot_idx):
+        shot_list = []
+        cache = {}
+        for sidx in shot_idx:
+            vidsid = f'{vid}_{sidx:04d}'
+            if vidsid in cache:
+                shot = cache[vidsid]
+            else:
+                shot_path = os.path.join(
+                    self.img_prefix, self.tmpl.format(vid, f'{sidx:04d}',
+                                                      '{}'))
+                shot = self.load_shot_keyframes(shot_path)
+                cache[vidsid] = shot
+            shot_list.extend(shot)
+        return shot_list
+
+    def load_shot_keyframes(self, path):
+        """Load the keyframe image(s) of a single shot.
+
+        `path` is a template whose `{}` placeholder takes the keyframe index.
+        Returns a list of images as loaded by `pil_loader`.
+        """
+        if not self.test_mode and self.use_single_keyframe:
+            # load one randomly sampled keyframe
+            keyframe_ids = [random.randint(0, self.num_keyframe - 1)]
+        else:
+            # load all keyframes
+            keyframe_ids = range(self.num_keyframe)
+
+        # one image per selected keyframe index
+        return [pil_loader(path.format(i)) for i in keyframe_ids]
diff --git a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/sampler.py b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/sampler.py
new file mode 100644
index 00000000..0fc2fe0f
--- /dev/null
+++ b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/sampler.py
@@ -0,0 +1,102 @@
+# ------------------------------------------------------------------------------------
+# BaSSL
+# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# Github: https://github.com/kakaobrain/bassl
+# ------------------------------------------------------------------------------------
+
+import random
+
+import numpy as np
+
+
+class InstanceShotSampler:
+    """ This is for instance at pre-training stage """
+
+    def __call__(self, center_sid: int, *args, **kwargs):
+        return center_sid
+
+
+class TemporalShotSampler:
+    """ This is for temporal at pre-training stage """
+
+    def __init__(self, neighbor_size: int):
+        self.N = neighbor_size
+
+    def __call__(self, center_sid: int, total_num_shot: int):
+        """ we randomly sample one shot from neighbor shots within local temporal window
+        """
+        shot_idx = center_sid + np.arange(
+            -self.N, self.N + 1
+        )  # total number of neighbor shots = 2N+1 (query (1) + neighbors (2*N))
+        shot_idx = np.clip(shot_idx, 0,
+                           total_num_shot)  # deal with out-of-boundary indices
+        shot_idx = random.choice(
+            np.unique(np.delete(shot_idx, np.where(shot_idx == center_sid))))
+        return shot_idx
+
+
+class SequenceShotSampler:
+    """ This is for bassl or shotcol at pre-training stage """
+
+    def __init__(self, neighbor_size: int, neighbor_interval: int):
+        self.interval = neighbor_interval
+        self.window_size = neighbor_size * self.interval  # temporal coverage
+
+    def __call__(self,
+                 center_sid: int,
+                 total_num_shot: int,
+                 sparse_method: str = 'edge'):
+        """
+        Args:
+            center_sid: index of center shot
+            total_num_shot: last index of shot for given video
+            sparse_method: 'edge' (two edge shots) or 'edge+center' (edge
+                    shots plus the center shot) picked from the dense sequence
+        """
+
+        dense_shot_idx = center_sid + np.arange(
+            -self.window_size, self.window_size + 1,
+            self.interval)  # total number of shots = 2*neighbor_size+1
+
+        if dense_shot_idx[0] < 0:
+            # if center_sid is near left-side of video, we shift window rightward
+            # so that the leftmost index is 0
+            dense_shot_idx -= dense_shot_idx[0]
+        elif dense_shot_idx[-1] > (total_num_shot - 1):
+            # if center_sid is near right-side of video, we shift window leftward
+            # so that the rightmost index is total_num_shot - 1
+            dense_shot_idx -= dense_shot_idx[-1] - (total_num_shot - 1)
+
+        # to deal with videos that have smaller number of shots than window size
+        dense_shot_idx = np.clip(dense_shot_idx, 0, total_num_shot)
+
+        if sparse_method == 'edge':
+            # in this case, we use two edge shots as sparse sequence
+            sparse_stride = len(dense_shot_idx) - 1
+            sparse_idx_to_dense = np.arange(0, len(dense_shot_idx),
+                                            sparse_stride)
+        elif sparse_method == 'edge+center':
+            # in this case, we use two edge shots + center shot as sparse sequence
+            sparse_idx_to_dense = np.array(
+                [0, len(dense_shot_idx) - 1, len(dense_shot_idx) // 2])
+        else:
+            raise ValueError(f'unknown sparse_method: {sparse_method}')
+
+        return [sparse_idx_to_dense, dense_shot_idx]
+
+
+class NeighborShotSampler:
+    """ This is for scene boundary detection (sbd), i.e., fine-tuning stage """
+
+    def __init__(self, neighbor_size: int = 8):
+        self.neighbor_size = neighbor_size
+
+    def __call__(self, center_sid: int, total_num_shot: int):
+        # total number of shots = 2 * neighbor_size + 1
+        shot_idx = center_sid + np.arange(-self.neighbor_size,
+                                          self.neighbor_size + 1)
+        shot_idx = np.clip(shot_idx, 0,
+                           total_num_shot)  # for out-of-boundary indices
+
+        return shot_idx
diff --git a/modelscope/msdatasets/task_datasets/passage_ranking_dataset.py b/modelscope/msdatasets/task_datasets/passage_ranking_dataset.py
new file mode 100644
index 00000000..517e0d36
--- /dev/null
+++ b/modelscope/msdatasets/task_datasets/passage_ranking_dataset.py
@@ -0,0 +1,151 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import random
+from dataclasses import dataclass
+from typing import Any, Dict, List, Tuple, Union
+
+import torch
+from datasets import Dataset, IterableDataset, concatenate_datasets
+from torch.utils.data import ConcatDataset
+from transformers import DataCollatorWithPadding
+
+from modelscope.metainfo import Models
+from modelscope.utils.constant import ModeKeys, Tasks
+from .base import TaskDataset
+from .builder import TASK_DATASETS
+from .torch_base_dataset import TorchTaskDataset
+
+
+@TASK_DATASETS.register_module(
+    group_key=Tasks.passage_ranking, module_name=Models.bert)
+class PassageRankingDataset(TorchTaskDataset):
+    """Task dataset that groups a query with its candidate passages.
+
+    In train mode an item is one random positive passage followed by
+    ``neg_samples`` random negatives; in any other mode an item carries every
+    positive and negative passage of the group, 0/1 labels and the query id.
+
+    Keyword options (defaults in brackets):
+        seed [42], query_sequence ['query'],
+        pos_sequence ['positive_passages'],
+        neg_sequence ['negative_passages'],
+        passage_text_fileds [['title', 'text']], qid_field ['query_id'],
+        train [{}]: read in train mode only, for its 'neg_samples' [4].
+    """
+
+    def __init__(self,
+                 datasets: Union[Any, List[Any]],
+                 mode,
+                 preprocessor=None,
+                 *args,
+                 **kwargs):
+        self.seed = kwargs.get('seed', 42)
+        self.permutation = None
+        self.datasets = None
+        # every keyword argument doubles as the dataset configuration
+        cfg = self.dataset_config = kwargs
+        self.query_sequence = cfg.get('query_sequence', 'query')
+        self.pos_sequence = cfg.get('pos_sequence', 'positive_passages')
+        self.neg_sequence = cfg.get('neg_sequence', 'negative_passages')
+        self.passage_text_fileds = cfg.get('passage_text_fileds',
+                                           ['title', 'text'])
+        self.qid_field = cfg.get('qid_field', 'query_id')
+        if mode == ModeKeys.TRAIN:
+            # number of negatives per item; only set in train mode
+            self.neg_samples = kwargs.get('train', {}).get('neg_samples', 4)
+
+        # NOTE(review): *args is accepted but not forwarded to the base class
+        super().__init__(datasets, mode, preprocessor, **kwargs)
+
+    def __getitem__(self, index) -> Any:
+        """Return the train-style or test-style sample for ``index``."""
+        is_train = self.mode == ModeKeys.TRAIN
+        fetch = self.__get_train_item__ if is_train else self.__get_test_item__
+        return fetch(index)
+
+    def _join_passage_fields(self, passages):
+        """Join the configured text fields of each passage into one string."""
+        fields = self.passage_text_fileds
+        return [' '.join(psg[name] for name in fields) for psg in passages]
+
+    def __get_test_item__(self, index):
+        """Build the evaluation sample: all positives, then all negatives.
+
+        The sample holds the query id repeated once per passage, the query,
+        the passage strings and a 1/0 label per passage.
+        """
+        record = self._inner_dataset[index]
+        query = record[self.query_sequence]
+        positives = self._join_passage_fields(record[self.pos_sequence])
+        negatives = self._join_passage_fields(record[self.neg_sequence])
+        # label 1 for each positive passage, 0 for each negative passage
+        labels = [1] * len(positives) + [0] * len(negatives)
+        qid = record[self.qid_field]
+
+        sample = {
+            'qid': torch.LongTensor([int(qid)] * len(labels)),
+            self.preprocessor.first_sequence: query,
+            self.preprocessor.second_sequence: positives + negatives,
+            'labels': torch.LongTensor(labels)
+        }
+        return self.prepare_sample(sample)
+
+    def __get_train_item__(self, index):
+        """Build the training sample: one positive plus sampled negatives.
+
+        The positive passage always comes first in the passage list.
+        """
+        record = self._inner_dataset[index]
+        query = record[self.query_sequence]
+        positives = self._join_passage_fields(record[self.pos_sequence])
+        negatives = self._join_passage_fields(record[self.neg_sequence])
+
+        chosen_pos = random.choice(positives)
+        # too few negatives: draw with replacement, otherwise without
+        if len(negatives) >= self.neg_samples:
+            chosen_negs = random.sample(negatives, k=self.neg_samples)
+        else:
+            chosen_negs = random.choices(negatives, k=self.neg_samples)
+
+        sample = {
+            self.preprocessor.first_sequence: query,
+            self.preprocessor.second_sequence: [chosen_pos] + chosen_negs,
+        }
+        return self.prepare_sample(sample)
+
+    def __len__(self):
+        """Number of items in the inner dataset."""
+        return len(self._inner_dataset)
+
+    def prepare_dataset(self, datasets: Union[Any, List[Any]]) -> Any:
+        """Merge the input dataset(s) into a single dataset.
+
+        Args:
+            datasets: The original dataset, or a list of datasets.
+
+        Returns: The input itself when it is not a list, the sole element of
+            a one-element list, a ConcatDataset of a longer list, and None
+            for an empty list.
+
+        """
+        if not isinstance(datasets, List):
+            return datasets
+        if len(datasets) == 1:
+            return datasets[0]
+        if len(datasets) > 1:
+            return ConcatDataset(datasets)
+        # empty list: nothing to merge, so the result is None
+        return None
+
+    def prepare_sample(self, data):
+        """Run the preprocessor over one fetched sample.
+
+        Args:
+            data: The data fetched from the dataset.
+
+        Returns: The preprocessor output, or ``data`` unchanged when the
+            preprocessor is None.
+
+        """
+        if self.preprocessor is None:
+            return data
+        return self.preprocessor(data)
diff --git a/modelscope/msdatasets/utils/dataset_builder.py b/modelscope/msdatasets/utils/dataset_builder.py
index 85489c58..0548f7b9 100644
--- a/modelscope/msdatasets/utils/dataset_builder.py
+++ b/modelscope/msdatasets/utils/dataset_builder.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 from typing import Mapping, Sequence, Union
 
@@ -5,10 +7,11 @@ import datasets
 import pandas as pd
 import pyarrow as pa
 from datasets.info import DatasetInfo
+from datasets.naming import camelcase_to_snakecase
 from datasets.packaged_modules import csv
 from datasets.utils.filelock import FileLock
 
-from modelscope.utils.constant import DownloadMode
+from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DownloadMode
 from modelscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -27,7 +30,6 @@ class MsCsvDatasetBuilder(csv.Csv):
         zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None,
         **config_kwargs,
     ):
-        self.namespace = namespace
         super().__init__(
             cache_dir=cache_dir,
             name=subset_name,
@@ -35,9 +37,9 @@ class MsCsvDatasetBuilder(csv.Csv):
             data_files=meta_data_files,
             **config_kwargs)
 
-        self.name = dataset_name
-        self.info.builder_name = self.name
-        self._cache_dir = self._build_cache_dir()
+        self.name = camelcase_to_snakecase(dataset_name)
+        self.info.builder_name = dataset_name
+        self._cache_dir = self._build_cache_dir(namespace=namespace)
         lock_path = os.path.join(
             self._cache_dir_root,
             self._cache_dir.replace(os.sep, '_') + '.lock')
@@ -48,7 +50,6 @@ class MsCsvDatasetBuilder(csv.Csv):
                     logger.info(
                         f'Overwrite dataset info from restored data version, cache_dir is {self._cache_dir}'
                     )
-                    self.info = DatasetInfo.from_directory(self._cache_dir)
                 # dir exists but no data, remove the empty dir as data aren't available anymore
                 else:
                     logger.warning(
@@ -57,14 +58,17 @@ class MsCsvDatasetBuilder(csv.Csv):
                     os.rmdir(self._cache_dir)
         self.zip_data_files = zip_data_files
 
-    def _relative_data_dir(self, with_version=True, with_hash=True) -> str:
+    def _relative_data_dir(self,
+                           with_version=True,
+                           with_hash=True,
+                           namespace=DEFAULT_DATASET_NAMESPACE) -> str:
         """Relative path of this dataset in cache_dir:
         Will be:
             self.name/self.config.version/self.hash/
         or if a namespace has been specified:
             self.namespace___self.name/self.config.version/self.hash/
         """
-        builder_data_dir = self.name if self.namespace is None else f'{self.namespace}___{self.name}'
+        builder_data_dir = self.info.builder_name if namespace is None else f'{namespace}___{self.info.builder_name}'
         builder_config = self.config
         hash = self.hash
         if builder_config:
@@ -76,10 +80,11 @@ class MsCsvDatasetBuilder(csv.Csv):
             builder_data_dir = os.path.join(builder_data_dir, hash)
         return builder_data_dir
 
-    def _build_cache_dir(self):
+    def _build_cache_dir(self, namespace=DEFAULT_DATASET_NAMESPACE):
         builder_data_dir = os.path.join(
             self._cache_dir_root,
-            self._relative_data_dir(with_version=False, with_hash=True))
+            self._relative_data_dir(
+                with_version=False, with_hash=True, namespace=namespace))
 
         return builder_data_dir
 
@@ -97,15 +102,8 @@ class MsCsvDatasetBuilder(csv.Csv):
                 datasets.SplitGenerator(
                     name=split_name,
                     gen_kwargs={
-                        'files':
-                        dl_manager.iter_files(files),
-                        'base_dir':
-                        os.path.join(
-                            zip_data_files.get(split_name),
-                            os.path.splitext(
-                                self.zip_data_files.get(split_name))[0])
-                        if self.zip_data_files.get(split_name) else
-                        zip_data_files.get(split_name)
+                        'files': dl_manager.iter_files(files),
+                        'base_dir': zip_data_files.get(split_name)
                     }))
         return splits
 
@@ -161,6 +159,7 @@ class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder):
         self.zip_data_files = zip_data_files
         self.split_path_dict = None
         self.config = None
+        self.info = DatasetInfo.from_dict({'builder_name': dataset_name})
         self._cache_dir_root = os.path.expanduser(cache_dir)
         self._cache_dir = self._build_cache_dir()
         self._config_kwargs = config_kwargs
@@ -181,12 +180,8 @@ class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder):
         self._download_and_prepare(dl_manager=dl_manager)
 
     def _download_and_prepare(self, dl_manager):
-        split_path_dict = dl_manager.download_and_extract(self.zip_data_files)
-        self.split_path_dict = {
-            k: os.path.join(v,
-                            os.path.splitext(self.zip_data_files[k])[0])
-            for k, v in split_path_dict.items()
-        }
+        self.split_path_dict = dl_manager.download_and_extract(
+            self.zip_data_files)
 
     def as_dataset(self):
         return ExternalDataset(self.split_path_dict, self._config_kwargs)
diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py
index 09556d84..ef42f75f 100644
--- a/modelscope/msdatasets/utils/dataset_utils.py
+++ b/modelscope/msdatasets/utils/dataset_utils.py
@@ -1,6 +1,8 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 from collections import defaultdict
-from typing import Mapping, Optional, Sequence, Union
+from typing import Any, Mapping, Optional, Sequence, Union
 
 from datasets.builder import DatasetBuilder
 
@@ -11,6 +13,14 @@ from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder
 logger = get_logger()
 
 
+def format_dataset_structure(dataset_structure):
+    """Keep only the entries that have a truthy 'meta' or 'file' value."""
+
+    def _has_data(item):
+        return item[1].get('meta') or item[1].get('file')
+    return dict(filter(_has_data, dataset_structure.items()))
+
+
 def get_target_dataset_structure(dataset_structure: dict,
                                  subset_name: Optional[str] = None,
                                  split: Optional[str] = None):
@@ -56,7 +66,8 @@ def get_target_dataset_structure(dataset_structure: dict,
             f'No subset_name specified, defaulting to the {target_subset_name}'
         )
     # verify dataset split
-    target_dataset_structure = dataset_structure[target_subset_name]
+    target_dataset_structure = format_dataset_structure(
+        dataset_structure[target_subset_name])
     if split and split not in target_dataset_structure:
         raise ValueError(
             f'split {split} not found. Available: {target_dataset_structure.keys()}'
@@ -83,6 +94,7 @@ def get_dataset_files(subset_split_into: dict,
     """
     meta_map = defaultdict(dict)
     file_map = defaultdict(dict)
+    args_map = defaultdict(dict)
     from modelscope.hub.api import HubApi
     modelscope_api = HubApi()
     for split, info in subset_split_into.items():
@@ -90,7 +102,8 @@ def get_dataset_files(subset_split_into: dict,
             info.get('meta', ''), dataset_name, namespace, revision)
         if info.get('file'):
             file_map[split] = info['file']
-    return meta_map, file_map
+        args_map[split] = info.get('args')
+    return meta_map, file_map, args_map
 
 
 def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str,
@@ -98,12 +111,16 @@ def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str,
                                                              Sequence[str]]],
                          zip_data_files: Mapping[str, Union[str,
                                                             Sequence[str]]],
-                         cache_dir: str, version: Optional[Union[str]],
-                         split: Sequence[str],
+                         args_map: Mapping[str, Any], cache_dir: str,
+                         version: Optional[Union[str]], split: Sequence[str],
                          **config_kwargs) -> DatasetBuilder:
     sub_dir = os.path.join(version, '_'.join(split))
     meta_data_file = next(iter(meta_data_files.values()))
     if not meta_data_file:
+        args_map = next(iter(args_map.values()))
+        if args_map is None:
+            args_map = {}
+        args_map.update(config_kwargs)
         builder_instance = TaskSpecificDatasetBuilder(
             dataset_name=dataset_name,
             namespace=namespace,
@@ -112,7 +129,7 @@ def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str,
             meta_data_files=meta_data_files,
             zip_data_files=zip_data_files,
             hash=sub_dir,
-            **config_kwargs)
+            **args_map)
     elif meta_data_file.endswith('.csv'):
         builder_instance = MsCsvDatasetBuilder(
             dataset_name=dataset_name,
diff --git a/modelscope/msdatasets/utils/download_utils.py b/modelscope/msdatasets/utils/download_utils.py
index bc637f0e..2e21bf50 100644
--- a/modelscope/msdatasets/utils/download_utils.py
+++ b/modelscope/msdatasets/utils/download_utils.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Optional
 
 from datasets.utils.download_manager import DownloadConfig, DownloadManager
@@ -34,8 +36,8 @@ class DatasetDownloadManager(DownloadManager):
         url_or_filename = str(url_or_filename)
         if is_relative_path(url_or_filename):
             # fetch oss files
-            return self.oss_utilities.download(url_or_filename,
-                                               self.download_config.cache_dir)
+            return self.oss_utilities.download(
+                url_or_filename, download_config=download_config)
         else:
             return cached_path(
                 url_or_filename, download_config=download_config)
diff --git a/modelscope/msdatasets/utils/oss_utils.py b/modelscope/msdatasets/utils/oss_utils.py
index 83cfc7dd..4a403876 100644
--- a/modelscope/msdatasets/utils/oss_utils.py
+++ b/modelscope/msdatasets/utils/oss_utils.py
@@ -1,6 +1,7 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from __future__ import print_function
 import os
-import sys
 
 import oss2
 from datasets.utils.file_utils import hash_url_to_filename
@@ -19,7 +20,20 @@ class OssUtilities:
         self.oss_dir = oss_config['Dir']
         self.oss_backup_dir = oss_config['BackupDir']
 
-    def download(self, oss_file_name, cache_dir):
+        self.upload_resumable_tmp_store = '/tmp/modelscope/tmp_dataset'
+        self.upload_multipart_threshold = 50 * 1024 * 1024
+        self.upload_part_size = 1 * 1024 * 1024
+        self.upload_num_threads = 4
+        self.upload_max_retries = 3
+
+    @staticmethod
+    def _percentage(consumed_bytes, total_bytes):
+        if total_bytes:  # progress callback; silent when the total is falsy
+            done = float(consumed_bytes) / float(total_bytes)
+            print('\r{0}% '.format(int(100 * done)), end='', flush=True)
+
+    def download(self, oss_file_name, download_config):
+        cache_dir = download_config.cache_dir
         candidate_key = os.path.join(self.oss_dir, oss_file_name)
         candidate_key_backup = os.path.join(self.oss_backup_dir, oss_file_name)
         file_oss_key = candidate_key if self.bucket.object_exists(
@@ -27,11 +41,36 @@ class OssUtilities:
         filename = hash_url_to_filename(file_oss_key, etag=None)
         local_path = os.path.join(cache_dir, filename)
 
-        def percentage(consumed_bytes, total_bytes):
-            if total_bytes:
-                rate = int(100 * (float(consumed_bytes) / float(total_bytes)))
-                print('\r{0}% '.format(rate), end='', flush=True)
-
-        self.bucket.get_object_to_file(
-            file_oss_key, local_path, progress_callback=percentage)
+        if download_config.force_download or not os.path.exists(local_path):
+            oss2.resumable_download(
+                self.bucket,
+                file_oss_key,
+                local_path,
+                multiget_threshold=0,
+                progress_callback=self._percentage)
         return local_path
+
+    def upload(self, oss_object_name: str, local_file_path: str) -> str:
+        """Resumable-upload a local file; return the object key written to.
+
+        A failed attempt is retried, and the exception is re-raised once
+        ``self.upload_max_retries`` attempts have been made.
+        """
+        object_key = os.path.join(self.oss_dir, oss_object_name)
+        options = dict(
+            store=oss2.ResumableStore(root=self.upload_resumable_tmp_store),
+            multipart_threshold=self.upload_multipart_threshold,
+            part_size=self.upload_part_size,
+            progress_callback=self._percentage,
+            num_threads=self.upload_num_threads)
+        attempts = 0
+        while True:
+            attempts += 1
+            try:
+                oss2.resumable_upload(self.bucket, object_key,
+                                      local_file_path, **options)
+            except Exception:
+                if attempts >= self.upload_max_retries:
+                    raise
+            else:
+                return object_key
diff --git a/modelscope/msdatasets/utils/upload_utils.py b/modelscope/msdatasets/utils/upload_utils.py
new file mode 100644
index 00000000..4813b89f
--- /dev/null
+++ b/modelscope/msdatasets/utils/upload_utils.py
@@ -0,0 +1,23 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from .oss_utils import OssUtilities
+
+
+class DatasetUploadManager(object):
+    """Uploads local files through an OssUtilities built for one dataset."""
+
+    def __init__(self, dataset_name: str, namespace: str, version: str):
+        from modelscope.hub.api import HubApi
+        api = HubApi()
+        # the cookies returned by the hub API are passed to the config request
+        access_config = api.get_dataset_access_config_session(
+            cookies=api.check_cookies_upload_data(use_cookies=True),
+            dataset_name=dataset_name,
+            namespace=namespace,
+            revision=version)
+        self.oss_utilities = OssUtilities(access_config)
+
+    def upload(self, object_name: str, local_file_path: str) -> str:
+        """Upload a local file under ``object_name``; return its OSS key."""
+        return self.oss_utilities.upload(
+            oss_object_name=object_name, local_file_path=local_file_path)
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index 640d67fa..d8d2458a 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -7,6 +7,7 @@ class OutputKeys(object):
     LOSS = 'loss'
     LOGITS = 'logits'
     SCORES = 'scores'
+    SCORE = 'score'
     LABEL = 'label'
     LABELS = 'labels'
     INPUT_IDS = 'input_ids'
@@ -20,6 +21,7 @@ class OutputKeys(object):
     POLYGONS = 'polygons'
     OUTPUT = 'output'
     OUTPUT_IMG = 'output_img'
+    OUTPUT_VIDEO = 'output_video'
     OUTPUT_PCM = 'output_pcm'
     IMG_EMBEDDING = 'img_embedding'
     SPO_LIST = 'spo_list'
@@ -34,6 +36,10 @@ class OutputKeys(object):
     UUID = 'uuid'
     WORD = 'word'
     KWS_LIST = 'kws_list'
+    HISTORY = 'history'
+    TIMESTAMPS = 'timestamps'
+    SPLIT_VIDEO_NUM = 'split_video_num'
+    SPLIT_META_LIST = 'split_meta_list'
 
 
 TASK_OUTPUTS = {
@@ -53,6 +59,15 @@ TASK_OUTPUTS = {
     # }
     Tasks.ocr_recognition: [OutputKeys.TEXT],
 
+    # face 2d keypoint result for single sample
+    #   {
+    #       "keypoints": [
+    #           [x1, y1]*106
+    #       ],
+    #       "poses": [pitch, roll, yaw]
+    #   }
+    Tasks.face_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.POSES],
+
     # face detection result for single sample
     #   {
     #       "scores": [0.9, 0.1, 0.05, 0.05]
@@ -72,6 +87,14 @@ TASK_OUTPUTS = {
     Tasks.face_detection:
     [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS],
 
+    # facial expression recognition result for single sample
+    #   {
+    #       "scores": [0.9, 0.1, 0.02, 0.02, 0.02, 0.02, 0.02],
+    #       "labels": ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']
+    #   }
+    Tasks.facial_expression_recognition:
+    [OutputKeys.SCORES, OutputKeys.LABELS],
+
     # face recognition result for single sample
     #   {
     #       "img_embedding": np.array with shape [1, D],
@@ -129,6 +152,12 @@ TASK_OUTPUTS = {
     Tasks.image_segmentation:
     [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.MASKS],
 
+    # semantic segmentation result for single sample
+    #   {
+    #       "masks": [np.array # 2D array with shape [height, width]]
+    #   }
+    Tasks.semantic_segmentation: [OutputKeys.MASKS],
+
     # image matting result for single sample
     # {
     #   "output_img": np.array with shape(h, w, 4)
@@ -180,23 +209,58 @@ TASK_OUTPUTS = {
     #               [[score]*15]
     #              ]
     #   "boxes": [
-    #               [[x1, y1], [x2, y2]],
-    #               [[x1, y1], [x2, y2]],
-    #               [[x1, y1], [x2, y2]],
+    #               [x1, y1, x2, y2],
+    #               [x1, y1, x2, y2],
+    #               [x1, y1, x2, y2],
     #             ]
     # }
     Tasks.body_2d_keypoints:
     [OutputKeys.POSES, OutputKeys.SCORES, OutputKeys.BOXES],
 
+    # 3D human body keypoints detection result for single sample
+    # {
+    #   "poses": [		    # 3d pose coordinate in camera coordinate
+    #     	[[x, y, z]*17],	# joints of per image
+    #     	[[x, y, z]*17],
+    #     	...
+    #     ],
+    #   "timestamps": [     # timestamps of all frames
+    #     "00:00:0.230",
+    #     "00:00:0.560",
+    #     "00:00:0.690",
+    #   ],
+    #   "output_video": "path_to_rendered_video" , this is optional
+    # and is only available when the "render" option is enabled.
+    # }
+    Tasks.body_3d_keypoints:
+    [OutputKeys.POSES, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO],
+
+    # 2D hand keypoints result for single sample
+    # {
+    #     "keypoints": [
+    #                     [[x, y, score] * 21],
+    #                     [[x, y, score] * 21],
+    #                     [[x, y, score] * 21],
+    #                  ],
+    #     "boxes": [
+    #                 [x1, y1, x2, y2],
+    #                 [x1, y1, x2, y2],
+    #                 [x1, y1, x2, y2],
+    #             ]
+    # }
+    Tasks.hand_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.BOXES],
+
     # video single object tracking result for single video
     # {
     #   "boxes": [
     #               [x1, y1, x2, y2],
     #               [x1, y1, x2, y2],
     #               [x1, y1, x2, y2],
-    #             ]
+    #             ],
+    #   "timestamps": ["hh:mm:ss", "hh:mm:ss", "hh:mm:ss"]
     # }
-    Tasks.video_single_object_tracking: [OutputKeys.BOXES],
+    Tasks.video_single_object_tracking:
+    [OutputKeys.BOXES, OutputKeys.TIMESTAMPS],
 
     # live category recognition result for single video
     # {
@@ -229,6 +293,35 @@ TASK_OUTPUTS = {
     #    "output_img": np.ndarray with shape [height, width, 3]
     # }
     Tasks.virtual_try_on: [OutputKeys.OUTPUT_IMG],
+    # text driven segmentation result for single sample
+    #   {
+    #       "masks": [
+    #           np.array # 2D array containing only 0, 255
+    #       ]
+    #   }
+    Tasks.text_driven_segmentation: [OutputKeys.MASKS],
+    # shop segmentation result for single sample
+    #   {
+    #       "masks": [
+    #           np.array # 2D array containing only 0, 255
+    #       ]
+    #   }
+    Tasks.shop_segmentation: [OutputKeys.MASKS],
+    # movie scene segmentation result for a single video
+    # {
+    #        "split_video_num":3,
+    #        "split_meta_list":
+    #        [
+    #           {
+    #               "shot": [0,1,2],
+    #               "frame": [start_frame, end_frame],
+    #               "timestamp": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245']
+    #           }
+    #        ]
+    #
+    # }
+    Tasks.movie_scene_segmentation:
+    [OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_LIST],
 
     # ============ nlp tasks ===================
 
@@ -273,8 +366,7 @@ TASK_OUTPUTS = {
     #     "text": "《父老乡亲》是由是由由中国人民解放军海政文工团创作的军旅歌曲，石顺义作词，王锡仁作曲，范琳琳演唱",
     #     "spo_list": [{"subject": "石顺义", "predicate": "国籍", "object": "中国"}]
     # }
-    Tasks.relation_extraction:
-    [OutputKeys.UUID, OutputKeys.TEXT, OutputKeys.SPO_LIST],
+    Tasks.relation_extraction: [OutputKeys.SPO_LIST],
 
     # translation result for a source sentence
     #   {
@@ -285,26 +377,20 @@ TASK_OUTPUTS = {
     # word segmentation result for single sample
     # {
     #   "output": "今天 天气 不错 ， 适合 出去 游玩"
-    # }
-    Tasks.word_segmentation: [OutputKeys.OUTPUT],
-
-    # part-of-speech result for single sample
-    # [
-    #     {'word': '诸葛', 'label': 'PROPN'},
-    #     {'word': '亮', 'label': 'PROPN'},
-    #     {'word': '发明', 'label': 'VERB'},
-    #     {'word': '八', 'label': 'NUM'},
-    #     {'word': '阵', 'label': 'NOUN'},
-    #     {'word': '图', 'label': 'PART'},
-    #     {'word': '以', 'label': 'ADV'},
-    #     {'word': '利', 'label': 'VERB'},
-    #     {'word': '立营', 'label': 'VERB'},
-    #     {'word': '练兵', 'label': 'VERB'},
-    #     {'word': '.', 'label': 'PUNCT'}
+    #   "labels": [
+    #     {'word': '今天', 'label': 'PROPN'},
+    #     {'word': '天气', 'label': 'PROPN'},
+    #     {'word': '不错', 'label': 'VERB'},
+    #     {'word': ',', 'label': 'NUM'},
+    #     {'word': '适合', 'label': 'NOUN'},
+    #     {'word': '出去', 'label': 'PART'},
+    #     {'word': '游玩', 'label': 'ADV'},
     # ]
-    # TODO @wenmeng.zwm support list of result check
-    Tasks.part_of_speech: [OutputKeys.WORD, OutputKeys.LABEL],
+    # }
+    Tasks.word_segmentation: [OutputKeys.OUTPUT, OutputKeys.LABELS],
+    Tasks.part_of_speech: [OutputKeys.OUTPUT, OutputKeys.LABELS],
 
+    # TODO @wenmeng.zwm support list of result check
     # named entity recognition result for single sample
     # {
     #   "output": [
@@ -319,6 +405,8 @@ TASK_OUTPUTS = {
     #    "output": "我想吃苹果"
     # }
     Tasks.text_error_correction: [OutputKeys.OUTPUT],
+    Tasks.sentence_embedding: [OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES],
+    Tasks.passage_ranking: [OutputKeys.SCORES],
 
     # text generation result for single sample
     # {
@@ -326,11 +414,11 @@ TASK_OUTPUTS = {
     # }
     Tasks.text_generation: [OutputKeys.TEXT],
 
-    # text feature extraction for single sample
+    # text2text generation result for single sample
     # {
-    #   "text_embedding": np.array with shape [1, D]
+    #   "text": "北京"
     # }
-    Tasks.sentence_embedding: [OutputKeys.TEXT_EMBEDDING],
+    Tasks.text2text_generation: [OutputKeys.TEXT],
 
     # fill mask result for single sample
     # {
@@ -338,6 +426,22 @@ TASK_OUTPUTS = {
     # }
     Tasks.fill_mask: [OutputKeys.TEXT],
 
+    # feature extraction result for single sample
+    # {
+    #   "text_embedding": [[
+    #     [1.08599677e-04, 1.72710388e-05, 2.95618793e-05, 1.93638436e-04],
+    #     [6.45841064e-05, 1.15997791e-04, 5.11605394e-05, 9.87020373e-01],
+    #     [2.66957268e-05, 4.72324500e-05, 9.74208378e-05, 4.18022355e-05]
+    #   ],
+    #   [
+    #     [2.97343540e-05, 5.81317654e-05, 5.44203431e-05, 6.28319322e-05],
+    #     [8.24327726e-05, 4.66077945e-05, 5.32869453e-05, 4.16190960e-05],
+    #     [3.61441926e-05, 3.38475402e-05, 3.44323053e-05, 5.70138109e-05]
+    #   ]
+    # ]
+    # }
+    Tasks.feature_extraction: [OutputKeys.TEXT_EMBEDDING],
+
     # (Deprecated) dialog intent prediction result for single sample
     # {'output': {'prediction': array([2.62349960e-03, 4.12110658e-03, 4.12748595e-05, 3.77560973e-05,
     #        1.08599677e-04, 1.72710388e-05, 2.95618793e-05, 1.93638436e-04,
@@ -408,6 +512,13 @@ TASK_OUTPUTS = {
     # }
     Tasks.conversational_text_to_sql: [OutputKeys.TEXT],
 
+    # table-question-answering result for single sample
+    # {
+    #   "output": "SELECT shop.Name FROM shop.",
+    #   "history": {sel: 0, agg: 0, conds: [[0, 0, 'val']]}
+    # }
+    Tasks.table_question_answering: [OutputKeys.OUTPUT, OutputKeys.HISTORY],
+
     # ============ audio tasks ===================
     # asr result for single sample
     # { "text": "每一天都要快乐喔"}
@@ -488,6 +599,15 @@ TASK_OUTPUTS = {
     Tasks.generative_multi_modal_embedding:
     [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.CAPTION],
 
+    # multi-modal similarity result for single sample
+    # {
+    #   "img_embedding": np.array with shape [1, D],
+    #   "text_embedding": np.array with shape [1, D],
+    #   "scores": float  # similarity of the image and the text
+    # }
+    Tasks.multi_modal_similarity:
+    [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES],
+
     # VQA result for a sample
     # {"text": "this is a text answser. "}
     Tasks.visual_question_answering: [OutputKeys.TEXT],
@@ -504,9 +624,62 @@ TASK_OUTPUTS = {
     # }
     Tasks.visual_entailment: [OutputKeys.SCORES, OutputKeys.LABELS],
 
+    # {
+    #     'labels': ['吸烟', '打电话', '吸烟'],
+    #     'scores': [0.7527753114700317, 0.753358006477356, 0.6880350708961487],
+    #     'boxes': [[547, 2, 1225, 719], [529, 8, 1255, 719], [584, 0, 1269, 719]],
+    #     'timestamps': [1, 3, 5]
+    # }
+    Tasks.action_detection: [
+        OutputKeys.TIMESTAMPS,
+        OutputKeys.LABELS,
+        OutputKeys.SCORES,
+        OutputKeys.BOXES,
+    ],
+
+    # {
+    #   'output': [
+    #     [{'label': '6527856', 'score': 0.9942756295204163}, {'label': '1000012000', 'score': 0.0379515215754509},
+    #      {'label': '13421097', 'score': 2.2825044965202324e-08}],
+    #     [{'label': '1000012000', 'score': 0.910681426525116}, {'label': '6527856', 'score': 0.0005046309670433402},
+    #      {'label': '13421097', 'score': 2.75914817393641e-06}],
+    #     [{'label': '1000012000', 'score': 0.910681426525116}, {'label': '6527856', 'score': 0.0005046309670433402},
+    #      {'label': '13421097', 'score': 2.75914817393641e-06}]]
+    # }
+    Tasks.faq_question_answering: [OutputKeys.OUTPUT],
+
     # image person reid result for single sample
     #   {
     #       "img_embedding": np.array with shape [1, D],
     #   }
     Tasks.image_reid_person: [OutputKeys.IMG_EMBEDDING],
+
+    # {
+    #     'output': ['Done' / 'Decode_Error']
+    # }
+    Tasks.video_inpainting: [OutputKeys.OUTPUT],
+
+    # {
+    #     'output': ['bixin']
+    # }
+    Tasks.hand_static: [OutputKeys.OUTPUT],
+
+    # {
+    #     'output': [[2, 75, 287, 240, 510, 0.8335018754005432],
+    #                [1, 127, 83, 332, 366, 0.9175254702568054],
+    #                [0, 0, 0, 367, 639, 0.9693422317504883]]
+    # }
+    Tasks.face_human_hand_detection: [OutputKeys.OUTPUT],
+
+    # {
+    #     'output': 'Happiness', 'boxes': (203, 104, 663, 564)
+    # }
+    Tasks.face_emotion: [OutputKeys.OUTPUT, OutputKeys.BOXES],
+
+    # {
+    #     "masks": [
+    #           np.array # 2D array containing only 0, 255
+    #       ]
+    # }
+    Tasks.product_segmentation: [OutputKeys.MASKS],
 }
diff --git a/modelscope/pipelines/audio/ans_pipeline.py b/modelscope/pipelines/audio/ans_pipeline.py
index 410a7cb5..e55f613e 100644
--- a/modelscope/pipelines/audio/ans_pipeline.py
+++ b/modelscope/pipelines/audio/ans_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import io
 from typing import Any, Dict
 
@@ -6,25 +8,15 @@ import numpy as np
 import soundfile as sf
 import torch
 
+from modelscope.fileio import File
 from modelscope.metainfo import Pipelines
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.audio.audio_utils import audio_norm
 from modelscope.utils.constant import Tasks
 
 
-def audio_norm(x):
-    rms = (x**2).mean()**0.5
-    scalar = 10**(-25 / 20) / rms
-    x = x * scalar
-    pow_x = x**2
-    avg_pow_x = pow_x.mean()
-    rmsx = pow_x[pow_x > avg_pow_x].mean()**0.5
-    scalarx = 10**(-25 / 20) / rmsx
-    x = x * scalarx
-    return x
-
-
 @PIPELINES.register_module(
     Tasks.acoustic_noise_suppression,
     module_name=Pipelines.speech_frcrn_ans_cirm_16k)
@@ -45,11 +37,12 @@ class ANSPipeline(Pipeline):
         super().__init__(model=model, **kwargs)
         self.model.eval()
 
-    def preprocess(self, inputs: Input) -> Dict[str, Any]:
+    def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]:
         if isinstance(inputs, bytes):
             data1, fs = sf.read(io.BytesIO(inputs))
         elif isinstance(inputs, str):
-            data1, fs = sf.read(inputs)
+            file_bytes = File.read(inputs)
+            data1, fs = sf.read(io.BytesIO(file_bytes))
         else:
             raise TypeError(f'Unsupported type {type(inputs)}.')
         if len(data1.shape) > 1:
@@ -61,7 +54,8 @@ class ANSPipeline(Pipeline):
         inputs = np.reshape(data, [1, data.shape[0]])
         return {'ndarray': inputs, 'nsamples': data.shape[0]}
 
-    def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
         ndarray = inputs['ndarray']
         if isinstance(ndarray, torch.Tensor):
             ndarray = ndarray.cpu().numpy()
@@ -98,7 +92,8 @@ class ANSPipeline(Pipeline):
                 current_idx = 0
                 while current_idx + window <= t:
                     print('current_idx: {}'.format(current_idx))
-                    tmp_input = ndarray[:, current_idx:current_idx + window]
+                    tmp_input = dict(noisy=ndarray[:, current_idx:current_idx
+                                                   + window])
                     tmp_output = self.model(
                         tmp_input, )['wav_l2'][0].cpu().numpy()
                     end_index = current_idx + window - give_up_length
@@ -111,7 +106,8 @@ class ANSPipeline(Pipeline):
                                     give_up_length:-give_up_length]
                     current_idx += stride
             else:
-                outputs = self.model(ndarray)['wav_l2'][0].cpu().numpy()
+                outputs = self.model(
+                    dict(noisy=ndarray))['wav_l2'][0].cpu().numpy()
         outputs = (outputs[:nsamples] * 32768).astype(np.int16).tobytes()
         return {OutputKeys.OUTPUT_PCM: outputs}
 
diff --git a/modelscope/pipelines/audio/asr_inference_pipeline.py b/modelscope/pipelines/audio/asr_inference_pipeline.py
index b321b770..282d1184 100644
--- a/modelscope/pipelines/audio/asr_inference_pipeline.py
+++ b/modelscope/pipelines/audio/asr_inference_pipeline.py
@@ -1,4 +1,3 @@
-import os
 from typing import Any, Dict, List, Sequence, Tuple, Union
 
 import yaml
@@ -9,6 +8,8 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import WavToScp
+from modelscope.utils.audio.audio_utils import (extract_pcm_from_wav,
+                                                load_bytes_from_url)
 from modelscope.utils.constant import Frameworks, Tasks
 from modelscope.utils.logger import get_logger
 
@@ -41,12 +42,20 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
 
         self.recog_type = recog_type
         self.audio_format = audio_format
-        self.audio_in = audio_in
         self.audio_fs = audio_fs
 
+        if isinstance(audio_in, str):
+            # load pcm data from url if audio_in is url str
+            self.audio_in = load_bytes_from_url(audio_in)
+        elif isinstance(audio_in, bytes):
+            # load pcm data from wav data if audio_in is wave format
+            self.audio_in = extract_pcm_from_wav(audio_in)
+        else:
+            self.audio_in = audio_in
+
         if recog_type is None or audio_format is None:
             self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking(
-                audio_in=audio_in,
+                audio_in=self.audio_in,
                 recog_type=recog_type,
                 audio_format=audio_format)
 
diff --git a/modelscope/pipelines/audio/kws_farfield_pipeline.py b/modelscope/pipelines/audio/kws_farfield_pipeline.py
index a114e7fb..e2f618fa 100644
--- a/modelscope/pipelines/audio/kws_farfield_pipeline.py
+++ b/modelscope/pipelines/audio/kws_farfield_pipeline.py
@@ -1,7 +1,13 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import io
 import wave
 from typing import Any, Dict
 
+import numpy
+import soundfile as sf
+
+from modelscope.fileio import File
 from modelscope.metainfo import Pipelines
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
@@ -34,11 +40,12 @@ class KWSFarfieldPipeline(Pipeline):
         self.model.eval()
         frame_size = self.INPUT_CHANNELS * self.SAMPLE_WIDTH
         self._nframe = self.model.size_in // frame_size
-        self.frame_count = 0
 
     def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]:
         if isinstance(inputs, bytes):
             return dict(input_file=inputs)
+        elif isinstance(inputs, str):
+            return dict(input_file=inputs)
         elif isinstance(inputs, Dict):
             return inputs
         else:
@@ -47,35 +54,59 @@ class KWSFarfieldPipeline(Pipeline):
     def forward(self, inputs: Dict[str, Any],
                 **forward_params) -> Dict[str, Any]:
+        """Decode the input audio and run keyword spotting over it.
+
+        Args:
+            inputs: Dict with an 'input_file' entry (audio bytes, or a str
+                that is loaded through File.read) and, optionally, an
+                'output_file' entry naming a wav file to write.
+
+        Returns:
+            Dict mapping OutputKeys.KWS_LIST to the collected 'kws' results.
+        """
         input_file = inputs['input_file']
-        if isinstance(input_file, bytes):
-            input_file = io.BytesIO(input_file)
-        self.frame_count = 0
+        if isinstance(input_file, str):
+            input_file = File.read(input_file)
+        frames, samplerate = sf.read(io.BytesIO(input_file), dtype='int16')
+        # 1-D (single channel) audio is expanded to three columns: two copies
+        # of the signal and one all-zero column.
+        if len(frames.shape) == 1:
+            frames = numpy.stack((frames, frames, numpy.zeros_like(frames)), 1)
+
         kws_list = []
-        with wave.open(input_file, 'rb') as fin:
-            if 'output_file' in inputs:
-                with wave.open(inputs['output_file'], 'wb') as fout:
-                    fout.setframerate(self.SAMPLE_RATE)
-                    fout.setnchannels(self.OUTPUT_CHANNELS)
-                    fout.setsampwidth(self.SAMPLE_WIDTH)
-                    self._process(fin, kws_list, fout)
-            else:
-                self._process(fin, kws_list)
+        if 'output_file' in inputs:
+            with wave.open(inputs['output_file'], 'wb') as fout:
+                fout.setframerate(self.SAMPLE_RATE)
+                fout.setnchannels(self.OUTPUT_CHANNELS)
+                fout.setsampwidth(self.SAMPLE_WIDTH)
+                self._process(frames, kws_list, fout)
+        else:
+            self._process(frames, kws_list)
         return {OutputKeys.KWS_LIST: kws_list}
 
     def _process(self,
-                 fin: wave.Wave_read,
+                 frames: numpy.ndarray,
                  kws_list,
                  fout: wave.Wave_write = None):
+        """Feed the frames to the model in blocks of self._nframe rows.
+
+        Args:
+            frames: Audio samples indexed as (frame, channel).
+            kws_list: List the detected 'kws' results are appended to.
+            fout: Optional wave writer that receives each block's 'pcm'.
+        """
-        data = fin.readframes(self._nframe)
-        while len(data) >= self.model.size_in:
-            self.frame_count += self._nframe
+        for start_index in range(0, frames.shape[0], self._nframe):
+            end_index = start_index + self._nframe
+            # The last block may hold fewer than self._nframe frames.
+            if end_index > frames.shape[0]:
+                end_index = frames.shape[0]
+            data = frames[start_index:end_index, :].tobytes()
             result = self.model.forward_decode(data)
             if fout:
                 fout.writeframes(result['pcm'])
             if 'kws' in result:
-                result['kws']['offset'] += self.frame_count / self.SAMPLE_RATE
+                # Add the start time of this block (in seconds) to the offset.
+                result['kws']['offset'] += start_index / self.SAMPLE_RATE
                 kws_list.append(result['kws'])
-            data = fin.readframes(self._nframe)
 
     def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
         return inputs
diff --git a/modelscope/pipelines/audio/kws_kwsbp_pipeline.py b/modelscope/pipelines/audio/kws_kwsbp_pipeline.py
index 1f31766a..866b8d0b 100644
--- a/modelscope/pipelines/audio/kws_kwsbp_pipeline.py
+++ b/modelscope/pipelines/audio/kws_kwsbp_pipeline.py
@@ -8,6 +8,8 @@ from modelscope.models import Model
 from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import WavToLists
+from modelscope.utils.audio.audio_utils import (extract_pcm_from_wav,
+                                                load_bytes_from_url)
 from modelscope.utils.constant import Tasks
 from modelscope.utils.logger import get_logger
 
@@ -40,6 +42,13 @@ class KeyWordSpottingKwsbpPipeline(Pipeline):
         if self.preprocessor is None:
             self.preprocessor = WavToLists()
 
+        if isinstance(audio_in, str):
+            # load pcm data from url if audio_in is url str
+            audio_in = load_bytes_from_url(audio_in)
+        elif isinstance(audio_in, bytes):
+            # load pcm data from wav data if audio_in is wave format
+            audio_in = extract_pcm_from_wav(audio_in)
+
         output = self.preprocessor.forward(self.model.forward(), audio_in)
         output = self.forward(output)
         rst = self.postprocess(output)
diff --git a/modelscope/pipelines/audio/linear_aec_pipeline.py b/modelscope/pipelines/audio/linear_aec_pipeline.py
index b59bc475..e1e75ddb 100644
--- a/modelscope/pipelines/audio/linear_aec_pipeline.py
+++ b/modelscope/pipelines/audio/linear_aec_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import importlib
 import os
 from typing import Any, Dict
@@ -51,7 +53,7 @@ class LinearAECPipeline(Pipeline):
 
     When invoke the class with pipeline.__call__(), you should provide two params:
         Dict[str, Any]
-            the path of wav files，eg:{
+            the path of wav files, eg:{
             "nearend_mic": "/your/data/near_end_mic_audio.wav",
             "farend_speech": "/your/data/far_end_speech_audio.wav"}
         output_path (str, optional): "/your/output/audio_after_aec.wav"
diff --git a/modelscope/pipelines/audio/text_to_speech_pipeline.py b/modelscope/pipelines/audio/text_to_speech_pipeline.py
index f9e7d80a..2063da68 100644
--- a/modelscope/pipelines/audio/text_to_speech_pipeline.py
+++ b/modelscope/pipelines/audio/text_to_speech_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, List
 
 import numpy as np
@@ -42,3 +44,12 @@ class TextToSpeechSambertHifiganPipeline(Pipeline):
 
     def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]:
         return inputs
+
+    def _sanitize_parameters(self, **pipeline_parameters):
+        """Return every keyword argument in the middle slot of a 3-tuple.
+
+        The first and last slots are always empty dicts.
+        NOTE(review): presumably the slots are the preprocess, forward and
+        postprocess params, so everything goes to forward - confirm.
+        """
+        return {}, pipeline_parameters, {}
diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py
index 180ad757..c5db2b57 100644
--- a/modelscope/pipelines/base.py
+++ b/modelscope/pipelines/base.py
@@ -1,8 +1,10 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+import os
 import os.path as osp
 from abc import ABC, abstractmethod
-from contextlib import contextmanager
+from functools import partial
+from multiprocessing import Pool
 from threading import Lock
 from typing import Any, Dict, Generator, List, Mapping, Union
 
@@ -16,15 +18,17 @@ from modelscope.utils.config import Config
 from modelscope.utils.constant import Frameworks, ModelFile
 from modelscope.utils.device import (create_device, device_placement,
                                      verify_device)
+from modelscope.utils.hub import read_config, snapshot_download
 from modelscope.utils.import_utils import is_tf_available, is_torch_available
 from modelscope.utils.logger import get_logger
+from modelscope.utils.torch_utils import _find_free_port, _is_free_port
 from .util import is_model, is_official_hub_path
 
 if is_torch_available():
     import torch
 
 if is_tf_available():
-    import tensorflow as tf
+    pass
 
 Tensor = Union['torch.Tensor', 'tf.Tensor']
 Input = Union[str, tuple, MsDataset, 'Image.Image', 'numpy.ndarray']
@@ -200,49 +204,12 @@ class Pipeline(ABC):
             yield self._process_single(ele, *args, **kwargs)
 
     def _collate_fn(self, data):
-        """Prepare the input just before the forward function.
-        This method will move the tensors to the right device.
-        Usually this method does not need to be overridden.
-
-        Args:
-            data: The data out of the dataloader.
-
-        Returns: The processed data.
-
-        """
-        from torch.utils.data.dataloader import default_collate
-        from modelscope.preprocessors import InputFeatures
-        if isinstance(data, dict) or isinstance(data, Mapping):
-            return type(data)(
-                {k: self._collate_fn(v)
-                 for k, v in data.items()})
-        elif isinstance(data, (tuple, list)):
-            if isinstance(data[0], (int, float)):
-                return default_collate(data).to(self.device)
-            else:
-                return type(data)(self._collate_fn(v) for v in data)
-        elif isinstance(data, np.ndarray):
-            if data.dtype.type is np.str_:
-                return data
-            else:
-                return self._collate_fn(torch.from_numpy(data))
-        elif isinstance(data, torch.Tensor):
-            return data.to(self.device)
-        elif isinstance(data, (bytes, str, int, float, bool, type(None))):
-            return data
-        elif isinstance(data, InputFeatures):
-            return data
-        else:
-            import mmcv
-            if isinstance(data, mmcv.parallel.data_container.DataContainer):
-                return data
-            else:
-                raise ValueError(f'Unsupported data type {type(data)}')
+        return collate_fn(data, self.device)
 
     def _process_single(self, input: Input, *args, **kwargs) -> Dict[str, Any]:
-        preprocess_params = kwargs.get('preprocess_params')
-        forward_params = kwargs.get('forward_params')
-        postprocess_params = kwargs.get('postprocess_params')
+        preprocess_params = kwargs.get('preprocess_params', {})
+        forward_params = kwargs.get('forward_params', {})
+        postprocess_params = kwargs.get('postprocess_params', {})
 
         out = self.preprocess(input, **preprocess_params)
         with device_placement(self.framework, self.device_name):
@@ -303,3 +270,174 @@ class Pipeline(ABC):
                 output should have the standard output name.
         """
         raise NotImplementedError('postprocess')
+
+
+class DistributedPipeline(Pipeline):
+    """This pipeline is used to load multi gpu models.
+
+    What will this class do:
+    1. Read the global config from the configuration.json
+    2. Set the multiprocessing method to spawn
+    3. Open a multiprocessing pool of the world_size to instantiate model pieces.
+    4. Set the master port and ip
+    5. Call _instantiate_one to instantiate one model piece
+        This method should be implemented by the derived class.
+    6. After the forward method is called, do preprocess in main process
+        and call _forward_one to collect results, and do
+        post process in main process.
+
+    NOTE: _instantiate_one and _forward_one are class methods, any derived class should implement them and
+    store the model handler in the class field.
+    """
+
+    def __init__(self,
+                 model: str = None,
+                 preprocessor: Union[Preprocessor, List[Preprocessor]] = None,
+                 auto_collate=True,
+                 **kwargs):
+        """Start the worker pool and instantiate one model piece per rank.
+
+        Args:
+            model: A local model directory, or a model id that is handed to
+                snapshot_download when no such path exists.
+            preprocessor: The preprocessor(s) stored on the pipeline.
+            auto_collate: Stored as self._auto_collate.
+            **kwargs: Extra arguments forwarded to _instantiate_one.
+                `master_ip` (default '127.0.0.1') and `master_port` (default
+                '29500') select the master address; if the port is not free,
+                a free one is used instead.
+
+        Raises:
+            TypeError: If model is None.
+        """
+        self.preprocessor = preprocessor
+        self._model_prepare = False
+        self._model_prepare_lock = Lock()
+        self._auto_collate = auto_collate
+
+        if model is None:
+            # os.path.exists(None) would raise an unhelpful TypeError.
+            raise TypeError(
+                'model should be a local directory or a model id, got None')
+        if os.path.exists(model):
+            self.model_dir = model
+        else:
+            self.model_dir = snapshot_download(model)
+        self.cfg = read_config(self.model_dir)
+        self.world_size = self.cfg.model.world_size
+        self.model_pool = None
+        self.device_name = 'cpu'
+        self.device = create_device(self.device_name)
+        self.has_multiple_models = False
+        self.framework = self.cfg.framework
+        if torch.multiprocessing.get_start_method(allow_none=True) is None:
+            torch.multiprocessing.set_start_method('spawn')
+
+        ranks = list(range(self.world_size))
+        self.model_pool = Pool(self.world_size)
+        # Take the master address out of kwargs: passing it both explicitly
+        # and through **kwargs raises "got multiple values for keyword".
+        master_ip = kwargs.pop('master_ip', '127.0.0.1')
+        master_port = str(kwargs.pop('master_port', '29500'))
+        if not _is_free_port(int(master_port)):
+            master_port = str(_find_free_port())
+        # Merge into one dict so that a key present in both cfg.model and
+        # kwargs is overridden by kwargs instead of being passed twice.
+        instantiate_kwargs = {**self.cfg.model, **kwargs}
+        instantiate_kwargs.update(
+            model_dir=self.model_dir,
+            master_ip=master_ip,
+            master_port=master_port)
+        self.model_pool.map(
+            partial(self.__class__._instantiate_one, **instantiate_kwargs),
+            ranks)
+
+    def __del__(self):
+        """Terminate the worker pool, if one was created."""
+        if hasattr(self, 'model_pool') and self.model_pool is not None:
+            self.model_pool.terminate()
+
+    def __getstate__(self):
+        """Return the instance dict without the pool, preprocessor and lock."""
+        self_dict = self.__dict__.copy()
+        del self_dict['model_pool']
+        del self_dict['preprocessor']
+        del self_dict['_model_prepare_lock']
+        return self_dict
+
+    @classmethod
+    def _instantiate_one(cls, rank, model_dir, **kwargs):
+        """Instantiate one model piece.
+
+        @param rank: The model rank.
+        @param model_dir: The model_dir in the node.
+        @param kwargs: Any extra args.
+        @return: None. The model handler should be kept in the class field.
+        """
+        pass
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        """Run _forward_one in every worker and return the first result.
+
+        Each of the world_size workers receives the same dict holding the
+        inputs and the forward params.
+        """
+        inputs = {
+            'inputs': inputs,
+            'forward_params': forward_params,
+        }
+        res = self.model_pool.map(self.__class__._forward_one,
+                                  [inputs] * self.world_size)
+        return res[0]
+
+    @classmethod
+    def _forward_one(cls, inputs):
+        """Forward the inputs to one model piece.
+
+        Use the model handler kept in the class field to forward.
+
+        @param inputs: The inputs after the preprocessing.
+        @return: The forward results.
+        """
+        pass
+
+
+def collate_fn(data, device):
+    """Prepare the input just before the forward function.
+
+    Recursively moves tensors and non-string numpy arrays to `device`.
+
+    Args:
+        data: The data out of the dataloader.
+        device: The device to move data to.
+
+    Returns: The processed data.
+    """
+    from torch.utils.data.dataloader import default_collate
+    from modelscope.preprocessors import InputFeatures
+    if isinstance(data, Mapping):
+        return type(data)({k: collate_fn(v, device) for k, v in data.items()})
+    elif isinstance(data, (tuple, list)):
+        # `data and` keeps an empty sequence from raising IndexError.
+        if data and isinstance(data[0], (int, float)):
+            return default_collate(data).to(device)
+        else:
+            return type(data)(collate_fn(v, device) for v in data)
+    elif isinstance(data, np.ndarray):
+        if data.dtype.type is np.str_:
+            return data
+        else:
+            return collate_fn(torch.from_numpy(data), device)
+    elif isinstance(data, torch.Tensor):
+        return data.to(device)
+    elif isinstance(data, (bytes, str, int, float, bool, type(None))):
+        return data
+    elif isinstance(data, InputFeatures):
+        return data
+    else:
+        import mmcv
+        if isinstance(data, mmcv.parallel.data_container.DataContainer):
+            return data
+        else:
+            raise ValueError(f'Unsupported data type {type(data)}')
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 52dfa41b..7fa66b5f 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -17,12 +17,23 @@ PIPELINES = Registry('pipelines')
 
 DEFAULT_MODEL_FOR_PIPELINE = {
     # TaskName: (pipeline_module_name, model_repo)
+    Tasks.sentence_embedding:
+    (Pipelines.sentence_embedding,
+     'damo/nlp_corom_sentence-embedding_english-base'),
+    Tasks.passage_ranking: (Pipelines.passage_ranking,
+                            'damo/nlp_corom_passage-ranking_english-base'),
     Tasks.word_segmentation:
     (Pipelines.word_segmentation,
      'damo/nlp_structbert_word-segmentation_chinese-base'),
+    Tasks.token_classification:
+    (Pipelines.part_of_speech,
+     'damo/nlp_structbert_part-of-speech_chinese-base'),
     Tasks.named_entity_recognition:
     (Pipelines.named_entity_recognition,
      'damo/nlp_raner_named-entity-recognition_chinese-base-news'),
+    Tasks.information_extraction:
+    (Pipelines.relation_extraction,
+     'damo/nlp_bert_relation-extraction_chinese-base'),
     Tasks.sentence_similarity:
     (Pipelines.sentence_similarity,
      'damo/nlp_structbert_sentence-similarity_chinese-base'),
@@ -41,8 +52,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
                                    'damo/cv_vit_object-detection_coco'),
     Tasks.image_denoising: (Pipelines.image_denoise,
                             'damo/cv_nafnet_image-denoise_sidd'),
-    Tasks.text_classification: (Pipelines.sentiment_analysis,
-                                'damo/bert-base-sst2'),
+    Tasks.text_classification:
+    (Pipelines.sentiment_classification,
+     'damo/nlp_structbert_sentiment-classification_chinese-base'),
     Tasks.text_generation: (Pipelines.text_generation,
                             'damo/nlp_palm2.0_text-generation_chinese-base'),
     Tasks.zero_shot_classification:
@@ -55,6 +67,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
     Tasks.conversational_text_to_sql:
     (Pipelines.conversational_text_to_sql,
      'damo/nlp_star_conversational-text-to-sql'),
+    Tasks.table_question_answering:
+    (Pipelines.table_question_answering_pipeline,
+     'damo/nlp-convai-text2sql-pretrain-cn'),
     Tasks.text_error_correction:
     (Pipelines.text_error_correction,
      'damo/nlp_bart_text-error-correction_chinese'),
@@ -66,8 +81,12 @@ DEFAULT_MODEL_FOR_PIPELINE = {
     Tasks.ocr_detection: (Pipelines.ocr_detection,
                           'damo/cv_resnet18_ocr-detection-line-level_damo'),
     Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'),
+    Tasks.feature_extraction: (Pipelines.feature_extraction,
+                               'damo/pert_feature-extraction_base-test'),
     Tasks.action_recognition: (Pipelines.action_recognition,
                                'damo/cv_TAdaConv_action-recognition'),
+    Tasks.action_detection: (Pipelines.action_detection,
+                             'damo/cv_ResNetC3D_action-detection_detection2d'),
     Tasks.live_category: (Pipelines.live_category,
                           'damo/cv_resnet50_live-category'),
     Tasks.video_category: (Pipelines.video_category,
@@ -79,6 +98,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
     (Pipelines.generative_multi_modal_embedding,
      'damo/multi-modal_gemm-vit-large-patch14_generative-multi-modal-embedding'
      ),
+    Tasks.multi_modal_similarity:
+    (Pipelines.multi_modal_similarity,
+     'damo/multi-modal_team-vit-large-patch14_multi-modal-similarity'),
     Tasks.visual_question_answering:
     (Pipelines.visual_question_answering,
      'damo/mplug_visual-question-answering_coco_large_en'),
@@ -89,10 +111,20 @@ DEFAULT_MODEL_FOR_PIPELINE = {
      'damo/cv_diffusion_text-to-image-synthesis_tiny'),
     Tasks.body_2d_keypoints: (Pipelines.body_2d_keypoints,
                               'damo/cv_hrnetv2w32_body-2d-keypoints_image'),
+    Tasks.body_3d_keypoints: (Pipelines.body_3d_keypoints,
+                              'damo/cv_canonical_body-3d-keypoints_video'),
+    Tasks.hand_2d_keypoints:
+    (Pipelines.hand_2d_keypoints,
+     'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody'),
     Tasks.face_detection: (Pipelines.face_detection,
                            'damo/cv_resnet_facedetection_scrfd10gkps'),
     Tasks.face_recognition: (Pipelines.face_recognition,
                              'damo/cv_ir101_facerecognition_cfglint'),
+    Tasks.facial_expression_recognition:
+    (Pipelines.facial_expression_recognition,
+     'damo/cv_vgg19_facial-expression-recognition_fer'),
+    Tasks.face_2d_keypoints: (Pipelines.face_2d_keypoints,
+                              'damo/cv_mobilenet_face-2d-keypoints_alignment'),
     Tasks.video_multi_modal_embedding:
     (Pipelines.video_multi_modal_embedding,
      'damo/multi_modal_clip_vtretrival_msrvtt_53'),
@@ -129,6 +161,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
      'damo/cv_convnextTiny_ocr-recognition-general_damo'),
     Tasks.skin_retouching: (Pipelines.skin_retouching,
                             'damo/cv_unet_skin-retouching'),
+    Tasks.faq_question_answering:
+    (Pipelines.faq_question_answering,
+     'damo/nlp_structbert_faq-question-answering_chinese-base'),
     Tasks.crowd_counting: (Pipelines.crowd_counting,
                            'damo/cv_hrnet_crowd-counting_dcanet'),
     Tasks.video_single_object_tracking:
@@ -136,6 +171,24 @@ DEFAULT_MODEL_FOR_PIPELINE = {
      'damo/cv_vitb_video-single-object-tracking_ostrack'),
     Tasks.image_reid_person: (Pipelines.image_reid_person,
                               'damo/cv_passvitb_image-reid-person_market'),
+    Tasks.text_driven_segmentation:
+    (Pipelines.text_driven_segmentation,
+     'damo/cv_vitl16_segmentation_text-driven-seg'),
+    Tasks.movie_scene_segmentation:
+    (Pipelines.movie_scene_segmentation,
+     'damo/cv_resnet50-bert_video-scene-segmentation_movienet'),
+    Tasks.shop_segmentation: (Pipelines.shop_segmentation,
+                              'damo/cv_vitb16_segmentation_shop-seg'),
+    Tasks.video_inpainting: (Pipelines.video_inpainting,
+                             'damo/cv_video-inpainting'),
+    Tasks.hand_static: (Pipelines.hand_static,
+                        'damo/cv_mobileface_hand-static'),
+    Tasks.face_human_hand_detection:
+    (Pipelines.face_human_hand_detection,
+     'damo/cv_nanodet_face-human-hand-detection'),
+    Tasks.face_emotion: (Pipelines.face_emotion, 'damo/cv_face-emotion'),
+    Tasks.product_segmentation: (Pipelines.product_segmentation,
+                                 'damo/cv_F3Net_product-segmentation'),
 }
 
 
@@ -218,7 +271,6 @@ def pipeline(task: str = None,
         f'model should be either None, str, List[str], Model, or List[Model], but got {type(model)}'
 
     model = normalize_model_input(model, model_revision)
-
     if pipeline_name is None:
         # get default pipeline for this task
         if isinstance(model, str) \
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index 4ff1b856..55bad09a 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -5,9 +5,13 @@ from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
     from .action_recognition_pipeline import ActionRecognitionPipeline
+    from .action_detection_pipeline import ActionDetectionPipeline
     from .animal_recognition_pipeline import AnimalRecognitionPipeline
     from .body_2d_keypoints_pipeline import Body2DKeypointsPipeline
+    from .body_3d_keypoints_pipeline import Body3DKeypointsPipeline
+    from .hand_2d_keypoints_pipeline import Hand2DKeypointsPipeline
     from .cmdssl_video_embedding_pipeline import CMDSSLVideoEmbeddingPipeline
+    from .hicossl_video_embedding_pipeline import HICOSSLVideoEmbeddingPipeline
     from .crowd_counting_pipeline import CrowdCountingPipeline
     from .image_detection_pipeline import ImageDetectionPipeline
     from .image_salient_detection_pipeline import ImageSalientDetectionPipeline
@@ -23,13 +27,16 @@ if TYPE_CHECKING:
     from .image_denoise_pipeline import ImageDenoisePipeline
     from .image_instance_segmentation_pipeline import ImageInstanceSegmentationPipeline
     from .image_matting_pipeline import ImageMattingPipeline
+    from .image_panoptic_segmentation_pipeline import ImagePanopticSegmentationPipeline
     from .image_portrait_enhancement_pipeline import ImagePortraitEnhancementPipeline
     from .image_reid_person_pipeline import ImageReidPersonPipeline
+    from .image_semantic_segmentation_pipeline import ImageSemanticSegmentationPipeline
     from .image_style_transfer_pipeline import ImageStyleTransferPipeline
     from .image_super_resolution_pipeline import ImageSuperResolutionPipeline
     from .image_to_image_generate_pipeline import Image2ImageGenerationPipeline
     from .image_to_image_translation_pipeline import Image2ImageTranslationPipeline
     from .product_retrieval_embedding_pipeline import ProductRetrievalEmbeddingPipeline
+    from .realtime_object_detection_pipeline import RealtimeObjectDetectionPipeline
     from .live_category_pipeline import LiveCategoryPipeline
     from .ocr_detection_pipeline import OCRDetectionPipeline
     from .ocr_recognition_pipeline import OCRRecognitionPipeline
@@ -37,12 +44,27 @@ if TYPE_CHECKING:
     from .tinynas_classification_pipeline import TinynasClassificationPipeline
     from .video_category_pipeline import VideoCategoryPipeline
     from .virtual_try_on_pipeline import VirtualTryonPipeline
+    from .shop_segmentation_pipleline import ShopSegmentationPipeline
+    from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline
+    from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipeline
+    from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline
+    from .mog_face_detection_pipeline import MogFaceDetectionPipeline
+    from .ulfd_face_detection_pipeline import UlfdFaceDetectionPipeline
+    from .retina_face_detection_pipeline import RetinaFaceDetectionPipeline
+    from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline
+    from .mtcnn_face_detection_pipeline import MtcnnFaceDetectionPipelin
+    from .hand_static_pipeline import HandStaticPipeline
+
 else:
     _import_structure = {
         'action_recognition_pipeline': ['ActionRecognitionPipeline'],
+        'action_detection_pipeline': ['ActionDetectionPipeline'],
         'animal_recognition_pipeline': ['AnimalRecognitionPipeline'],
         'body_2d_keypoints_pipeline': ['Body2DKeypointsPipeline'],
+        'body_3d_keypoints_pipeline': ['Body3DKeypointsPipeline'],
+        'hand_2d_keypoints_pipeline': ['Hand2DKeypointsPipeline'],
         'cmdssl_video_embedding_pipeline': ['CMDSSLVideoEmbeddingPipeline'],
+        'hicossl_video_embedding_pipeline': ['HICOSSLVideoEmbeddingPipeline'],
         'crowd_counting_pipeline': ['CrowdCountingPipeline'],
         'image_detection_pipeline': ['ImageDetectionPipeline'],
         'image_salient_detection_pipeline': ['ImageSalientDetectionPipeline'],
@@ -59,15 +81,21 @@ else:
         'image_instance_segmentation_pipeline':
         ['ImageInstanceSegmentationPipeline'],
         'image_matting_pipeline': ['ImageMattingPipeline'],
+        'image_panoptic_segmentation_pipeline':
+        ['ImagePanopticSegmentationPipeline'],
         'image_portrait_enhancement_pipeline':
         ['ImagePortraitEnhancementPipeline'],
         'image_reid_person_pipeline': ['ImageReidPersonPipeline'],
+        'image_semantic_segmentation_pipeline':
+        ['ImageSemanticSegmentationPipeline'],
         'image_style_transfer_pipeline': ['ImageStyleTransferPipeline'],
         'image_super_resolution_pipeline': ['ImageSuperResolutionPipeline'],
         'image_to_image_translation_pipeline':
         ['Image2ImageTranslationPipeline'],
         'product_retrieval_embedding_pipeline':
         ['ProductRetrievalEmbeddingPipeline'],
+        'realtime_object_detection_pipeline':
+        ['RealtimeObjectDetectionPipeline'],
         'live_category_pipeline': ['LiveCategoryPipeline'],
         'image_to_image_generation_pipeline':
         ['Image2ImageGenerationPipeline'],
@@ -77,6 +105,22 @@ else:
         'tinynas_classification_pipeline': ['TinynasClassificationPipeline'],
         'video_category_pipeline': ['VideoCategoryPipeline'],
         'virtual_try_on_pipeline': ['VirtualTryonPipeline'],
+        'shop_segmentation_pipleline': ['ShopSegmentationPipeline'],
+        'easycv_pipelines': [
+            'EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline',
+            'Face2DKeypointsPipeline'
+        ],
+        'text_driven_segmentation_pipleline':
+        ['TextDrivenSegmentationPipeline'],
+        'movie_scene_segmentation_pipeline':
+        ['MovieSceneSegmentationPipeline'],
+        'mog_face_detection_pipeline': ['MogFaceDetectionPipeline'],
+        'ulfd_face_detection_pipeline': ['UlfdFaceDetectionPipeline'],
+        'retina_face_detection_pipeline': ['RetinaFaceDetectionPipeline'],
+        'facial_expression_recognition_pipeline':
+        ['FacialExpressionRecognitionPipeline'],
+        'mtcnn_face_detection_pipeline': ['MtcnnFaceDetectionPipeline'],
+        'hand_static_pipeline': ['HandStaticPipeline'],
     }
 
     import sys
diff --git a/modelscope/pipelines/cv/action_detection_pipeline.py b/modelscope/pipelines/cv/action_detection_pipeline.py
new file mode 100644
index 00000000..74d1862e
--- /dev/null
+++ b/modelscope/pipelines/cv/action_detection_pipeline.py
@@ -0,0 +1,65 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import math
+import os.path as osp
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.action_detection import ActionDetONNX
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.action_detection, module_name=Pipelines.action_detection)
+class ActionDetectionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        Use `model` to create an action detection pipeline for prediction.
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        model_path = osp.join(self.model, ModelFile.ONNX_MODEL_FILE)
+        logger.info(f'loading model from {model_path}')
+        config_path = osp.join(self.model, ModelFile.CONFIGURATION)
+        logger.info(f'loading config from {config_path}')
+        self.cfg = Config.from_file(config_path)
+        self.cfg.MODEL.model_file = model_path
+        self.model = ActionDetONNX(self.model, self.cfg.MODEL,
+                                   self.device_name)
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        if isinstance(input, str):
+            video_name = input
+        else:
+            raise TypeError(f'input should be a str,'
+                            f'  but got {type(input)}')
+        result = {'video_name': video_name}
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        preds = self.model.forward(input['video_name'])
+        labels = sum([pred['actions']['labels'] for pred in preds], [])
+        scores = sum([pred['actions']['scores'] for pred in preds], [])
+        boxes = sum([pred['actions']['boxes'] for pred in preds], [])
+        timestamps = sum([[pred['timestamp']] * len(pred['actions']['labels'])
+                          for pred in preds], [])
+        out = {
+            OutputKeys.TIMESTAMPS: timestamps,
+            OutputKeys.LABELS: labels,
+            OutputKeys.SCORES: scores,
+            OutputKeys.BOXES: boxes
+        }
+        return out
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/cv/action_recognition_pipeline.py b/modelscope/pipelines/cv/action_recognition_pipeline.py
index 087548f0..993a32f0 100644
--- a/modelscope/pipelines/cv/action_recognition_pipeline.py
+++ b/modelscope/pipelines/cv/action_recognition_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import math
 import os.path as osp
 from typing import Any, Dict
@@ -5,7 +7,8 @@ from typing import Any, Dict
 import torch
 
 from modelscope.metainfo import Pipelines
-from modelscope.models.cv.action_recognition import BaseVideoModel
+from modelscope.models.cv.action_recognition import (BaseVideoModel,
+                                                     PatchShiftTransformer)
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
@@ -33,6 +36,7 @@ class ActionRecognitionPipeline(Pipeline):
         config_path = osp.join(self.model, ModelFile.CONFIGURATION)
         logger.info(f'loading config from {config_path}')
         self.cfg = Config.from_file(config_path)
+
         self.infer_model = BaseVideoModel(cfg=self.cfg).to(self.device)
         self.infer_model.eval()
         self.infer_model.load_state_dict(
@@ -66,3 +70,54 @@ class ActionRecognitionPipeline(Pipeline):
 
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
         return inputs
+
+
+@PIPELINES.register_module(
+    Tasks.action_recognition, module_name=Pipelines.pst_action_recognition)
+class PSTActionRecognitionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a PST action recognition pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE)
+        logger.info(f'loading model from {model_path}')
+        config_path = osp.join(self.model, ModelFile.CONFIGURATION)
+        logger.info(f'loading config from {config_path}')
+        self.cfg = Config.from_file(config_path)
+        self.infer_model = PatchShiftTransformer(model).to(self.device)
+        self.infer_model.eval()
+        self.infer_model.load_state_dict(
+            torch.load(model_path, map_location=self.device)['state_dict'])
+        self.label_mapping = self.cfg.label_mapping
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        if isinstance(input, str):
+            video_input_data = ReadVideoData(self.cfg, input).to(self.device)
+        else:
+            raise TypeError(f'input should be a str,'
+                            f'  but got {type(input)}')
+        result = {'video_data': video_input_data}
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        pred = self.perform_inference(input['video_data'])
+        output_label = self.label_mapping[str(pred)]
+        return {OutputKeys.LABELS: output_label}
+
+    @torch.no_grad()
+    def perform_inference(self, data, max_bsz=4):
+        iter_num = math.ceil(data.size(0) / max_bsz)
+        preds_list = []
+        for i in range(iter_num):
+            preds_list.append(
+                self.infer_model(data[i * max_bsz:(i + 1) * max_bsz]))
+        pred = torch.cat(preds_list, dim=0)
+        return pred.mean(dim=0).argmax().item()
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/cv/animal_recognition_pipeline.py b/modelscope/pipelines/cv/animal_recognition_pipeline.py
index 18cba92c..fad14680 100644
--- a/modelscope/pipelines/cv/animal_recognition_pipeline.py
+++ b/modelscope/pipelines/cv/animal_recognition_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py
index f9ae4b2c..d6afbae4 100644
--- a/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py
+++ b/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os.path as osp
 from typing import Any, Dict, List, Union
 
@@ -76,8 +78,11 @@ class Body2DKeypointsPipeline(Pipeline):
             }
 
         poses, scores, boxes = self.keypoint_model.postprocess(input)
+        result_boxes = []
+        for box in boxes:
+            result_boxes.append([box[0][0], box[0][1], box[1][0], box[1][1]])
         return {
-            OutputKeys.BOXES: boxes,
+            OutputKeys.BOXES: result_boxes,
             OutputKeys.POSES: poses,
             OutputKeys.SCORES: scores
         }
diff --git a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py
new file mode 100644
index 00000000..474c0e54
--- /dev/null
+++ b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py
@@ -0,0 +1,362 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import datetime
+import os.path as osp
+import tempfile
+from typing import Any, Dict, List, Union
+
+import cv2
+import matplotlib
+import matplotlib.pyplot as plt
+import mpl_toolkits.mplot3d.axes3d as p3
+import numpy as np
+import torch
+from matplotlib import animation
+from matplotlib.animation import writers
+from matplotlib.ticker import MultipleLocator
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.body_3d_keypoints.body_3d_pose import (
+    BodyKeypointsDetection3D, KeypointsTypes)
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Input, Model, Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+matplotlib.use('Agg')
+
+logger = get_logger()
+
+
+def convert_2_h36m(joints, joints_nbr=15):
+    lst_mappings = [[0, 8], [1, 7], [2, 12], [3, 13], [4, 14], [5, 9], [6, 10],
+                    [7, 11], [8, 1], [9, 2], [10, 3], [11, 4], [12, 5],
+                    [13, 6], [14, 0]]
+    nbr, dim = joints.shape
+    h36m_joints = np.zeros((nbr, dim))
+    for mapping in lst_mappings:
+        h36m_joints[mapping[1]] = joints[mapping[0]]
+
+    if joints_nbr == 17:
+        lst_mappings_17 = np.array([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4],
+                                    [5, 5], [6, 6], [7, 8], [8, 10], [9, 11],
+                                    [10, 12], [11, 13], [12, 14], [13, 15],
+                                    [14, 16]])
+        h36m_joints_17 = np.zeros((17, 2))
+        h36m_joints_17[lst_mappings_17[:, 1]] = h36m_joints[lst_mappings_17[:,
+                                                                            0]]
+        h36m_joints_17[7] = (h36m_joints_17[0] + h36m_joints_17[8]) * 0.5
+        h36m_joints_17[9] = (h36m_joints_17[8] + h36m_joints_17[10]) * 0.5
+        h36m_joints = h36m_joints_17
+
+    return h36m_joints
+
+
+def smooth_pts(cur_pts, pre_pts, bbox, smooth_x=15.0, smooth_y=15.0):
+    if pre_pts is None:
+        return cur_pts
+    # bbox holds two corner points; their difference is (width, height).
+    w, h = bbox[1] - bbox[0]
+    if w == 0 or h == 0:
+        return cur_pts
+
+    size_pre = len(pre_pts)
+    size_cur = len(cur_pts)
+    if (size_pre == 0 or size_cur == 0):
+        return cur_pts
+    # x motion is normalised by box width, y motion by box height.
+    factor_x = -(smooth_x / w)
+    factor_y = -(smooth_y / h)
+
+    for i in range(size_cur):
+        w_x = np.exp(factor_x * np.abs(cur_pts[i][0] - pre_pts[i][0]))
+        w_y = np.exp(factor_y * np.abs(cur_pts[i][1] - pre_pts[i][1]))
+        cur_pts[i][0] = (1.0 - w_x) * cur_pts[i][0] + w_x * pre_pts[i][0]
+        cur_pts[i][1] = (1.0 - w_y) * cur_pts[i][1] + w_y * pre_pts[i][1]
+    return cur_pts
+
+
+def smoothing(lst_kps, lst_bboxes, smooth_x=15.0, smooth_y=15.0):
+    assert lst_kps.shape[0] == lst_bboxes.shape[0]
+
+    lst_smoothed_kps = []
+    prev_pts = None
+    for i in range(lst_kps.shape[0]):
+        smoothed_cur_kps = smooth_pts(lst_kps[i], prev_pts,
+                                      lst_bboxes[i][0:-1].reshape(2, 2),
+                                      smooth_x, smooth_y)
+        lst_smoothed_kps.append(smoothed_cur_kps)
+        prev_pts = smoothed_cur_kps
+
+    return np.array(lst_smoothed_kps)
+
+
+def convert_2_h36m_data(lst_kps, lst_bboxes, joints_nbr=15):
+    lst_kps = lst_kps.squeeze()
+    lst_bboxes = lst_bboxes.squeeze()
+
+    assert lst_kps.shape[0] == lst_bboxes.shape[0]
+
+    lst_kps = smoothing(lst_kps, lst_bboxes)
+
+    keypoints = []
+    for i in range(lst_kps.shape[0]):
+        h36m_joints_2d = convert_2_h36m(lst_kps[i], joints_nbr=joints_nbr)
+        keypoints.append(h36m_joints_2d)
+    return keypoints
+
+
+@PIPELINES.register_module(
+    Tasks.body_3d_keypoints, module_name=Pipelines.body_3d_keypoints)
+class Body3DKeypointsPipeline(Pipeline):
+
+    def __init__(self, model: Union[str, BodyKeypointsDetection3D], **kwargs):
+        """Human body 3D pose estimation.
+
+        Args:
+            model (Union[str, BodyKeypointsDetection3D]): model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+
+        self.keypoint_model_3d = model if isinstance(
+            model, BodyKeypointsDetection3D) else Model.from_pretrained(model)
+        self.keypoint_model_3d.eval()
+
+        # init human body 2D keypoints detection pipeline
+        self.human_body_2d_kps_det_pipeline = 'damo/cv_hrnetv2w32_body-2d-keypoints_image'
+        self.human_body_2d_kps_detector = pipeline(
+            Tasks.body_2d_keypoints,
+            model=self.human_body_2d_kps_det_pipeline,
+            device='gpu' if torch.cuda.is_available() else 'cpu')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        video_url = input.get('input_video')
+        self.output_video_path = input.get('output_video_path')
+        if self.output_video_path is None:
+            self.output_video_path = tempfile.NamedTemporaryFile(
+                suffix='.mp4').name
+        # NOTE(review): only the temp file's path is kept, not the file object.
+        video_frames = self.read_video_frames(video_url)
+        if 0 == len(video_frames):
+            res = {'success': False, 'msg': 'get video frame failed.'}
+            return res
+        # NOTE(review): the [0] lookups below assume a detection in each frame.
+        all_2d_poses = []
+        all_boxes_with_socre = []
+        max_frame = self.keypoint_model_3d.cfg.model.INPUT.MAX_FRAME  # max video frame number to be predicted 3D joints
+        for i, frame in enumerate(video_frames):
+            kps_2d = self.human_body_2d_kps_detector(frame)
+            box = kps_2d['boxes'][
+                0]  # N human boxes per frame, [0] selects the first detected bbox; flattened to [x1, y1, x2, y2] below
+            pose = kps_2d['poses'][0]  # keypoints: [15, 2]
+            score = kps_2d['scores'][0]  # score of the first detected person
+            all_2d_poses.append(pose)
+            all_boxes_with_socre.append(
+                list(np.array(box).reshape(
+                    (-1))) + [score])  # construct to list with shape [5]
+            if (i + 1) >= max_frame:
+                break
+
+        all_2d_poses_np = np.array(all_2d_poses).reshape(
+            (len(all_2d_poses), 15,
+             2))  # 15: 2d keypoints number, 2: keypoint coordinate (x, y)
+        all_boxes_np = np.array(all_boxes_with_socre).reshape(
+            (len(all_boxes_with_socre), 5))  # [x1, y1, x2, y2, score]
+
+        kps_2d_h36m_17 = convert_2_h36m_data(
+            all_2d_poses_np,
+            all_boxes_np,
+            joints_nbr=self.keypoint_model_3d.cfg.model.MODEL.IN_NUM_JOINTS)
+        kps_2d_h36m_17 = np.array(kps_2d_h36m_17)
+        res = {'success': True, 'input_2d_pts': kps_2d_h36m_17}
+        return res
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        if not input['success']:
+            res = {'success': False, 'msg': 'preprocess failed.'}
+            return res
+
+        input_2d_pts = input['input_2d_pts']
+        outputs = self.keypoint_model_3d.preprocess(input_2d_pts)
+        outputs = self.keypoint_model_3d.forward(outputs)
+        res = dict({'success': True}, **outputs)
+        return res
+
+    def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        res = {OutputKeys.POSES: [], OutputKeys.TIMESTAMPS: []}
+
+        if not input['success']:
+            pass
+        else:
+            poses = input[KeypointsTypes.POSES_CAMERA]
+            pred_3d_pose = poses.data.cpu().numpy()[
+                0]  # [frame_num, joint_num, joint_dim]
+
+            if 'render' in self.keypoint_model_3d.cfg.keys():
+                self.render_prediction(pred_3d_pose)
+                res[OutputKeys.OUTPUT_VIDEO] = self.output_video_path
+
+            res[OutputKeys.POSES] = pred_3d_pose
+            res[OutputKeys.TIMESTAMPS] = self.timestamps
+        return res
+
+    def read_video_frames(self, video_url: Union[str, cv2.VideoCapture]):
+        """Read video from local video file or from a video stream URL.
+
+        Args:
+            video_url (str or cv2.VideoCapture): Video path or video stream.
+
+        Raises:
+            Exception: Open video fail.
+
+        Returns:
+            [nd.array]: List of video frames.
+        """
+
+        def timestamp_format(seconds):
+            m, s = divmod(seconds, 60)
+            h, m = divmod(m, 60)
+            time = '%02d:%02d:%06.3f' % (h, m, s)
+            return time
+
+        frames = []
+        self.timestamps = []  # for video render
+        if isinstance(video_url, str):
+            cap = cv2.VideoCapture(video_url)
+            if not cap.isOpened():
+                raise Exception(
+                    'modelscope error: %s cannot be decoded by OpenCV.' %
+                    (video_url))
+        else:
+            cap = video_url
+
+        self.fps = cap.get(cv2.CAP_PROP_FPS)
+        if self.fps is None or self.fps <= 0:
+            raise Exception('modelscope error: %s cannot get video fps info.' %
+                            (video_url))
+
+        max_frame_num = self.keypoint_model_3d.cfg.model.INPUT.MAX_FRAME
+        frame_idx = 0
+        while True:
+            ret, frame = cap.read()
+            if not ret:
+                break
+            self.timestamps.append(
+                timestamp_format(seconds=frame_idx / self.fps))
+            frame_idx += 1
+            frames.append(frame)
+            if frame_idx >= max_frame_num:
+                break
+        cap.release()
+        return frames
+
+    def render_prediction(self, pose3d_cam_rr):
+        """render predict result 3d poses.
+
+        Args:
+            pose3d_cam_rr (nd.array): [frame_num, joint_num, joint_dim], 3d pose joints
+
+        Returns:
+        """
+        frame_num = pose3d_cam_rr.shape[0]
+
+        left_points = [11, 12, 13, 4, 5, 6]  # joints of left body
+        edges = [[0, 1], [0, 4], [0, 7], [1, 2], [4, 5], [5, 6], [2,
+                                                                  3], [7, 8],
+                 [8, 9], [8, 11], [8, 14], [14, 15], [15, 16], [11, 12],
+                 [12, 13], [9, 10]]  # connection between joints
+
+        fig = plt.figure()
+        ax = p3.Axes3D(fig)
+        x_major_locator = MultipleLocator(0.5)
+
+        ax.xaxis.set_major_locator(x_major_locator)
+        ax.yaxis.set_major_locator(x_major_locator)
+        ax.zaxis.set_major_locator(x_major_locator)
+        ax.set_xlabel('X')
+        ax.set_ylabel('Y')
+        ax.set_zlabel('Z')
+        ax.set_xlim(-1, 1)
+        ax.set_ylim(-1, 1)
+        ax.set_zlim(-1, 1)
+        # view direction
+        azim = self.keypoint_model_3d.cfg.render.azim
+        elev = self.keypoint_model_3d.cfg.render.elev
+        ax.view_init(elev, azim)
+
+        # init plot, essentially
+        x = pose3d_cam_rr[0, :, 0]
+        y = pose3d_cam_rr[0, :, 1]
+        z = pose3d_cam_rr[0, :, 2]
+        points, = ax.plot(x, y, z, 'r.')
+
+        def renderBones(xs, ys, zs):
+            """render bones in skeleton
+
+            Args:
+                xs (nd.array): [joint_num, joint_channel]
+                ys (nd.array): [joint_num, joint_channel]
+                zs (nd.array): [joint_num, joint_channel]
+            """
+            bones = {}
+            for idx, edge in enumerate(edges):
+                index1, index2 = edge[0], edge[1]
+                if index1 in left_points:
+                    edge_color = 'red'
+                else:
+                    edge_color = 'blue'
+                connect = ax.plot([xs[index1], xs[index2]],
+                                  [ys[index1], ys[index2]],
+                                  [zs[index1], zs[index2]],
+                                  linewidth=2,
+                                  color=edge_color)  # plot edge
+                bones[idx] = connect[0]
+            return bones
+
+        bones = renderBones(x, y, z)
+
+        def update(frame_idx, points, bones):
+            """Redraw the skeleton for one animation frame.
+
+            Args:
+                frame_idx (int): frame index
+                points (mpl_toolkits.mplot3d.art3d.Line3D): skeleton points plotter
+                bones (dict[int, mpl_toolkits.mplot3d.art3d.Line3D]): connection plotter
+
+            Returns:
+                tuple: points and bones plotter
+            """
+            xs = pose3d_cam_rr[frame_idx, :, 0]
+            ys = pose3d_cam_rr[frame_idx, :, 1]
+            zs = pose3d_cam_rr[frame_idx, :, 2]
+
+            # update bones
+            for idx, edge in enumerate(edges):
+                index1, index2 = edge[0], edge[1]
+                x1x2 = (xs[index1], xs[index2])
+                y1y2 = (ys[index1], ys[index2])
+                z1z2 = (zs[index1], zs[index2])
+                bones[idx].set_xdata(x1x2)
+                bones[idx].set_ydata(y1y2)
+                bones[idx].set_3d_properties(z1z2, 'z')
+
+            # update joints
+            points.set_data(xs, ys)
+            points.set_3d_properties(zs, 'z')
+            if 0 == frame_idx % 100:  # log progress every 100 frames
+                logger.info(f'rendering {frame_idx}/{frame_num}')
+            return points, bones
+
+        ani = animation.FuncAnimation(
+            fig=fig,
+            func=update,
+            frames=frame_num,
+            interval=self.fps,
+            fargs=(points, bones))
+
+        # save mp4
+        Writer = writers['ffmpeg']
+        writer = Writer(fps=self.fps, metadata={}, bitrate=4096)
+        ani.save(self.output_video_path, writer=writer)
diff --git a/modelscope/pipelines/cv/cmdssl_video_embedding_pipeline.py b/modelscope/pipelines/cv/cmdssl_video_embedding_pipeline.py
index 9f4e2d93..deb17561 100644
--- a/modelscope/pipelines/cv/cmdssl_video_embedding_pipeline.py
+++ b/modelscope/pipelines/cv/cmdssl_video_embedding_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/cv/easycv_pipelines/__init__.py b/modelscope/pipelines/cv/easycv_pipelines/__init__.py
new file mode 100644
index 00000000..4f149130
--- /dev/null
+++ b/modelscope/pipelines/cv/easycv_pipelines/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .detection_pipeline import EasyCVDetectionPipeline
+    from .segmentation_pipeline import EasyCVSegmentationPipeline
+    from .face_2d_keypoints_pipeline import Face2DKeypointsPipeline
+else:
+    _import_structure = {
+        'detection_pipeline': ['EasyCVDetectionPipeline'],
+        'segmentation_pipeline': ['EasyCVSegmentationPipeline'],
+        'face_2d_keypoints_pipeline': ['Face2DKeypointsPipeline']
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/pipelines/cv/easycv_pipelines/base.py b/modelscope/pipelines/cv/easycv_pipelines/base.py
new file mode 100644
index 00000000..8aea1146
--- /dev/null
+++ b/modelscope/pipelines/cv/easycv_pipelines/base.py
@@ -0,0 +1,98 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
+import os
+import os.path as osp
+from typing import Any
+
+from easycv.utils.ms_utils import EasyCVMeta
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.pipelines.util import is_official_hub_path
+from modelscope.utils.config import Config
+from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
+from modelscope.utils.device import create_device
+
+
+class EasyCVPipeline(object):
+    """Base pipeline for EasyCV.
+    Loading configuration file of modelscope style by default,
+    but it is actually use the predictor api of easycv to predict.
+    So here we do some adaptation work for configuration and predict api.
+    """
+
+    def __init__(self, model: str, model_file_pattern='*.pt', *args, **kwargs):
+        """
+            model (str): model id on modelscope hub or local model path.
+            model_file_pattern (str): model file pattern.
+
+        """
+        self.model_file_pattern = model_file_pattern
+
+        assert isinstance(model, str)
+        if osp.exists(model):
+            model_dir = model
+        else:
+            assert is_official_hub_path(
+                model), 'Only support local model path and official hub path!'
+            model_dir = snapshot_download(
+                model_id=model, revision=DEFAULT_MODEL_REVISION)
+
+        assert osp.isdir(model_dir)
+        model_files = glob.glob(
+            os.path.join(model_dir, self.model_file_pattern))
+        assert len(
+            model_files
+        ) == 1, f'Need one model file, but find {len(model_files)}: {model_files}'
+
+        model_path = model_files[0]
+        self.model_path = model_path
+
+        # get configuration file from source model dir
+        self.config_file = os.path.join(model_dir, ModelFile.CONFIGURATION)
+        assert os.path.exists(
+            self.config_file
+        ), f'Not find "{ModelFile.CONFIGURATION}" in model directory!'
+
+        self.cfg = Config.from_file(self.config_file)
+        if 'device' in kwargs:
+            kwargs['device'] = create_device(kwargs['device'])
+        self.predict_op = self._build_predict_op(**kwargs)
+
+    def _build_predict_op(self, **kwargs):
+        """Build EasyCV predictor."""
+        from easycv.predictors.builder import build_predictor
+
+        easycv_config = self._to_easycv_config()
+        pipeline_op = build_predictor(self.cfg.pipeline.predictor_config, {
+            'model_path': self.model_path,
+            'config_file': easycv_config,
+            **kwargs
+        })
+        return pipeline_op
+
+    def _to_easycv_config(self):
+        """Adapt to EasyCV predictor."""
+        # TODO: refine config compatibility problems
+
+        easycv_arch = self.cfg.model.pop(EasyCVMeta.ARCH, None)
+        model_cfg = self.cfg.model
+        # Revert to the configuration of easycv
+        if easycv_arch is not None:
+            model_cfg.update(easycv_arch)
+
+        easycv_config = Config(dict(model=model_cfg))
+
+        reserved_keys = []
+        if hasattr(self.cfg, EasyCVMeta.META):
+            easycv_meta_cfg = getattr(self.cfg, EasyCVMeta.META)
+            reserved_keys = easycv_meta_cfg.get(EasyCVMeta.RESERVED_KEYS, [])
+            for key in reserved_keys:
+                easycv_config.merge_from_dict({key: getattr(self.cfg, key)})
+        if 'test_pipeline' not in reserved_keys:
+            easycv_config.merge_from_dict(
+                {'test_pipeline': self.cfg.dataset.val.get('pipeline', [])})
+
+        return easycv_config
+
+    def __call__(self, inputs) -> Any:
+        return self.predict_op(inputs)
diff --git a/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py
new file mode 100644
index 00000000..32365102
--- /dev/null
+++ b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py
@@ -0,0 +1,23 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from modelscope.metainfo import Pipelines
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from .base import EasyCVPipeline
+
+
+@PIPELINES.register_module(
+    Tasks.image_object_detection, module_name=Pipelines.easycv_detection)
+class EasyCVDetectionPipeline(EasyCVPipeline):
+    """Pipeline for easycv detection task."""
+
+    def __init__(self, model: str, model_file_pattern='*.pt', *args, **kwargs):
+        """
+            model (str): model id on modelscope hub or local model path.
+            model_file_pattern (str): model file pattern.
+        """
+
+        super(EasyCVDetectionPipeline, self).__init__(
+            model=model,
+            model_file_pattern=model_file_pattern,
+            *args,
+            **kwargs)
diff --git a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py
new file mode 100644
index 00000000..7c32e0fc
--- /dev/null
+++ b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py
@@ -0,0 +1,43 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import ModelFile, Tasks
+from .base import EasyCVPipeline
+
+
+@PIPELINES.register_module(
+    Tasks.face_2d_keypoints, module_name=Pipelines.face_2d_keypoints)
+class Face2DKeypointsPipeline(EasyCVPipeline):
+    """Pipeline for face 2d keypoints detection."""
+
+    def __init__(self,
+                 model: str,
+                 model_file_pattern=ModelFile.TORCH_MODEL_FILE,
+                 *args,
+                 **kwargs):
+        """
+            model (str): model id on modelscope hub or local model path.
+            model_file_pattern (str): model file pattern.
+        """
+
+        super(Face2DKeypointsPipeline, self).__init__(
+            model=model,
+            model_file_pattern=model_file_pattern,
+            *args,
+            **kwargs)
+
+    def show_result(self, img, points, scale=2, save_path=None):
+        return self.predict_op.show_result(img, points, scale, save_path)
+
+    def __call__(self, inputs) -> Any:
+        outputs = self.predict_op(inputs)
+
+        results = [{
+            OutputKeys.KEYPOINTS: output['point'],
+            OutputKeys.POSES: output['pose']
+        } for output in outputs]
+
+        return results
diff --git a/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py
new file mode 100644
index 00000000..bd09fc9b
--- /dev/null
+++ b/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py
@@ -0,0 +1,47 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any
+
+import numpy as np
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from .base import EasyCVPipeline
+
+
+@PIPELINES.register_module(
+    Tasks.image_segmentation, module_name=Pipelines.easycv_segmentation)
+class EasyCVSegmentationPipeline(EasyCVPipeline):
+    """Pipeline for easycv segmentation task."""
+
+    def __init__(self, model: str, model_file_pattern='*.pt', *args, **kwargs):
+        """
+            model (str): model id on modelscope hub or local model path.
+            model_file_pattern (str): model file pattern.
+        """
+        super(EasyCVSegmentationPipeline, self).__init__(
+            model=model,
+            model_file_pattern=model_file_pattern,
+            *args,
+            **kwargs)
+
+    def __call__(self, inputs) -> Any:
+        """Return masks, labels and scores of the classes found."""
+        outputs = self.predict_op(inputs)
+        # only the first prediction is converted
+        semantic_result = outputs[0]['seg_pred']
+        ids = np.unique(semantic_result)[::-1]
+        legal_indices = ids != len(self.predict_op.CLASSES)  # for VOID label
+        ids = ids[legal_indices]
+        segms = (semantic_result[None] == ids[:, None, None])
+        # `np.int` was removed in NumPy 1.24; builtin `int` is what it aliased
+        masks = [it.astype(int) for it in segms]
+        labels_txt = np.array(self.predict_op.CLASSES)[ids].tolist()
+        # every label gets the same fixed placeholder score
+        results = {
+            OutputKeys.MASKS: masks,
+            OutputKeys.LABELS: labels_txt,
+            OutputKeys.SCORES: [0.999 for _ in range(len(labels_txt))]
+        }
+        return results
diff --git a/modelscope/pipelines/cv/face_detection_pipeline.py b/modelscope/pipelines/cv/face_detection_pipeline.py
index 8fda5b46..eff5b70f 100644
--- a/modelscope/pipelines/cv/face_detection_pipeline.py
+++ b/modelscope/pipelines/cv/face_detection_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/cv/face_emotion_pipeline.py b/modelscope/pipelines/cv/face_emotion_pipeline.py
new file mode 100644
index 00000000..249493b6
--- /dev/null
+++ b/modelscope/pipelines/cv/face_emotion_pipeline.py
@@ -0,0 +1,39 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.face_emotion import emotion_infer
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.face_emotion, module_name=Pipelines.face_emotion)
+class FaceEmotionPipeline(Pipeline):
+    """Pipeline that wraps `emotion_infer.inference` for face_emotion."""
+
+    def __init__(self, model: str, **kwargs):
+        """Use `model` to create face emotion pipeline for prediction.
+        Args:
+            model: model id on modelscope hub; `model + '/'` also prefixes
+                the TF graph file path stored in `self.face_model`.
+        """
+        super().__init__(model=model, **kwargs)
+        self.face_model = model + '/' + ModelFile.TF_GRAPH_FILE
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        return input
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        result, bbox = emotion_infer.inference(input['img_path'], self.model,
+                                               self.face_model)
+        return {OutputKeys.OUTPUT: result, OutputKeys.BOXES: bbox}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/cv/face_human_hand_detection_pipeline.py b/modelscope/pipelines/cv/face_human_hand_detection_pipeline.py
new file mode 100644
index 00000000..d9f214c9
--- /dev/null
+++ b/modelscope/pipelines/cv/face_human_hand_detection_pipeline.py
@@ -0,0 +1,42 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.face_human_hand_detection import det_infer
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.face_human_hand_detection,
+    module_name=Pipelines.face_human_hand_detection)
+class NanoDettForFaceHumanHandDetectionPipeline(Pipeline):
+    """Face-human-hand detection pipeline built on `det_infer.inference`."""
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create face-human-hand detection pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        return input
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Run detection on `input['input_path']` and wrap the result."""
+        result = det_infer.inference(self.model, self.device,
+                                     input['input_path'])
+        logger.info(result)
+        return {OutputKeys.OUTPUT: result}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/cv/face_recognition_pipeline.py b/modelscope/pipelines/cv/face_recognition_pipeline.py
index 506346df..873e4a1f 100644
--- a/modelscope/pipelines/cv/face_recognition_pipeline.py
+++ b/modelscope/pipelines/cv/face_recognition_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py
new file mode 100644
index 00000000..1b1f13d1
--- /dev/null
+++ b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py
@@ -0,0 +1,129 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import PIL
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.face_recognition.align_face import align_face
+from modelscope.models.cv.facial_expression_recognition import \
+    FacialExpressionRecognition
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.facial_expression_recognition,
+    module_name=Pipelines.facial_expression_recognition)
+class FacialExpressionRecognitionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a facial expression recognition pipeline
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE)
+        logger.info(f'loading model from {ckpt_path}')
+        device = torch.device(
+            'cuda:0' if torch.cuda.is_available() else 'cpu')
+        fer = FacialExpressionRecognition(model_path=ckpt_path, device=device)
+        self.fer = fer
+        self.device = device
+        logger.info('load model done')
+
+        # face detect pipeline
+        det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps'
+        self.face_detection = pipeline(
+            Tasks.face_detection, model=det_model_id)
+
+    def _choose_face(self,
+                     det_result,
+                     min_face=10,
+                     top_face=1,
+                     center_face=False,
+                     img=None):
+        '''
+        choose the largest face, or the most centered of the largest ones
+        Args:
+            det_result: output of face detection pipeline
+            min_face: minimum size of valid face w/h
+            top_face: take faces with top max areas
+            center_face: choose the most centered face from multi faces, only
+                valid if top_face > 1 and `img` is given
+            img: image the faces were detected on; only its shape is read
+        Returns:
+            (bbox, landmarks) of the chosen face, None if there is no valid one
+        '''
+        bboxes = np.array(det_result[OutputKeys.BOXES])
+        landmarks = np.array(det_result[OutputKeys.KEYPOINTS])
+        if bboxes.shape[0] == 0:
+            logger.info('Warning: No face detected!')
+            return None
+        # face idx with enough size
+        face_idx = []
+        for i in range(bboxes.shape[0]):
+            box = bboxes[i]
+            if (box[2] - box[0]) >= min_face and (box[3] - box[1]) >= min_face:
+                face_idx += [i]
+        if len(face_idx) == 0:
+            logger.info(
+                f'Warning: Face size not enough, less than {min_face}x{min_face}!'
+            )
+            return None
+        bboxes = bboxes[face_idx]
+        landmarks = landmarks[face_idx]
+        # find max faces; argsort is ascending, so the largest comes last
+        area = (bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1])
+        sort_idx = np.argsort(area)[-top_face:]
+        # find center face; needs the image size, so it is skipped (and the
+        # largest face is kept) when no `img` was passed
+        if (top_face > 1 and center_face and img is not None
+                and bboxes.shape[0] > 1):
+            img_center = np.array([img.shape[1] // 2, img.shape[0] // 2])
+            candidates = bboxes[sort_idx]
+            box_centers = (candidates[:, 0:2] + candidates[:, 2:4]) / 2
+            dist = np.square(box_centers - img_center).sum(axis=1)
+            # argmin keeps the first of equally distant candidates
+            sort_idx = [sort_idx[np.argmin(dist)]]
+        main_idx = sort_idx[-1]
+        return bboxes[main_idx], landmarks[main_idx]
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        """Detect the main face and align it via `align_face` (as float32)."""
+        img = LoadImage.convert_to_ndarray(input)
+        img = img[:, :, ::-1]  # reverse the channel order
+        det_result = self.face_detection(img.copy())
+        rtn = self._choose_face(det_result, img=img)
+        face_img = None
+        if rtn is not None:
+            _, face_lmks = rtn
+            face_lmks = face_lmks.reshape(5, 2)
+            face_img, _ = align_face(img, (112, 112), face_lmks)
+            face_img = face_img.astype(np.float32)
+        # NOTE(review): 'img' stays None when no valid face was found
+        return {'img': face_img}
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        result = self.fer(input)
+        assert result is not None
+        scores = result[0].tolist()
+        labels = result[1].tolist()
+        return {
+            OutputKeys.SCORES: scores,
+            OutputKeys.LABELS: labels,
+        }
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/cv/general_recognition_pipeline.py b/modelscope/pipelines/cv/general_recognition_pipeline.py
index 9ba5117b..07222086 100644
--- a/modelscope/pipelines/cv/general_recognition_pipeline.py
+++ b/modelscope/pipelines/cv/general_recognition_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py
new file mode 100644
index 00000000..bad0c652
--- /dev/null
+++ b/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py
@@ -0,0 +1,52 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path
+
+from modelscope.metainfo import Pipelines
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import ModelFile, Tasks
+from .easycv_pipelines.base import EasyCVPipeline
+
+
+@PIPELINES.register_module(
+    Tasks.hand_2d_keypoints, module_name=Pipelines.hand_2d_keypoints)
+class Hand2DKeypointsPipeline(EasyCVPipeline):
+    """EasyCV pipeline for the hand 2d keypoints (hand pose) task."""
+
+    def __init__(self,
+                 model: str,
+                 model_file_pattern=ModelFile.TORCH_MODEL_FILE,
+                 *args,
+                 **kwargs):
+        """Remember the model directory, then defer to EasyCVPipeline.
+
+        Args:
+            model (str): model id on modelscope hub or local model path.
+            model_file_pattern (str): model file pattern.
+        """
+        # NOTE(review): assigned before the base constructor runs, presumably
+        # because that constructor calls _build_predict_op -- confirm.
+        self.model_dir = model
+        super(Hand2DKeypointsPipeline, self).__init__(
+            *args,
+            model=model,
+            model_file_pattern=model_file_pattern,
+            **kwargs)
+
+    def _build_predict_op(self, **kwargs):
+        """Build EasyCV predictor."""
+        from easycv.predictors.builder import build_predictor
+        # the detector's files are resolved against the model directory
+        det_cfg = self.cfg['DETECTION']
+        detection_predictor_config = dict(
+            type=det_cfg['type'],
+            model_path=os.path.join(self.model_dir, det_cfg['model_path']),
+            config_file=os.path.join(self.model_dir, det_cfg['config_file']),
+            score_threshold=det_cfg['score_threshold'])
+        self.cfg.pipeline.predictor_config[
+            'detection_predictor_config'] = detection_predictor_config
+        easycv_config = self._to_easycv_config()
+        return build_predictor(self.cfg.pipeline.predictor_config, {
+            'model_path': self.model_path,
+            'config_file': easycv_config,
+            **kwargs
+        })
diff --git a/modelscope/pipelines/cv/hand_static_pipeline.py b/modelscope/pipelines/cv/hand_static_pipeline.py
new file mode 100644
index 00000000..1219c873
--- /dev/null
+++ b/modelscope/pipelines/cv/hand_static_pipeline.py
@@ -0,0 +1,37 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.hand_static import hand_model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.hand_static, module_name=Pipelines.hand_static)
+class HandStaticPipeline(Pipeline):
+    """Pipeline that wraps `hand_model.infer` for the hand_static task."""
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create hand static pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        return input  # passed through; forward reads input['img_path']
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        result = hand_model.infer(input['img_path'], self.model, self.device)
+        return {OutputKeys.OUTPUT: result}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/cv/hicossl_video_embedding_pipeline.py b/modelscope/pipelines/cv/hicossl_video_embedding_pipeline.py
new file mode 100644
index 00000000..5e4cd4c6
--- /dev/null
+++ b/modelscope/pipelines/cv/hicossl_video_embedding_pipeline.py
@@ -0,0 +1,75 @@
+import math
+import os.path as osp
+from typing import Any, Dict
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.action_recognition import BaseVideoModel
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import ReadVideoData
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.video_embedding, module_name=Pipelines.hicossl_video_embedding)
+class HICOSSLVideoEmbeddingPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a hicossl video embedding pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE)
+        logger.info(f'loading model from {model_path}')
+        config_path = osp.join(self.model, ModelFile.CONFIGURATION)
+        logger.info(f'loading config from {config_path}')
+        self.cfg = Config.from_file(config_path)
+        self.infer_model = BaseVideoModel(cfg=self.cfg).to(self.device)
+        self.infer_model.eval()
+        self.infer_model.load_state_dict(
+            torch.load(model_path, map_location=self.device)['model_state'],
+            strict=False)
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        if isinstance(input, str):
+            video_input_data = ReadVideoData(
+                self.cfg, input, num_temporal_views_override=1).to(self.device)
+        else:
+            raise TypeError(f'input should be a str,'
+                            f'  but got {type(input)}')
+        result = {'video_data': video_input_data}
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        feature = self.perform_inference(input['video_data'])
+        return {OutputKeys.VIDEO_EMBEDDING: feature.data.cpu().numpy()}
+
+    @torch.no_grad()
+    def perform_inference(self, data, max_bsz=4):
+        """ Perform feature extracting for a given video
+        Args:
+            data (Tensor): video clips, fed to the model in dim-0 chunks.
+            max_bsz (int): the maximum batch size, limited by GPU memory.
+        Returns:
+            pred (Tensor): the extracted features for input video clips.
+        """
+        iter_num = math.ceil(data.size(0) / max_bsz)
+        preds_list = []
+        for i in range(iter_num):
+            preds_list.append(
+                self.infer_model(data[i * max_bsz:(i + 1) * max_bsz])[0])
+        pred = torch.cat(preds_list, dim=0)
+        return pred
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/cv/image_cartoon_pipeline.py b/modelscope/pipelines/cv/image_cartoon_pipeline.py
index eb669354..8606915c 100644
--- a/modelscope/pipelines/cv/image_cartoon_pipeline.py
+++ b/modelscope/pipelines/cv/image_cartoon_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 from typing import Any, Dict
 
@@ -37,14 +39,12 @@ class ImageCartoonPipeline(Pipeline):
             model: model id on modelscope hub.
         """
         super().__init__(model=model, **kwargs)
-        with device_placement(self.framework, self.device_name):
-            self.facer = FaceAna(self.model)
+        self.facer = FaceAna(self.model)
+        with tf.Graph().as_default():
             self.sess_anime_head = self.load_sess(
-                os.path.join(self.model, 'cartoon_anime_h.pb'),
-                'model_anime_head')
+                os.path.join(self.model, 'cartoon_h.pb'), 'model_anime_head')
             self.sess_anime_bg = self.load_sess(
-                os.path.join(self.model, 'cartoon_anime_bg.pb'),
-                'model_anime_bg')
+                os.path.join(self.model, 'cartoon_bg.pb'), 'model_anime_bg')
 
         self.box_width = 288
         global_mask = cv2.imread(os.path.join(self.model, 'alpha.jpg'))
diff --git a/modelscope/pipelines/cv/image_detection_pipeline.py b/modelscope/pipelines/cv/image_detection_pipeline.py
index 8df10d45..f5554ca2 100644
--- a/modelscope/pipelines/cv/image_detection_pipeline.py
+++ b/modelscope/pipelines/cv/image_detection_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict
 
 import numpy as np
diff --git a/modelscope/pipelines/cv/image_instance_segmentation_pipeline.py b/modelscope/pipelines/cv/image_instance_segmentation_pipeline.py
index ce0bf907..5a0f0d7e 100644
--- a/modelscope/pipelines/cv/image_instance_segmentation_pipeline.py
+++ b/modelscope/pipelines/cv/image_instance_segmentation_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os
 from typing import Any, Dict, Optional, Union
 
diff --git a/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py b/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py
new file mode 100644
index 00000000..b96e709c
--- /dev/null
+++ b/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py
@@ -0,0 +1,101 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Union
+
+import cv2
+import numpy as np
+import PIL
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import load_image
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_segmentation,
+    module_name=Pipelines.image_panoptic_segmentation)
+class ImagePanopticSegmentationPipeline(Pipeline):
+    """Panoptic segmentation pipeline using the mmdet/mmcv data pipeline."""
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a image panoptic segmentation pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        logger.info('panoptic segmentation model, pipeline init')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        """Turn `input` into an ndarray and run the mmdet test pipeline."""
+        from mmdet.datasets.pipelines import Compose
+        from mmcv.parallel import collate, scatter
+        from mmdet.datasets import replace_ImageToTensor
+
+        cfg = self.model.cfg
+        # build the data pipeline
+        if isinstance(input, str):
+            cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam'
+            img = np.array(load_image(input))
+            img = img[:, :, ::-1]  # convert to bgr
+        elif isinstance(input, PIL.Image.Image):
+            cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam'
+            img = np.array(input.convert('RGB'))[:, :, ::-1]  # convert to bgr
+        elif isinstance(input, np.ndarray):
+            cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam'
+            if len(input.shape) == 2:
+                img = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR)
+            else:
+                img = input
+        else:
+            raise TypeError(f'input should be either str, PIL.Image,'
+                            f' np.array, but got {type(input)}')
+
+        # collect data
+        data = dict(img=img)
+        cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline)
+        test_pipeline = Compose(cfg.data.test.pipeline)
+
+        data = test_pipeline(data)
+        # copy from mmdet_model collect data
+        data = collate([data], samples_per_gpu=1)
+        data['img_metas'] = [
+            img_metas.data[0] for img_metas in data['img_metas']
+        ]
+        data['img'] = [img.data[0] for img in data['img']]
+        if next(self.model.parameters()).is_cuda:
+            # scatter to specified GPU
+            data = scatter(data, [next(self.model.parameters()).device])[0]
+
+        return data
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Run the model's `inference` on the collated batch."""
+        results = self.model.inference(input)
+        return results
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """Split the panoptic map of the single image (bz=1) into masks."""
+        pan_results = inputs[0]['pan_results']
+        INSTANCE_OFFSET = 1000
+        # the class label is the panoptic id modulo INSTANCE_OFFSET
+        ids = np.unique(pan_results)[::-1]
+        legal_indices = ids != self.model.num_classes  # for VOID label
+        ids = ids[legal_indices]
+        labels = (ids % INSTANCE_OFFSET).astype(np.int64)
+        segms = (pan_results[None] == ids[:, None, None])
+        masks = [it.astype(int) for it in segms]  # np.int is gone in NumPy 1.24
+        labels_txt = np.array(self.model.CLASSES)[labels].tolist()
+
+        outputs = {
+            OutputKeys.MASKS: masks,
+            OutputKeys.LABELS: labels_txt,
+            OutputKeys.SCORES: [0.999 for _ in range(len(labels_txt))]
+        }
+        return outputs
diff --git a/modelscope/pipelines/cv/image_reid_person_pipeline.py b/modelscope/pipelines/cv/image_reid_person_pipeline.py
index a14666a1..64674a65 100644
--- a/modelscope/pipelines/cv/image_reid_person_pipeline.py
+++ b/modelscope/pipelines/cv/image_reid_person_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import math
 import os
 from typing import Any, Dict
diff --git a/modelscope/pipelines/cv/image_salient_detection_pipeline.py b/modelscope/pipelines/cv/image_salient_detection_pipeline.py
index 433275ba..4a3eaa65 100644
--- a/modelscope/pipelines/cv/image_salient_detection_pipeline.py
+++ b/modelscope/pipelines/cv/image_salient_detection_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict
 
 from modelscope.metainfo import Pipelines
@@ -9,7 +11,7 @@ from modelscope.utils.constant import Tasks
 
 
 @PIPELINES.register_module(
-    Tasks.image_segmentation, module_name=Pipelines.salient_detection)
+    Tasks.semantic_segmentation, module_name=Pipelines.salient_detection)
 class ImageSalientDetectionPipeline(Pipeline):
 
     def __init__(self, model: str, **kwargs):
@@ -39,9 +41,5 @@ class ImageSalientDetectionPipeline(Pipeline):
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
 
         data = self.model.postprocess(inputs)
-        outputs = {
-            OutputKeys.SCORES: None,
-            OutputKeys.LABELS: None,
-            OutputKeys.MASKS: data
-        }
+        outputs = {OutputKeys.MASKS: data}
         return outputs
diff --git a/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py b/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py
new file mode 100644
index 00000000..023d9712
--- /dev/null
+++ b/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py
@@ -0,0 +1,90 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Union
+
+import cv2
+import numpy as np
+import PIL
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Model, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import load_image
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_segmentation,
+    module_name=Pipelines.image_semantic_segmentation)
+class ImageSemanticSegmentationPipeline(Pipeline):
+    """Semantic segmentation pipeline using the mmdet/mmcv data pipeline."""
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a image semantic segmentation pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        logger.info('semantic segmentation model, pipeline init')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        """Turn `input` into an ndarray and run the mmdet test pipeline."""
+        from mmdet.datasets.pipelines import Compose
+        from mmcv.parallel import collate, scatter
+        from mmdet.datasets import replace_ImageToTensor
+
+        cfg = self.model.cfg
+        # build the data pipeline
+        if isinstance(input, str):
+            cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam'
+            img = np.array(load_image(input))
+            img = img[:, :, ::-1]  # convert to bgr
+        elif isinstance(input, PIL.Image.Image):
+            cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam'
+            img = np.array(input.convert('RGB'))[:, :, ::-1]  # convert to bgr
+        elif isinstance(input, np.ndarray):
+            cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam'
+            if len(input.shape) == 2:
+                img = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR)
+            else:
+                img = input
+        else:
+            raise TypeError(f'input should be either str, PIL.Image,'
+                            f' np.array, but got {type(input)}')
+
+        # collect data
+        data = dict(img=img)
+        cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline)
+        test_pipeline = Compose(cfg.data.test.pipeline)
+
+        data = test_pipeline(data)
+        # copy from mmdet_model collect data
+        data = collate([data], samples_per_gpu=1)
+        data['img_metas'] = [
+            img_metas.data[0] for img_metas in data['img_metas']
+        ]
+        data['img'] = [img.data[0] for img in data['img']]
+        if next(self.model.parameters()).is_cuda:
+            # scatter to specified GPU
+            data = scatter(data, [next(self.model.parameters()).device])[0]
+
+        return data
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Run the model's `inference` on the collated batch."""
+        results = self.model.inference(input)
+        return results
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        results = self.model.postprocess(inputs)
+        outputs = {
+            OutputKeys.MASKS: results[OutputKeys.MASKS],
+            OutputKeys.LABELS: results[OutputKeys.LABELS],
+            OutputKeys.SCORES: results[OutputKeys.SCORES]
+        }
+        return outputs
diff --git a/modelscope/pipelines/cv/image_style_transfer_pipeline.py b/modelscope/pipelines/cv/image_style_transfer_pipeline.py
index 827a0d44..e5fd0d48 100644
--- a/modelscope/pipelines/cv/image_style_transfer_pipeline.py
+++ b/modelscope/pipelines/cv/image_style_transfer_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os.path as osp
 from typing import Any, Dict
 
@@ -61,7 +63,13 @@ class ImageStyleTransferPipeline(Pipeline):
     def _sanitize_parameters(self, **pipeline_parameters):
         return pipeline_parameters, {}, {}
 
-    def preprocess(self, content: Input, style: Input) -> Dict[str, Any]:
+    def preprocess(self,
+                   content: Input,
+                   style: Input = None) -> Dict[str, Any]:
+        if type(content) is dict:  # for demo service
+            style = content['style']
+            content = content['content']
+
         content = LoadImage.convert_to_ndarray(content)
         if len(content.shape) == 2:
             content = cv2.cvtColor(content, cv2.COLOR_GRAY2BGR)
diff --git a/modelscope/pipelines/cv/live_category_pipeline.py b/modelscope/pipelines/cv/live_category_pipeline.py
index c16ba6ba..715998cc 100644
--- a/modelscope/pipelines/cv/live_category_pipeline.py
+++ b/modelscope/pipelines/cv/live_category_pipeline.py
@@ -1,4 +1,4 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/cv/mog_face_detection_pipeline.py b/modelscope/pipelines/cv/mog_face_detection_pipeline.py
new file mode 100644
index 00000000..124b605b
--- /dev/null
+++ b/modelscope/pipelines/cv/mog_face_detection_pipeline.py
@@ -0,0 +1,55 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+from typing import Any, Dict
+
+import numpy as np
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.face_detection import MogFaceDetector
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.face_detection, module_name=Pipelines.mog_face_detection)
+class MogFaceDetectionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """Create a face detection pipeline backed by `MogFaceDetector`.
+        Args:
+            model: model id on modelscope hub; used below as the directory
+                that holds the checkpoint (presumably resolved by the caller).
+        """
+        super().__init__(model=model, **kwargs)
+        ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE)
+        logger.info(f'loading model from {ckpt_path}')
+        detector = MogFaceDetector(model_path=ckpt_path, device=self.device)
+        self.detector = detector
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        img = LoadImage.convert_to_ndarray(input)
+        img = img.astype(np.float32)  # detector input is a float32 ndarray
+        result = {'img': img}
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Detect faces: result columns 0-3 are boxes, column 4 is score."""
+        result = self.detector(input)
+        assert result is not None
+        bboxes = result[:, :4].tolist()
+        scores = result[:, 4].tolist()
+        return {
+            OutputKeys.SCORES: scores,
+            OutputKeys.BOXES: bboxes,
+            OutputKeys.KEYPOINTS: None,  # this pipeline emits no keypoints
+        }
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs  # forward() already returns the final output dict
diff --git a/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py
new file mode 100644
index 00000000..6704e4c0
--- /dev/null
+++ b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py
@@ -0,0 +1,68 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.movie_scene_segmentation,
+    module_name=Pipelines.movie_scene_segmentation)
+class MovieSceneSegmentationPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """Create a movie scene segmentation pipeline.
+        Runs on 'gpu' only if `device` is 'gpu' and CUDA exists, else 'cpu'.
+        Args:
+            model: model id on modelscope hub
+        """
+        _device = kwargs.pop('device', 'gpu')
+        if torch.cuda.is_available() and _device == 'gpu':
+            device = 'gpu'
+        else:
+            device = 'cpu'
+        super().__init__(model=model, device=device, **kwargs)
+
+        logger.info('Load model done!')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        """Detect shots in the input video and encode their key frames.
+        Delegates to `self.model.preprocess`; per the original note this uses
+        pyscenedetect (key-frame jpg, anno.ndjson, shot-frame.txt) and a
+        shot encoder. Raises TypeError when `input` is not a `str`.
+        Args:
+            input: path of the input video
+        """
+        self.input_video_pth = input  # kept for postprocess()
+        if isinstance(input, str):
+            shot_feat, sid = self.model.preprocess(input)
+        else:
+            raise TypeError(f'input should be a str,'
+                            f'  but got {type(input)}')
+
+        result = {'sid': sid, 'shot_feat': shot_feat}
+
+        return result
+
+    def forward(self, input: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():  # inference only, no gradients needed
+            output = self.model.inference(input)
+        return output
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        data = {'input_video_pth': self.input_video_pth, 'feat': inputs}
+        video_num, meta_lst = self.model.postprocess(data)
+        result = {
+            OutputKeys.SPLIT_VIDEO_NUM: video_num,
+            OutputKeys.SPLIT_META_LIST: meta_lst
+        }
+        return result
diff --git a/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py b/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py
new file mode 100644
index 00000000..bda46a70
--- /dev/null
+++ b/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py
@@ -0,0 +1,57 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+from typing import Any, Dict
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.face_detection import MtcnnFaceDetector
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.face_detection, module_name=Pipelines.mtcnn_face_detection)
+class MtcnnFaceDetectionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """Create a face detection pipeline backed by `MtcnnFaceDetector`.
+        Args:
+            model: model id on modelscope hub; used below as the directory
+                that contains the `weights` folder.
+        """
+        super().__init__(model=model, **kwargs)
+        ckpt_path = osp.join(model, './weights')
+        logger.info(f'loading model from {ckpt_path}')
+        # first CUDA device when available, otherwise CPU
+        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+        detector = MtcnnFaceDetector(model_path=ckpt_path, device=device)
+        self.detector = detector
+        self.device = device  # NOTE(review): overrides the base-class device
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        img = LoadImage.convert_to_ndarray(input)
+        result = {'img': img}
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        result = self.detector(input)
+        assert result is not None
+        bboxes = result[0][:, :4].tolist()  # columns 0-3 of result[0]
+        scores = result[0][:, 4].tolist()  # column 4 of result[0]
+        lms = result[1].tolist()  # result[1] is reported as keypoints
+        return {
+            OutputKeys.SCORES: scores,
+            OutputKeys.BOXES: bboxes,
+            OutputKeys.KEYPOINTS: lms,
+        }
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs  # forward() already returns the final output dict
diff --git a/modelscope/pipelines/cv/ocr_recognition_pipeline.py b/modelscope/pipelines/cv/ocr_recognition_pipeline.py
index 4b095042..c20d020c 100644
--- a/modelscope/pipelines/cv/ocr_recognition_pipeline.py
+++ b/modelscope/pipelines/cv/ocr_recognition_pipeline.py
@@ -91,7 +91,8 @@ class OCRRecognitionPipeline(Pipeline):
                 data.append(mask)
 
         data = torch.FloatTensor(data).view(
-            len(data), 1, IMG_HEIGHT, IMG_WIDTH).cuda() / 255.
+            len(data), 1, IMG_HEIGHT, IMG_WIDTH) / 255.
+        data = data.to(self.device)
 
         result = {'img': data}
 
diff --git a/modelscope/pipelines/cv/ocr_utils/ops.py b/modelscope/pipelines/cv/ocr_utils/ops.py
index eeab36a0..09807b10 100644
--- a/modelscope/pipelines/cv/ocr_utils/ops.py
+++ b/modelscope/pipelines/cv/ocr_utils/ops.py
@@ -1,8 +1,10 @@
 import math
 import os
 import shutil
+import sys
 import uuid
 
+import absl.flags as absl_flags
 import cv2
 import numpy as np
 import tensorflow as tf
@@ -12,6 +14,10 @@ from . import utils
 if tf.__version__ >= '2.0':
     tf = tf.compat.v1
 
+# Pre-parse sys.argv with known_only=True so tf does not later fail with
+# absl.flags._exceptions.UnrecognizedFlagError:
+# Unknown command line flag 'OCRDetectionPipeline'
+absl_flags.FLAGS(sys.argv, known_only=True)
 FLAGS = tf.app.flags.FLAGS
 tf.app.flags.DEFINE_string('weight_init_method', 'xavier',
                            'Weight initialization method')
diff --git a/modelscope/pipelines/cv/product_segmentation_pipeline.py b/modelscope/pipelines/cv/product_segmentation_pipeline.py
new file mode 100644
index 00000000..244b01d7
--- /dev/null
+++ b/modelscope/pipelines/cv/product_segmentation_pipeline.py
@@ -0,0 +1,40 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.product_segmentation import seg_infer
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.product_segmentation, module_name=Pipelines.product_segmentation)
+class F3NetForProductSegmentationPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """Create a product segmentation pipeline for prediction.
+        Pipeline inputs are dicts carrying an 'input_path' key (see
+        `forward`).
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        return input  # no preprocessing; forward() reads the dict directly
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Run inference on `input['input_path']` and return the mask."""
+        mask = seg_infer.inference(self.model, self.device,
+                                   input['input_path'])
+        return {OutputKeys.MASKS: mask}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs  # forward() already returns the final output dict
diff --git a/modelscope/pipelines/cv/realtime_object_detection_pipeline.py b/modelscope/pipelines/cv/realtime_object_detection_pipeline.py
new file mode 100644
index 00000000..9f558f88
--- /dev/null
+++ b/modelscope/pipelines/cv/realtime_object_detection_pipeline.py
@@ -0,0 +1,51 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+from typing import Any, Dict, List, Union
+
+import cv2
+import json
+import numpy as np
+import torch
+from PIL import Image
+from torchvision import transforms
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.realtime_object_detection import RealtimeDetector
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Input, Model, Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import load_image
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_object_detection,
+    module_name=Pipelines.realtime_object_detection)
+class RealtimeObjectDetectionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        super().__init__(model=model, **kwargs)
+        self.model = RealtimeDetector(model)  # replaces the base-class model
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        output = self.model.preprocess(input)
+        return {'pre_output': output}
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        pre_output = input['pre_output']
+        forward_output = self.model(pre_output)
+        return {'forward_output': forward_output}
+
+    def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        """Unpack the detector output into boxes, scores and labels."""
+        forward_output = input['forward_output']
+        bboxes, scores, labels = forward_output
+        return {
+            OutputKeys.BOXES: bboxes,
+            OutputKeys.SCORES: scores,
+            OutputKeys.LABELS: labels,
+        }
diff --git a/modelscope/pipelines/cv/retina_face_detection_pipeline.py b/modelscope/pipelines/cv/retina_face_detection_pipeline.py
new file mode 100644
index 00000000..40f2336a
--- /dev/null
+++ b/modelscope/pipelines/cv/retina_face_detection_pipeline.py
@@ -0,0 +1,59 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import PIL
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.face_detection import RetinaFaceDetection
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.face_detection, module_name=Pipelines.retina_face_detection)
+class RetinaFaceDetectionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """Create a face detection pipeline backed by `RetinaFaceDetection`.
+        Args:
+            model: model id on modelscope hub; used below as the directory
+                that holds the checkpoint (presumably resolved by the caller).
+        """
+        super().__init__(model=model, **kwargs)
+        ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE)
+        logger.info(f'loading model from {ckpt_path}')
+        detector = RetinaFaceDetection(
+            model_path=ckpt_path, device=self.device)
+        self.detector = detector
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        img = LoadImage.convert_to_ndarray(input)
+        img = img.astype(np.float32)  # detector input is a float32 ndarray
+        result = {'img': img}
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        result = self.detector(input)
+        assert result is not None
+        bboxes = result[0][:, :4].tolist()  # columns 0-3 of result[0]
+        scores = result[0][:, 4].tolist()  # column 4 of result[0]
+        lms = result[1].tolist()  # result[1] is reported as keypoints
+        return {
+            OutputKeys.SCORES: scores,
+            OutputKeys.BOXES: bboxes,
+            OutputKeys.KEYPOINTS: lms,
+        }
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs  # forward() already returns the final output dict
diff --git a/modelscope/pipelines/cv/shop_segmentation_pipleline.py b/modelscope/pipelines/cv/shop_segmentation_pipleline.py
new file mode 100644
index 00000000..d08058c3
--- /dev/null
+++ b/modelscope/pipelines/cv/shop_segmentation_pipleline.py
@@ -0,0 +1,52 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+
+
+@PIPELINES.register_module(
+    Tasks.shop_segmentation, module_name=Pipelines.shop_segmentation)
+class ShopSegmentationPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """Build the pipeline with `auto_collate=False`.
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, auto_collate=False, **kwargs)
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        img = LoadImage.convert_to_ndarray(input)
+        img_tensor, ori_h, ori_w, crop_h, crop_w = self.model.preprocess(img)
+        result = {
+            'img': img_tensor,
+            'ori_h': ori_h,
+            'ori_w': ori_w,
+            'crop_h': crop_h,
+            'crop_w': crop_w
+        }
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Run model inference; size metadata is passed through unchanged."""
+        outputs = self.model.inference(input['img'])
+        result = {
+            'data': outputs,
+            'ori_h': input['ori_h'],
+            'ori_w': input['ori_w'],
+            'crop_h': input['crop_h'],
+            'crop_w': input['crop_w'],
+        }
+        return result
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """Convert raw model outputs into masks via `model.postprocess`."""
+        data = self.model.postprocess(inputs['data'], inputs['crop_h'],
+                                      inputs['crop_w'], inputs['ori_h'],
+                                      inputs['ori_w'])
+        outputs = {OutputKeys.MASKS: data}
+        return outputs
diff --git a/modelscope/pipelines/cv/skin_retouching_pipeline.py b/modelscope/pipelines/cv/skin_retouching_pipeline.py
index f8c9de60..c6571bef 100644
--- a/modelscope/pipelines/cv/skin_retouching_pipeline.py
+++ b/modelscope/pipelines/cv/skin_retouching_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py b/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py
new file mode 100644
index 00000000..c7f9d4c2
--- /dev/null
+++ b/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py
@@ -0,0 +1,52 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+
+
+@PIPELINES.register_module(
+    Tasks.text_driven_segmentation,
+    module_name=Pipelines.text_driven_segmentation)
+class TextDrivenSegmentationPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """Build the pipeline with `auto_collate=False`.
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, auto_collate=False, **kwargs)
+
+    def preprocess(self, input: Dict) -> Dict[str, Any]:
+        # `input` must provide both an 'image' and a 'text' entry
+        img = LoadImage.convert_to_ndarray(input['image'])
+        img_tensor, ori_h, ori_w, crop_h, crop_w = self.model.preprocess(img)
+        result = {
+            'img': img_tensor,
+            'ori_h': ori_h,
+            'ori_w': ori_w,
+            'crop_h': crop_h,
+            'crop_w': crop_w,
+            'text': input['text'],
+        }
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        outputs = self.model.inference(input['img'], input['text'])
+        result = {
+            'data': outputs,
+            'ori_h': input['ori_h'],  # size metadata is passed through
+            'ori_w': input['ori_w'],
+            'crop_h': input['crop_h'],
+            'crop_w': input['crop_w'],
+        }
+        return result
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        data = self.model.postprocess(inputs['data'], inputs['crop_h'],
+                                      inputs['crop_w'], inputs['ori_h'],
+                                      inputs['ori_w'])
+        outputs = {OutputKeys.MASKS: data}
+        return outputs
diff --git a/modelscope/pipelines/cv/tinynas_detection_pipeline.py b/modelscope/pipelines/cv/tinynas_detection_pipeline.py
new file mode 100644
index 00000000..b2063629
--- /dev/null
+++ b/modelscope/pipelines/cv/tinynas_detection_pipeline.py
@@ -0,0 +1,61 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_object_detection, module_name=Pipelines.tinynas_detection)
+class TinynasDetectionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """Build the pipeline on 'cuda' when available, otherwise 'cpu'.
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, auto_collate=False, **kwargs)
+        if torch.cuda.is_available():
+            self.device = 'cuda'
+        else:
+            self.device = 'cpu'
+        self.model.to(self.device)
+        self.model.eval()
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        """Load the image, cast it to float64 and run `model.preprocess`."""
+        img = LoadImage.convert_to_ndarray(input)
+        self.img = img
+        img = img.astype(np.float64)  # np.float alias removed in NumPy 1.24
+        img = self.model.preprocess(img)
+        result = {'img': img.to(self.device)}
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Run model inference on the preprocessed image."""
+        outputs = self.model.inference(input['img'])
+        result = {'data': outputs}
+        return result
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """Return scores, labels and boxes, or None when `bboxes` is None."""
+        bboxes, scores, labels = self.model.postprocess(inputs['data'])
+        if bboxes is None:
+            return None
+        outputs = {
+            OutputKeys.SCORES: scores,
+            OutputKeys.LABELS: labels,
+            OutputKeys.BOXES: bboxes
+        }
+        return outputs
diff --git a/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py b/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py
new file mode 100644
index 00000000..e9901d64
--- /dev/null
+++ b/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py
@@ -0,0 +1,57 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import PIL
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.face_detection import UlfdFaceDetector
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.face_detection, module_name=Pipelines.ulfd_face_detection)
+class UlfdFaceDetectionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """Create a face detection pipeline backed by `UlfdFaceDetector`.
+        Args:
+            model: model id on modelscope hub; used below as the directory
+                that holds the checkpoint (presumably resolved by the caller).
+        """
+        super().__init__(model=model, **kwargs)
+        ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE)
+        logger.info(f'loading model from {ckpt_path}')
+        detector = UlfdFaceDetector(model_path=ckpt_path, device=self.device)
+        self.detector = detector
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        img = LoadImage.convert_to_ndarray(input)
+        img = img.astype(np.float32)  # detector input is a float32 ndarray
+        result = {'img': img}
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        result = self.detector(input)
+        assert result is not None
+        bboxes = result[0].tolist()  # result[0] is reported as boxes
+        scores = result[1].tolist()  # result[1] is reported as scores
+        return {
+            OutputKeys.SCORES: scores,
+            OutputKeys.BOXES: bboxes,
+            OutputKeys.KEYPOINTS: None,  # this pipeline emits no keypoints
+        }
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs  # forward() already returns the final output dict
diff --git a/modelscope/pipelines/cv/video_category_pipeline.py b/modelscope/pipelines/cv/video_category_pipeline.py
index 196d3115..e4c73649 100644
--- a/modelscope/pipelines/cv/video_category_pipeline.py
+++ b/modelscope/pipelines/cv/video_category_pipeline.py
@@ -1,4 +1,4 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/pipelines/cv/video_inpainting_pipeline.py b/modelscope/pipelines/cv/video_inpainting_pipeline.py
new file mode 100644
index 00000000..85133474
--- /dev/null
+++ b/modelscope/pipelines/cv/video_inpainting_pipeline.py
@@ -0,0 +1,48 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.video_inpainting import inpainting
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.video_inpainting, module_name=Pipelines.video_inpainting)
+class VideoInpaintingPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """Create a video inpainting pipeline for prediction.
+        Pipeline inputs are dicts with 'video_input_path', 'mask_path' and
+        'video_output_path' keys (see `forward`).
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        return input  # no preprocessing; forward() reads the dict directly
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        decode_error, fps, w, h = inpainting.video_process(
+            input['video_input_path'])
+
+        if decode_error is not None:
+            return {OutputKeys.OUTPUT: 'decode_error'}  # skip inpainting
+
+        inpainting.inpainting_by_model_balance(self.model,
+                                               input['video_input_path'],
+                                               input['mask_path'],
+                                               input['video_output_path'], fps,
+                                               w, h)
+
+        return {OutputKeys.OUTPUT: 'Done'}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs  # forward() already returns the final output dict
diff --git a/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py b/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py
index f4ba4d0b..4169def7 100644
--- a/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py
+++ b/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 from typing import Any, Dict
 
@@ -8,8 +9,8 @@ from modelscope.models.cv.video_single_object_tracking.config.ostrack import \
     cfg
 from modelscope.models.cv.video_single_object_tracking.tracker.ostrack import \
     OSTrack
-from modelscope.models.cv.video_single_object_tracking.utils.utils import \
-    check_box
+from modelscope.models.cv.video_single_object_tracking.utils.utils import (
+    check_box, timestamp_format)
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
@@ -44,7 +45,10 @@ class VideoSingleObjectTrackingPipeline(Pipeline):
 
     def forward(self, input: Input) -> Dict[str, Any]:
         output_boxes = []
+        output_timestamps = []
         cap = cv2.VideoCapture(self.video_path)
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        frame_idx = 0
         success, frame = cap.read()
         if success is False:
             raise Exception(
@@ -57,6 +61,7 @@ class VideoSingleObjectTrackingPipeline(Pipeline):
             raise Exception('modelscope error: init_box out of image range ',
                             init_box)
         output_boxes.append(init_box.copy())
+        output_timestamps.append(timestamp_format(seconds=frame_idx / fps))
         init_box[2] = init_box[2] - init_box[0]
         init_box[3] = init_box[3] - init_box[1]
         self.tracker.initialize(frame, {'init_bbox': init_box})
@@ -66,14 +71,17 @@ class VideoSingleObjectTrackingPipeline(Pipeline):
             ret, frame = cap.read()
             if frame is None:
                 break
+            frame_idx += 1
             out = self.tracker.track(frame)
             state = [int(s) for s in out['target_bbox']]
             output_boxes.append(state)
+            output_timestamps.append(timestamp_format(seconds=frame_idx / fps))
         cap.release()
         logger.info('tracking process done')
 
         return {
             OutputKeys.BOXES: output_boxes,
+            OutputKeys.TIMESTAMPS: output_timestamps
         }
 
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
diff --git a/modelscope/pipelines/cv/video_summarization_pipeline.py b/modelscope/pipelines/cv/video_summarization_pipeline.py
index 9ed9c867..001780e1 100644
--- a/modelscope/pipelines/cv/video_summarization_pipeline.py
+++ b/modelscope/pipelines/cv/video_summarization_pipeline.py
@@ -106,4 +106,4 @@ class VideoSummarizationPipeline(Pipeline):
             summary = generate_summary([change_points], [scores], [n_frames],
                                        [picks])[0]
 
-        return summary
+        return summary.tolist()
diff --git a/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py b/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py
index f5a180b6..13032314 100644
--- a/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py
+++ b/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
 from typing import Any, Dict
 
 from modelscope.metainfo import Pipelines
diff --git a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py
index 99cccee1..81a5f8cd 100644
--- a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py
+++ b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py
@@ -52,6 +52,4 @@ class ImageCaptioningPipeline(Pipeline):
             return super().forward(inputs, **forward_params)
 
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
-        if isinstance(self.model, OfaForAllTasks):
-            return inputs
-        return {OutputKeys.CAPTION: inputs}
+        return inputs
diff --git a/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py b/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py
new file mode 100644
index 00000000..329d79bf
--- /dev/null
+++ b/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py
@@ -0,0 +1,51 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Model, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import MPlugPreprocessor, Preprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_text_retrieval, module_name=Pipelines.image_text_retrieval)
+class ImageTextRetrievalPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 **kwargs):
+        """Create an image text retrieval pipeline for prediction.
+        Args:
+            model: model id on modelscope hub, or a `Model` instance.
+            preprocessor: optional; when None an `MPlugPreprocessor` is
+                built from the model directory.
+        """
+        # The base class is initialised once, below, with the resolved model.
+        assert isinstance(model, str) or isinstance(model, Model), \
+            f'model must be a single str or Model, but got {type(model)}'
+        if isinstance(model, str):
+            pipe_model = Model.from_pretrained(model)
+        elif isinstance(model, Model):
+            pipe_model = model
+        else:
+            raise NotImplementedError
+        pipe_model.model.eval()
+        if preprocessor is None:
+            preprocessor = MPlugPreprocessor(pipe_model.model_dir)
+        super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():  # inference only, no gradients needed
+            return super().forward(inputs, **forward_params)
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs  # forward() output is returned unchanged
diff --git a/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py b/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py
index d15970d2..76011be0 100644
--- a/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py
+++ b/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict
 
 from modelscope.metainfo import Pipelines
diff --git a/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py b/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py
new file mode 100644
index 00000000..cafd6555
--- /dev/null
+++ b/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py
@@ -0,0 +1,32 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.pipelines.base import Input, Model, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.multi_modal_similarity, module_name=Pipelines.multi_modal_similarity)
+class TEAMMultiModalSimilarityPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a multimodal similarity pipeline
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        return input
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        return self.model(input)
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py b/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py
index 406538cf..7516c5be 100644
--- a/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py
+++ b/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py
@@ -1,9 +1,12 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Optional
 
 import torch
 
 from modelscope.metainfo import Pipelines
-from modelscope.models.multi_modal import OfaForTextToImageSynthesis
+from modelscope.models.multi_modal import (
+    MultiStageDiffusionForTextToImageSynthesis, OfaForTextToImageSynthesis)
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
@@ -48,7 +51,9 @@ class TextToImageSynthesisPipeline(Pipeline):
             return input
 
     def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
-        if isinstance(self.model, OfaForTextToImageSynthesis):
+        if isinstance(self.model,
+                      (OfaForTextToImageSynthesis,
+                       MultiStageDiffusionForTextToImageSynthesis)):
             return self.model(input)
         return self.model.generate(input)
 
diff --git a/modelscope/pipelines/multi_modal/video_multi_modal_embedding_pipeline.py b/modelscope/pipelines/multi_modal/video_multi_modal_embedding_pipeline.py
index bc697b05..3a9284f1 100644
--- a/modelscope/pipelines/multi_modal/video_multi_modal_embedding_pipeline.py
+++ b/modelscope/pipelines/multi_modal/video_multi_modal_embedding_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict
 
 from modelscope.metainfo import Pipelines
diff --git a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
index b2442a3e..86177074 100644
--- a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
+++ b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
@@ -56,6 +56,4 @@ class VisualQuestionAnsweringPipeline(Pipeline):
         Returns:
             Dict[str, str]: the prediction results
         """
-        if isinstance(self.model, OfaForAllTasks):
-            return inputs
-        return {OutputKeys.TEXT: inputs}
+        return inputs
diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py
index 0cdb633c..5267b5b2 100644
--- a/modelscope/pipelines/nlp/__init__.py
+++ b/modelscope/pipelines/nlp/__init__.py
@@ -5,46 +5,61 @@ from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
     from .conversational_text_to_sql_pipeline import ConversationalTextToSqlPipeline
+    from .table_question_answering_pipeline import TableQuestionAnsweringPipeline
     from .dialog_intent_prediction_pipeline import DialogIntentPredictionPipeline
     from .dialog_modeling_pipeline import DialogModelingPipeline
     from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline
+    from .document_segmentation_pipeline import DocumentSegmentationPipeline
+    from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline
+    from .feature_extraction_pipeline import FeatureExtractionPipeline
     from .fill_mask_pipeline import FillMaskPipeline
+    from .fill_mask_ponet_pipeline import FillMaskPonetPipeline
+    from .information_extraction_pipeline import InformationExtractionPipeline
     from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline
-    from .pair_sentence_classification_pipeline import PairSentenceClassificationPipeline
-    from .single_sentence_classification_pipeline import SingleSentenceClassificationPipeline
+    from .passage_ranking_pipeline import PassageRankingPipeline
+    from .sentence_embedding_pipeline import SentenceEmbeddingPipeline
     from .sequence_classification_pipeline import SequenceClassificationPipeline
-    from .text_generation_pipeline import TextGenerationPipeline
-    from .translation_pipeline import TranslationPipeline
-    from .word_segmentation_pipeline import WordSegmentationPipeline
-    from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline
     from .summarization_pipeline import SummarizationPipeline
     from .text_classification_pipeline import TextClassificationPipeline
     from .text_error_correction_pipeline import TextErrorCorrectionPipeline
+    from .text_generation_pipeline import TextGenerationPipeline
+    from .text2text_generation_pipeline import Text2TextGenerationPipeline
+    from .token_classification_pipeline import TokenClassificationPipeline
+    from .translation_pipeline import TranslationPipeline
+    from .word_segmentation_pipeline import WordSegmentationPipeline
+    from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline
 
 else:
     _import_structure = {
         'conversational_text_to_sql_pipeline':
         ['ConversationalTextToSqlPipeline'],
+        'table_question_answering_pipeline':
+        ['TableQuestionAnsweringPipeline'],
         'dialog_intent_prediction_pipeline':
         ['DialogIntentPredictionPipeline'],
         'dialog_modeling_pipeline': ['DialogModelingPipeline'],
         'dialog_state_tracking_pipeline': ['DialogStateTrackingPipeline'],
+        'document_segmentation_pipeline': ['DocumentSegmentationPipeline'],
+        'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline'],
+        'feature_extraction_pipeline': ['FeatureExtractionPipeline'],
         'fill_mask_pipeline': ['FillMaskPipeline'],
-        'single_sentence_classification_pipeline':
-        ['SingleSentenceClassificationPipeline'],
-        'pair_sentence_classification_pipeline':
-        ['PairSentenceClassificationPipeline'],
+        'fill_mask_ponet_pipeline': ['FillMaskPonetPipeline'],
+        'information_extraction_pipeline': ['InformationExtractionPipeline'],
+        'named_entity_recognition_pipeline':
+        ['NamedEntityRecognitionPipeline'],
+        'passage_ranking_pipeline': ['PassageRankingPipeline'],
+        'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline'],
         'sequence_classification_pipeline': ['SequenceClassificationPipeline'],
+        'summarization_pipeline': ['SummarizationPipeline'],
+        'text_classification_pipeline': ['TextClassificationPipeline'],
+        'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'],
         'text_generation_pipeline': ['TextGenerationPipeline'],
+        'text2text_generation_pipeline': ['Text2TextGenerationPipeline'],
+        'token_classification_pipeline': ['TokenClassificationPipeline'],
+        'translation_pipeline': ['TranslationPipeline'],
         'word_segmentation_pipeline': ['WordSegmentationPipeline'],
         'zero_shot_classification_pipeline':
         ['ZeroShotClassificationPipeline'],
-        'named_entity_recognition_pipeline':
-        ['NamedEntityRecognitionPipeline'],
-        'translation_pipeline': ['TranslationPipeline'],
-        'summarization_pipeline': ['SummarizationPipeline'],
-        'text_classification_pipeline': ['TextClassificationPipeline'],
-        'text_error_correction_pipeline': ['TextErrorCorrectionPipeline']
     }
 
     import sys
diff --git a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py
index 399dad5a..c46e8c81 100644
--- a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py
+++ b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py
@@ -11,8 +11,8 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import ConversationalTextToSqlPreprocessor
-from modelscope.preprocessors.star.fields.common_utils import SubPreprocessor
-from modelscope.preprocessors.star.fields.process_dataset import process_tables
+from modelscope.preprocessors.star.fields import (SubPreprocessor,
+                                                  process_tables)
 from modelscope.utils.constant import Tasks
 
 __all__ = ['ConversationalTextToSqlPipeline']
diff --git a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py
index 0d2c96d7..79d32ace 100644
--- a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py
+++ b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Union
 
 from modelscope.metainfo import Pipelines
diff --git a/modelscope/pipelines/nlp/distributed_plug_pipeline.py b/modelscope/pipelines/nlp/distributed_plug_pipeline.py
new file mode 100644
index 00000000..e5c05e86
--- /dev/null
+++ b/modelscope/pipelines/nlp/distributed_plug_pipeline.py
@@ -0,0 +1,109 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.nlp.plug import DistributedPlug
+from modelscope.pipelines.base import DistributedPipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import TextGenerationPreprocessor
+from modelscope.utils.constant import Tasks
+
+
+@PIPELINES.register_module(
+    Tasks.text_generation, module_name=Pipelines.plug_generation)
+class DistributedPlugPipeline(DistributedPipeline):
+    """This class is used to instantiate the plug model.
+    """
+
+    model = None
+
+    def __init__(self,
+                 model,
+                 preprocessor=None,
+                 first_sequence='sentence',
+                 **kwargs):
+        """Create a plug pipeline instance.
+
+        @param model: The model_id of plug(damo/nlp_plug_text-generation_27B).
+        The default path to damo/nlp_plug_text-generation_27B can be obtained by function
+        get_cache_dir("damo/nlp_plug_text-generation_27B"), the model should be downloaded to
+        this path before calling this class by model_id.
+        The model can be downloaded from the link on
+        https://modelscope.cn/models/damo/nlp_plug_text-generation_27B/summary.
+        After downloading, you should have a plug model structure like this:
+        /your/path/to/damo/nlp_plug_text-generation_27B
+            |_ config.json
+            |_ configuration.json
+            |_ ds_zero-offload_10B_config.json
+            |_ vocab.txt
+            |_ model <-- an empty directory
+
+        Model binaries shall be downloaded separately to populate the model directory, so that
+        the model directory would contain the following binaries:
+            |_ model
+                |_ mp_rank_00_model_states.pt
+                |_ mp_rank_01_model_states.pt
+                |_ mp_rank_02_model_states.pt
+                |_ mp_rank_03_model_states.pt
+                |_ mp_rank_04_model_states.pt
+                |_ mp_rank_05_model_states.pt
+                |_ mp_rank_06_model_states.pt
+                |_ mp_rank_07_model_states.pt
+        @param preprocessor: The optional preprocessor, if not passed in, a TextGenerationPreprocessor will
+            be used as default.
+        @param first_sequence: The first_sequence key name if the input format is a dict.
+        @param kwargs:
+            sequence_length: The input sequence_length.
+        """
+        if preprocessor is None:
+            preprocessor = TextGenerationPreprocessor(
+                model,
+                first_sequence=first_sequence,
+                sequence_length=kwargs.pop('sequence_length', 512))
+        super().__init__(model, preprocessor=preprocessor, **kwargs)
+        assert hasattr(preprocessor, 'tokenizer')
+        self.cls_token_id = preprocessor.tokenizer.cls_token_id
+
+    @classmethod
+    def _forward_one(cls, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        with torch.no_grad():
+            return cls.model.generate(inputs['inputs'],
+                                      **inputs['forward_params'])
+
+    def _sanitize_parameters(self, **pipeline_parameters):
+        return {}, pipeline_parameters, {}
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        batch_size = inputs['input_ids'].shape[0]
+        dec_input_ids = torch.full([batch_size, 1],
+                                   self.cls_token_id,
+                                   dtype=torch.long)
+        inputs['dec_input_ids'] = dec_input_ids
+        res = super().forward(inputs, **forward_params)
+        return res
+
+    @classmethod
+    def _instantiate_one(cls, rank, model_dir, **kwargs):
+        cls.model = DistributedPlug(model_dir, rank, **kwargs)
+        cls.model.eval()
+
+    def postprocess(self, inputs: Dict[str, Any],
+                    **postprocess_params) -> Dict[str, str]:
+        """Decode the generated token ids into a single text string.
+
+        Args:
+            inputs (Dict[str, Any]): holds the ids under 'generate_context'.
+
+        Returns:
+            Dict[str, str]: OutputKeys.TEXT -> tokens joined, '##' removed.
+        """
+        from modelscope.outputs import OutputKeys
+        generate_context = inputs['generate_context']
+        generate_context = ''.join(
+            self.preprocessor.tokenizer.convert_ids_to_tokens(
+                generate_context)).replace('[UNK]', '“').replace('##', '')
+        return {OutputKeys.TEXT: generate_context}
diff --git a/modelscope/pipelines/nlp/document_segmentation_pipeline.py b/modelscope/pipelines/nlp/document_segmentation_pipeline.py
new file mode 100644
index 00000000..00837bf3
--- /dev/null
+++ b/modelscope/pipelines/nlp/document_segmentation_pipeline.py
@@ -0,0 +1,175 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import re
+from typing import Any, Dict, List, Union
+
+import numpy as np
+import torch
+from datasets import Dataset
+from transformers.models.bert.modeling_bert import BertConfig
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import DocumentSegmentationPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['DocumentSegmentationPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.document_segmentation, module_name=Pipelines.document_segmentation)
+class DocumentSegmentationPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: DocumentSegmentationPreprocessor = None,
+                 **kwargs):
+
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+
+        self.model_dir = model.model_dir
+        config = BertConfig.from_pretrained(model.model_dir, num_labels=2)
+
+        self.document_segmentation_model = model.build_with_config(
+            config=config)
+
+        if preprocessor is None:
+            preprocessor = DocumentSegmentationPreprocessor(
+                self.model_dir, config)
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+        self.preprocessor = preprocessor
+
+    def __call__(self, documents: Union[List[str], str]) -> Dict[str, Any]:
+        output = self.predict(documents)
+        output = self.postprocess(output)
+        return output
+
+    def predict(self, documents: Union[List[str], str]) -> Dict[str, Any]:
+        pred_samples = self.cut_documents(documents)
+        predict_examples = Dataset.from_dict(pred_samples)
+
+        # Predict Feature Creation
+        predict_dataset = self.preprocessor(predict_examples)
+        num_examples = len(
+            predict_examples[self.preprocessor.context_column_name])
+        num_samples = len(
+            predict_dataset[self.preprocessor.context_column_name])
+
+        predict_dataset.pop('segment_ids')
+        labels = predict_dataset.pop('labels')
+        sentences = predict_dataset.pop('sentences')
+        example_ids = predict_dataset.pop(
+            self.preprocessor.example_id_column_name)
+
+        with torch.no_grad():
+            input = {
+                key: torch.tensor(val)
+                for key, val in predict_dataset.items()
+            }
+            predictions = self.document_segmentation_model.forward(
+                **input).logits
+
+        predictions = np.argmax(predictions, axis=2)
+        assert len(sentences) == len(
+            predictions), 'sample {}  infer_sample {} prediction {}'.format(
+                num_samples, len(sentences), len(predictions))
+        # Remove ignored index (special tokens)
+        true_predictions = [
+            [
+                self.preprocessor.label_list[p]
+                for (p, l) in zip(prediction, label) if l != -100  # noqa *
+            ] for prediction, label in zip(predictions, labels)
+        ]
+
+        true_labels = [
+            [
+                self.preprocessor.label_list[l]
+                for (p, l) in zip(prediction, label) if l != -100  # noqa *
+            ] for prediction, label in zip(predictions, labels)
+        ]
+
+        # Save predictions
+        out = []
+        for i in range(num_examples):
+            out.append({'sentences': [], 'labels': [], 'predictions': []})
+
+        for prediction, sentence_list, label, example_id in zip(
+                true_predictions, sentences, true_labels, example_ids):
+            if len(label) < len(sentence_list):
+                label.append('B-EOP')
+                prediction.append('B-EOP')
+            assert len(sentence_list) == len(prediction), '{} {}'.format(
+                len(sentence_list), len(prediction))
+            assert len(sentence_list) == len(label), '{} {}'.format(
+                len(sentence_list), len(label))
+            out[example_id]['sentences'].extend(sentence_list)
+            out[example_id]['labels'].extend(label)
+            out[example_id]['predictions'].extend(prediction)
+
+        return out
+
+    def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        """Join each document's sentences, breaking the line after 'B-EOP'.
+
+        Args:
+            inputs: one dict per document with 'sentences' and 'predictions'.
+
+        Returns:
+            OutputKeys.TEXT -> a str for one document, else a list of str.
+        """
+        result = []
+        list_count = len(inputs)
+        for num in range(list_count):
+            res = []
+            for s, p in zip(inputs[num]['sentences'],
+                            inputs[num]['predictions']):
+                s = s.strip()
+                if p == 'B-EOP':
+                    s = ''.join([s, '\n\t'])
+                res.append(s)
+
+            document = ('\t' + ''.join(res))
+            result.append(document)
+
+        if list_count == 1:
+            return {OutputKeys.TEXT: result[0]}
+        else:
+            return {OutputKeys.TEXT: result}
+
+    def cut_documents(self, para: Union[List[str], str]):
+        """Split every document into sentences and build matching labels.
+
+        A single string is handled as a one-document list.
+        """
+        docs = [para] if isinstance(para, str) else para
+        sentences = []
+        labels = []
+        for doc in docs:
+            pieces = self.cut_sentence(doc)
+            # every sentence is labelled 'O' except the last one ('B-EOP')
+            labels.append(['O'] * (len(pieces) - 1) + ['B-EOP'])
+            sentences.append(pieces)
+        # an example id is the position of its document in the input
+        example_id = list(range(len(sentences)))
+
+        return {
+            'example_id': example_id,
+            'sentences': sentences,
+            'labels': labels
+        }
+
+    def cut_sentence(self, para):
+        para = re.sub(r'([。！.!？\?])([^”’])', r'\1\n\2', para)  # noqa *
+        para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para)  # noqa *
+        para = re.sub(r'(\…{2})([^”’])', r'\1\n\2', para)  # noqa *
+        para = re.sub(r'([。！？\?][”’])([^，。！？\?])', r'\1\n\2', para)  # noqa *
+        para = para.rstrip()
+        return [_ for _ in para.split('\n') if _]
diff --git a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py
new file mode 100644
index 00000000..1d46d8fd
--- /dev/null
+++ b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py
@@ -0,0 +1,78 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.models.nlp import SbertForFaqQuestionAnswering
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor
+from modelscope.utils.constant import Tasks
+
+__all__ = ['FaqQuestionAnsweringPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.faq_question_answering, module_name=Pipelines.faq_question_answering)
+class FaqQuestionAnsweringPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[str, SbertForFaqQuestionAnswering],
+                 preprocessor: FaqQuestionAnsweringPreprocessor = None,
+                 **kwargs):
+        model = model if isinstance(
+            model,
+            SbertForFaqQuestionAnswering) else Model.from_pretrained(model)
+        model.eval()
+        if preprocessor is None:
+            preprocessor = FaqQuestionAnsweringPreprocessor(
+                model.model_dir, **kwargs)
+        self.preprocessor = preprocessor
+        super(FaqQuestionAnsweringPipeline, self).__init__(
+            model=model, preprocessor=preprocessor, **kwargs)
+
+    def _sanitize_parameters(self, **pipeline_parameters):
+        return pipeline_parameters, pipeline_parameters, pipeline_parameters
+
+    def get_sentence_embedding(self, inputs, max_len=None):
+        inputs = self.preprocessor.batch_encode(inputs, max_length=max_len)
+        sentence_vecs = self.model.forward_sentence_embedding(inputs)
+        sentence_vecs = sentence_vecs.detach().tolist()
+        return sentence_vecs
+
+    def forward(self, inputs: Union[list, Dict[str, Any]],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return self.model(inputs)
+
+    def postprocess(self, inputs: Union[list, Dict[str, Any]],
+                    **postprocess_params) -> Dict[str, Any]:
+        """Pair every score with its label and rank each row by score.
+
+        Args:
+            inputs: model output; `inputs['scores']` is read as rows of
+                per-label scores (must support `.tolist()`).
+
+        Returns:
+            Dict[str, Any]: OutputKeys.OUTPUT -> one list per score row of
+            {LABEL, SCORE} dicts, highest score first.
+        """
+        scores = inputs['scores']
+        predictions = []
+        for row in scores.tolist():
+            # a label id is the position of its score within the row
+            prediction = [{
+                OutputKeys.LABEL: self.preprocessor.get_label(label_id),
+                OutputKeys.SCORE: score
+            } for label_id, score in enumerate(row)]
+            # sorted() already returns a new list; no extra list() needed
+            predictions.append(
+                sorted(
+                    prediction, key=lambda d: d[OutputKeys.SCORE],
+                    reverse=True))
+
+        return {OutputKeys.OUTPUT: predictions}
diff --git a/modelscope/pipelines/nlp/feature_extraction_pipeline.py b/modelscope/pipelines/nlp/feature_extraction_pipeline.py
new file mode 100644
index 00000000..3af0c28d
--- /dev/null
+++ b/modelscope/pipelines/nlp/feature_extraction_pipeline.py
@@ -0,0 +1,82 @@
+import os
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import NLPPreprocessor, Preprocessor
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+
+__all__ = ['FeatureExtractionPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.feature_extraction, module_name=Pipelines.feature_extraction)
+class FeatureExtractionPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 first_sequence='sentence',
+                 **kwargs):
+        """Use `model` and `preprocessor` to create a nlp feature extraction pipeline for prediction
+
+        Args:
+            model (str or Model): Supply either a local model dir which supported feature extraction task, or a
+            no-head model id from the model hub, or a torch model instance.
+            preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for
+            the model if supplied.
+            first_sequence: The key to read the sentence in.
+            sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value.
+
+            NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence'
+            param will have no effect.
+
+            Example:
+            >>> from modelscope.pipelines import pipeline
+            >>> pipe_ins = pipeline('feature_extraction', model='damo/nlp_structbert_feature-extraction_english-large')
+            >>> input = 'Everything you love is treasure'
+            >>> print(pipe_ins(input))
+
+
+        """
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+
+        if preprocessor is None:
+            preprocessor = NLPPreprocessor(
+                model.model_dir,
+                padding=kwargs.pop('padding', False),
+                sequence_length=kwargs.pop('sequence_length', 128))
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+        self.preprocessor = preprocessor
+        self.config = Config.from_file(
+            os.path.join(model.model_dir, ModelFile.CONFIGURATION))
+        self.tokenizer = preprocessor.tokenizer
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return self.model(**inputs, **forward_params)
+
+    def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        """Convert the text embedding in the model output to a plain list.
+
+        Args:
+            inputs (Dict[str, Tensor]): holds OutputKeys.TEXT_EMBEDDING.
+
+        Returns:
+            Dict[str, list]: OutputKeys.TEXT_EMBEDDING -> its `.tolist()`.
+        """
+
+        return {
+            OutputKeys.TEXT_EMBEDDING:
+            inputs[OutputKeys.TEXT_EMBEDDING].tolist()
+        }
diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py
index 60a9631b..3d515e2d 100644
--- a/modelscope/pipelines/nlp/fill_mask_pipeline.py
+++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 from typing import Any, Dict, Optional, Union
 
@@ -8,12 +10,15 @@ from modelscope.models import Model
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline, Tensor
 from modelscope.pipelines.builder import PIPELINES
-from modelscope.preprocessors import FillMaskPreprocessor, Preprocessor
+from modelscope.preprocessors import NLPPreprocessor, Preprocessor
 from modelscope.utils.config import Config
 from modelscope.utils.constant import ModelFile, Tasks
 
 __all__ = ['FillMaskPipeline']
-_type_map = {'veco': 'roberta', 'sbert': 'bert'}
+_type_map = {
+    'veco': 'roberta',
+    'sbert': 'bert',
+}
 
 
 @PIPELINES.register_module(Tasks.fill_mask, module_name=Pipelines.fill_mask)
@@ -52,7 +57,7 @@ class FillMaskPipeline(Pipeline):
             model, Model) else Model.from_pretrained(model)
 
         if preprocessor is None:
-            preprocessor = FillMaskPreprocessor(
+            preprocessor = NLPPreprocessor(
                 fill_mask_model.model_dir,
                 first_sequence=first_sequence,
                 second_sequence=None,
@@ -65,7 +70,7 @@ class FillMaskPipeline(Pipeline):
         self.config = Config.from_file(
             os.path.join(fill_mask_model.model_dir, ModelFile.CONFIGURATION))
         self.tokenizer = preprocessor.tokenizer
-        self.mask_id = {'roberta': 250001, 'bert': 103}
+        self.mask_id = {'roberta': 250001, 'bert': 103, 'deberta_v2': 4}
 
         self.rep_map = {
             'bert': {
@@ -85,13 +90,20 @@ class FillMaskPipeline(Pipeline):
                 '<s>': '',
                 '</s>': '',
                 '<unk>': ' '
-            }
+            },
+            'deberta_v2': {
+                '[PAD]': '',
+                r' +': ' ',
+                '[SEP]': '',
+                '[CLS]': '',
+                '[UNK]': ''
+            },
         }
 
     def forward(self, inputs: Dict[str, Any],
                 **forward_params) -> Dict[str, Any]:
         with torch.no_grad():
-            return self.model(inputs, **forward_params)
+            return self.model(**inputs, **forward_params)
 
     def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
         """process the prediction results
@@ -106,7 +118,10 @@ class FillMaskPipeline(Pipeline):
         logits = inputs[OutputKeys.LOGITS].detach().cpu().numpy()
         input_ids = inputs[OutputKeys.INPUT_IDS].detach().cpu().numpy()
         pred_ids = np.argmax(logits, axis=-1)
-        model_type = self.model.config.model_type
+        if hasattr(self.model.config, 'backbone'):
+            model_type = self.model.config.backbone.type
+        else:
+            model_type = self.model.config.model_type
         process_type = model_type if model_type in self.mask_id else _type_map[
             model_type]
         rst_ids = np.where(input_ids == self.mask_id[process_type], pred_ids,
diff --git a/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py b/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py
new file mode 100644
index 00000000..9770fc38
--- /dev/null
+++ b/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py
@@ -0,0 +1,136 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import FillMaskPoNetPreprocessor, Preprocessor
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+
+__all__ = ['FillMaskPonetPipeline']
+_type_map = {'ponet': 'bert'}
+
+
+@PIPELINES.register_module(
+    Tasks.fill_mask, module_name=Pipelines.fill_mask_ponet)
+class FillMaskPonetPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 first_sequence='sentence',
+                 **kwargs):
+        """Use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction
+
+        Args:
+            model (str or Model): Supply either a local model dir which supported fill-mask task,
+            or a fill-mask model id from the model hub, or a torch model instance.
+            preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for
+            the model if supplied.
+            first_sequence: The key to read the sentence in.
+
+            NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence'
+            param will have no effect.
+
+            Example:
+            >>> from modelscope.pipelines import pipeline
+            >>> pipeline_ins = pipeline(
+                    'fill-mask', model='damo/nlp_ponet_fill-mask_english-base')
+            >>> input = 'Everything in [MASK] you call reality is really [MASK] a reflection of your [MASK].'
+            >>> print(pipeline_ins(input))
+
+            NOTE2: Please pay attention to the model's special tokens.
+            If bert based model(bert, structbert, etc.) is used, the mask token is '[MASK]'.
+            If the xlm-roberta(xlm-roberta, veco, etc.) based model is used, the mask token is '<mask>'.
+            To view other examples please check the tests/pipelines/test_fill_mask.py.
+        """
+        fill_mask_model = model if isinstance(
+            model, Model) else Model.from_pretrained(model)
+
+        self.config = Config.from_file(
+            os.path.join(fill_mask_model.model_dir, ModelFile.CONFIGURATION))
+
+        if preprocessor is None:
+            preprocessor = FillMaskPoNetPreprocessor(
+                fill_mask_model.model_dir,
+                first_sequence=first_sequence,
+                second_sequence=None,
+                sequence_length=kwargs.pop('sequence_length', 512))
+
+        fill_mask_model.eval()
+        super().__init__(
+            model=fill_mask_model, preprocessor=preprocessor, **kwargs)
+
+        self.preprocessor = preprocessor
+
+        self.tokenizer = preprocessor.tokenizer
+        self.mask_id = {'roberta': 250001, 'bert': 103}
+
+        self.rep_map = {
+            'bert': {
+                '[unused0]': '',
+                '[PAD]': '',
+                '[unused1]': '',
+                r' +': ' ',
+                '[SEP]': '',
+                '[unused2]': '',
+                '[CLS]': '',
+                '[UNK]': ''
+            },
+            'roberta': {
+                r' +': ' ',
+                '<mask>': '<q>',
+                '<pad>': '',
+                '<s>': '',
+                '</s>': '',
+                '<unk>': ' '
+            }
+        }
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return self.model(**inputs, **forward_params)
+
+    def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        """process the prediction results
+
+        Args:
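+            inputs (Dict[str, Any]): model output tensors stored under OutputKeys.LOGITS and OutputKeys.INPUT_IDS
+
+        Returns:
+            Dict[str, Any]: the decoded strings, one per batch item, under the OutputKeys.TEXT key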
+        """
+        import numpy as np
+        logits = inputs[OutputKeys.LOGITS].detach().cpu().numpy()
+        input_ids = inputs[OutputKeys.INPUT_IDS].detach().cpu().numpy()
+        pred_ids = np.argmax(logits, axis=-1)
+        model_type = self.model.config.model_type
+        process_type = model_type if model_type in self.mask_id else _type_map[
+            model_type]
+        rst_ids = np.where(input_ids == self.mask_id[process_type], pred_ids,
+                           input_ids)
+
+        def rep_tokens(string, rep_map):
+            for k, v in rep_map.items():
+                string = string.replace(k, v)
+            return string.strip()
+
+        pred_strings = []
+        for ids in rst_ids:  # batch
+            if 'language' in self.config.model and self.config.model.language == 'zh':
+                pred_string = self.tokenizer.convert_ids_to_tokens(ids)
+                pred_string = ''.join(pred_string)
+            else:
+                pred_string = self.tokenizer.decode(ids)
+            pred_string = rep_tokens(pred_string, self.rep_map[process_type])
+            pred_strings.append(pred_string)
+
+        return {OutputKeys.TEXT: pred_strings}
diff --git a/modelscope/pipelines/nlp/information_extraction_pipeline.py b/modelscope/pipelines/nlp/information_extraction_pipeline.py
new file mode 100644
index 00000000..763e941c
--- /dev/null
+++ b/modelscope/pipelines/nlp/information_extraction_pipeline.py
@@ -0,0 +1,43 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import (Preprocessor,
+                                      RelationExtractionPreprocessor)
+from modelscope.utils.constant import Tasks
+
+__all__ = ['InformationExtractionPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.information_extraction, module_name=Pipelines.relation_extraction)
+class InformationExtractionPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 **kwargs):
+
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+        if preprocessor is None:
+            preprocessor = RelationExtractionPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 512))
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return self.model(**inputs, **forward_params)
+
+    def postprocess(self, inputs: Dict[str, Any],
+                    **postprocess_params) -> Dict[str, str]:
+        return inputs
diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
index b0b06c88..7275feca 100644
--- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
+++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Optional, Union
 
 import torch
@@ -7,7 +9,8 @@ from modelscope.models import Model
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
-from modelscope.preprocessors import NERPreprocessor, Preprocessor
+from modelscope.preprocessors import (Preprocessor,
+                                      TokenClassificationPreprocessor)
 from modelscope.utils.constant import Tasks
 
 __all__ = ['NamedEntityRecognitionPipeline']
@@ -44,7 +47,7 @@ class NamedEntityRecognitionPipeline(Pipeline):
         model = model if isinstance(model,
                                     Model) else Model.from_pretrained(model)
         if preprocessor is None:
-            preprocessor = NERPreprocessor(
+            preprocessor = TokenClassificationPreprocessor(
                 model.model_dir,
                 sequence_length=kwargs.pop('sequence_length', 512))
         model.eval()
@@ -84,6 +87,9 @@ class NamedEntityRecognitionPipeline(Pipeline):
                     entity['span'] = text[entity['start']:entity['end']]
                     entities.append(entity)
                     entity = {}
+        if entity:
+            entity['span'] = text[entity['start']:entity['end']]
+            entities.append(entity)
         outputs = {OutputKeys.OUTPUT: entities}
 
         return outputs
diff --git a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py
deleted file mode 100644
index 5248db8c..00000000
--- a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py
+++ /dev/null
@@ -1,57 +0,0 @@
-from typing import Union
-
-from modelscope.models.base import Model
-from ...metainfo import Pipelines
-from ...preprocessors import (PairSentenceClassificationPreprocessor,
-                              Preprocessor)
-from ...utils.constant import Tasks
-from ..builder import PIPELINES
-from .sequence_classification_pipeline_base import \
-    SequenceClassificationPipelineBase
-
-__all__ = ['PairSentenceClassificationPipeline']
-
-
-@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli)
-@PIPELINES.register_module(
-    Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity)
-class PairSentenceClassificationPipeline(SequenceClassificationPipelineBase):
-
-    def __init__(self,
-                 model: Union[Model, str],
-                 preprocessor: Preprocessor = None,
-                 first_sequence='first_sequence',
-                 second_sequence='second_sequence',
-                 **kwargs):
-        """Use `model` and `preprocessor` to create a nlp pair sequence classification pipeline for prediction.
-
-        Args:
-            model (str or Model): Supply either a local model dir which supported the sequence classification task,
-            or a model id from the model hub, or a torch model instance.
-            preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for
-            the model if supplied.
-            first_sequence: The key to read the first sentence in.
-            second_sequence: The key to read the second sentence in.
-            sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value.
-
-            NOTE: Inputs of type 'tuple' or 'list' are also supported. In this scenario, the 'first_sequence' and
-            'second_sequence' param will have no effect.
-
-            Example:
-            >>> from modelscope.pipelines import pipeline
-            >>> pipeline_ins = pipeline(task='nli', model='damo/nlp_structbert_nli_chinese-base')
-            >>> sentence1 = '四川商务职业学院和四川财经职业学院哪个好？'
-            >>> sentence2 = '四川商务职业学院商务管理在哪个校区？'
-            >>> print(pipeline_ins((sentence1, sentence2)))
-            >>> # Or use the dict input:
-            >>> print(pipeline_ins({'first_sequence': sentence1, 'second_sequence': sentence2}))
-
-            To view other examples plese check the tests/pipelines/test_nli.py.
-        """
-        if preprocessor is None:
-            preprocessor = PairSentenceClassificationPreprocessor(
-                model.model_dir if isinstance(model, Model) else model,
-                first_sequence=first_sequence,
-                second_sequence=second_sequence,
-                sequence_length=kwargs.pop('sequence_length', 512))
-        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
diff --git a/modelscope/pipelines/nlp/passage_ranking_pipeline.py b/modelscope/pipelines/nlp/passage_ranking_pipeline.py
new file mode 100644
index 00000000..1d818ac0
--- /dev/null
+++ b/modelscope/pipelines/nlp/passage_ranking_pipeline.py
@@ -0,0 +1,60 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import PassageRankingPreprocessor, Preprocessor
+from modelscope.utils.constant import Tasks
+
+__all__ = ['PassageRankingPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.passage_ranking, module_name=Pipelines.passage_ranking)
+class PassageRankingPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 **kwargs):
+        """Use `model` and `preprocessor` to create a nlp passage ranking pipeline for prediction.
+
+        Args:
+            model (str or Model): Supply either a local model dir that supports the passage ranking task,
+            or a model id from the model hub, or a torch model instance.
+            preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for
+            the model if supplied.
+            sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value.
+        """
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+
+        if preprocessor is None:
+            preprocessor = PassageRankingPreprocessor(
+                model.model_dir if isinstance(model, Model) else model,
+                sequence_length=kwargs.pop('sequence_length', 128))
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return {**self.model(inputs, **forward_params)}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """process the prediction results
+        Args:
+            inputs (Dict[str, Any]): the model output; the scores are read from the OutputKeys.SCORES key
+
+        Returns:
+            Dict[str, Any]: the scores, under the OutputKeys.SCORES key
+        """
+        pred_list = inputs[OutputKeys.SCORES]
+
+        return {OutputKeys.SCORES: pred_list}
diff --git a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py
new file mode 100644
index 00000000..16dedb2e
--- /dev/null
+++ b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py
@@ -0,0 +1,62 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import (Preprocessor,
+                                      SentenceEmbeddingPreprocessor)
+from modelscope.utils.constant import Tasks
+
+__all__ = ['SentenceEmbeddingPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.sentence_embedding, module_name=Pipelines.sentence_embedding)
+class SentenceEmbeddingPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 first_sequence='first_sequence',
+                 **kwargs):
+        """Use `model` and `preprocessor` to create a nlp text dual encoder then generates the text representation.
+        Args:
+            model (str or Model): Supply either a local model dir that supports the sentence embedding task,
+            or a model id from the model hub, or a torch model instance.
+            preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for
+            the model if supplied.
+            sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value.
+        """
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+        if preprocessor is None:
+            preprocessor = SentenceEmbeddingPreprocessor(
+                model.model_dir if isinstance(model, Model) else model,
+                first_sequence=first_sequence,
+                sequence_length=kwargs.pop('sequence_length', 128))
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return {**self.model(inputs, **forward_params)}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """process the prediction results
+
+        Args:
+            inputs (Dict[str, Any]): the model output; read via OutputKeys.TEXT_EMBEDDING and OutputKeys.SCORES
+
+        Returns:
+            Dict[str, Any]: the text embeddings and the scores, under those same two keys
+        """
+        embs = inputs[OutputKeys.TEXT_EMBEDDING]
+        scores = inputs[OutputKeys.SCORES]
+        return {OutputKeys.TEXT_EMBEDDING: embs, OutputKeys.SCORES: scores}
diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline.py b/modelscope/pipelines/nlp/sequence_classification_pipeline.py
index 7fe8aace..8d0e1dcd 100644
--- a/modelscope/pipelines/nlp/sequence_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/sequence_classification_pipeline.py
@@ -1,48 +1,64 @@
 from typing import Any, Dict, Union
 
 import numpy as np
+import torch
 
 from modelscope.metainfo import Pipelines
-from modelscope.models import Model
-from modelscope.models.nlp import BertForSequenceClassification
+from modelscope.models.base import Model
 from modelscope.outputs import OutputKeys
-from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
-from modelscope.preprocessors import SequenceClassificationPreprocessor
+from modelscope.preprocessors import (Preprocessor,
+                                      SequenceClassificationPreprocessor)
 from modelscope.utils.constant import Tasks
 
-__all__ = ['SequenceClassificationPipeline']
-
 
 @PIPELINES.register_module(
     Tasks.text_classification, module_name=Pipelines.sentiment_analysis)
+@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli)
+@PIPELINES.register_module(
+    Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity)
+@PIPELINES.register_module(
+    Tasks.text_classification, module_name=Pipelines.sentiment_classification)
 class SequenceClassificationPipeline(Pipeline):
 
     def __init__(self,
-                 model: Union[BertForSequenceClassification, str],
-                 preprocessor: SequenceClassificationPreprocessor = None,
+                 model: Union[Model, str],
+                 preprocessor: Preprocessor = None,
                  **kwargs):
-        """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction
+        """This is the base class for all the sequence classification sub-tasks.
 
         Args:
-            model (BertForSequenceClassification): a model instance
-            preprocessor (SequenceClassificationPreprocessor): a preprocessor instance
+            model (str or Model): A model instance or a model local dir or a model id in the model hub.
+            preprocessor (Preprocessor): optional; when None a SequenceClassificationPreprocessor is created.
         """
-        assert isinstance(model, str) or isinstance(model, BertForSequenceClassification), \
-            'model must be a single str or BertForSequenceClassification'
-        sc_model = model if isinstance(
-            model,
-            BertForSequenceClassification) else Model.from_pretrained(model)
+        assert isinstance(model, str) or isinstance(model, Model), \
+            'model must be a single str or Model'
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+        first_sequence = kwargs.pop('first_sequence', 'first_sequence')
+        second_sequence = kwargs.pop('second_sequence', None)
+
         if preprocessor is None:
             preprocessor = SequenceClassificationPreprocessor(
-                sc_model.model_dir,
-                first_sequence='sentence',
-                second_sequence=None,
+                model.model_dir if isinstance(model, Model) else model,
+                first_sequence=first_sequence,
+                second_sequence=second_sequence,
                 sequence_length=kwargs.pop('sequence_length', 512))
-        super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
 
-        assert hasattr(self.model, 'id2label'), \
-            'id2label map should be initalizaed in init function.'
+        assert preprocessor is not None
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        self.id2label = kwargs.get('id2label')
+        if self.id2label is None and hasattr(self.preprocessor, 'id2label'):
+            self.id2label = self.preprocessor.id2label
+        assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \
+                                          'as a parameter or make sure the preprocessor has the attribute.'
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return self.model(**inputs, **forward_params)
 
     def postprocess(self,
                     inputs: Dict[str, Any],
@@ -50,20 +66,18 @@ class SequenceClassificationPipeline(Pipeline):
         """process the prediction results
 
         Args:
-            inputs (Dict[str, Any]): input data dict
-            topk (int): return topk classification result.
-
+            inputs (Dict[str, Any]): the model output; probabilities are read from OutputKeys.PROBABILITIES
+            topk (int): The topk probs to take
         Returns:
             Dict[str, str]: the prediction results
         """
-        # NxC np.ndarray
-        probs = inputs['probs'][0]
+
+        probs = inputs[OutputKeys.PROBABILITIES][0]
         num_classes = probs.shape[0]
         topk = min(topk, num_classes)
         top_indices = np.argpartition(probs, -topk)[-topk:]
         cls_ids = top_indices[np.argsort(probs[top_indices])]
         probs = probs[cls_ids].tolist()
 
-        cls_names = [self.model.id2label[cid] for cid in cls_ids]
-
+        cls_names = [self.id2label[cid] for cid in cls_ids]
         return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names}
diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py
deleted file mode 100644
index 25d68993..00000000
--- a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py
+++ /dev/null
@@ -1,60 +0,0 @@
-from typing import Any, Dict, Union
-
-import numpy as np
-import torch
-
-from modelscope.models.base import Model
-from modelscope.outputs import OutputKeys
-from ...preprocessors import Preprocessor
-from ..base import Pipeline
-
-
-class SequenceClassificationPipelineBase(Pipeline):
-
-    def __init__(self, model: Union[Model, str], preprocessor: Preprocessor,
-                 **kwargs):
-        """This is the base class for all the sequence classification sub-tasks.
-
-        Args:
-            model (str or Model): A model instance or a model local dir or a model id in the model hub.
-            preprocessor (Preprocessor): a preprocessor instance, must not be None.
-        """
-        assert isinstance(model, str) or isinstance(model, Model), \
-            'model must be a single str or Model'
-        model = model if isinstance(model,
-                                    Model) else Model.from_pretrained(model)
-        assert preprocessor is not None
-        model.eval()
-        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
-        self.id2label = kwargs.get('id2label')
-        if self.id2label is None and hasattr(self.preprocessor, 'id2label'):
-            self.id2label = self.preprocessor.id2label
-        assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \
-                                          'as a parameter or make sure the preprocessor has the attribute.'
-
-    def forward(self, inputs: Dict[str, Any],
-                **forward_params) -> Dict[str, Any]:
-        with torch.no_grad():
-            return self.model(inputs, **forward_params)
-
-    def postprocess(self,
-                    inputs: Dict[str, Any],
-                    topk: int = 5) -> Dict[str, str]:
-        """process the prediction results
-
-        Args:
-            inputs (Dict[str, Any]): _description_
-            topk (int): The topk probs to take
-        Returns:
-            Dict[str, str]: the prediction results
-        """
-
-        probs = inputs[OutputKeys.PROBABILITIES][0]
-        num_classes = probs.shape[0]
-        topk = min(topk, num_classes)
-        top_indices = np.argpartition(probs, -topk)[-topk:]
-        cls_ids = top_indices[np.argsort(probs[top_indices])]
-        probs = probs[cls_ids].tolist()
-
-        cls_names = [self.id2label[cid] for cid in cls_ids]
-        return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names}
diff --git a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py
deleted file mode 100644
index 844c6839..00000000
--- a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py
+++ /dev/null
@@ -1,54 +0,0 @@
-from typing import Union
-
-from ...metainfo import Pipelines
-from ...models import Model
-from ...preprocessors import (Preprocessor,
-                              SingleSentenceClassificationPreprocessor)
-from ...utils.constant import Tasks
-from ..builder import PIPELINES
-from .sequence_classification_pipeline_base import \
-    SequenceClassificationPipelineBase
-
-__all__ = ['SingleSentenceClassificationPipeline']
-
-
-@PIPELINES.register_module(
-    Tasks.sentiment_classification,
-    module_name=Pipelines.sentiment_classification)
-class SingleSentenceClassificationPipeline(SequenceClassificationPipelineBase):
-
-    def __init__(self,
-                 model: Union[Model, str],
-                 preprocessor: Preprocessor = None,
-                 first_sequence='first_sequence',
-                 **kwargs):
-        """Use `model` and `preprocessor` to create a nlp single sequence classification pipeline for prediction.
-
-        Args:
-            model (str or Model): Supply either a local model dir which supported the sequence classification task,
-            or a model id from the model hub, or a torch model instance.
-            preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for
-            the model if supplied.
-            first_sequence: The key to read the first sentence in.
-            sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value.
-
-            NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence'
-            param will have no effect.
-
-            Example:
-            >>> from modelscope.pipelines import pipeline
-            >>> pipeline_ins = pipeline(task='sentiment-classification',
-            >>>    model='damo/nlp_structbert_sentiment-classification_chinese-base')
-            >>> sentence1 = '启动的时候很大声音，然后就会听到1.2秒的卡察的声音，类似齿轮摩擦的声音'
-            >>> print(pipeline_ins(sentence1))
-            >>> # Or use the dict input:
-            >>> print(pipeline_ins({'first_sequence': sentence1}))
-
-            To view other examples plese check the tests/pipelines/test_sentiment-classification.py.
-        """
-        if preprocessor is None:
-            preprocessor = SingleSentenceClassificationPreprocessor(
-                model.model_dir if isinstance(model, Model) else model,
-                first_sequence=first_sequence,
-                sequence_length=kwargs.pop('sequence_length', 512))
-        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py
new file mode 100644
index 00000000..96bfbc34
--- /dev/null
+++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py
@@ -0,0 +1,283 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import Any, Dict, Union
+
+from transformers import BertTokenizer
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.models.nlp import TableQuestionAnswering
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import TableQuestionAnsweringPreprocessor
+from modelscope.preprocessors.star3.fields.database import Database
+from modelscope.preprocessors.star3.fields.struct import Constant, SQLQuery
+from modelscope.utils.constant import ModelFile, Tasks
+
+__all__ = ['TableQuestionAnsweringPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.table_question_answering,
+    module_name=Pipelines.table_question_answering_pipeline)
+class TableQuestionAnsweringPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[TableQuestionAnswering, str],
+                 preprocessor: TableQuestionAnsweringPreprocessor = None,
+                 db: Database = None,
+                 **kwargs):
+        """Use `model` and `preprocessor` to create a table question answering prediction pipeline.
+
+        Args:
+            model (TableQuestionAnswering or str): a model instance, or a value passed to `Model.from_pretrained`.
+            preprocessor (TableQuestionAnsweringPreprocessor): optional; built from `model.model_dir` when None.
+            db (Database): optional; when None it is loaded from `table.json` and `synonym.txt` in `model.model_dir`.
+        """
+        model = model if isinstance(
+            model, TableQuestionAnswering) else Model.from_pretrained(model)
+        if preprocessor is None:
+            preprocessor = TableQuestionAnsweringPreprocessor(model.model_dir)
+
+        # initialize the tokenizer from the vocab file in the model dir
+        self.tokenizer = BertTokenizer(
+            os.path.join(model.model_dir, ModelFile.VOCAB_FILE))
+
+        # use the caller's database, or build one from the files in the model dir
+        if db is None:
+            self.db = Database(
+                tokenizer=self.tokenizer,
+                table_file_path=os.path.join(model.model_dir, 'table.json'),
+                syn_dict_file_path=os.path.join(model.model_dir,
+                                                'synonym.txt'))
+        else:
+            self.db = db
+
+        constant = Constant()  # its fields are copied onto the pipeline below
+        self.agg_ops = constant.agg_ops
+        self.cond_ops = constant.cond_ops
+        self.cond_conn_ops = constant.cond_conn_ops
+        self.action_ops = constant.action_ops
+        self.max_select_num = constant.max_select_num
+        self.max_where_num = constant.max_where_num
+        self.col_type_dict = constant.col_type_dict
+        self.schema_link_dict = constant.schema_link_dict
+
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+    def post_process_multi_turn(self, history_sql, result, table):  # merge this turn's SQL into the history SQL
+        action = self.action_ops[result['action']]  # action name predicted for this turn
+        headers = table['header_name']
+        current_sql = result['sql']
+        # NOTE: every branch that returns `pre_final_sql` updates `history_sql` in place (same object).
+        if history_sql is None:  # no earlier turn to merge with
+            return current_sql
+
+        if action == 'out_of_scripts':  # keep the history SQL untouched
+            return history_sql
+
+        elif action == 'switch_table':
+            return current_sql
+
+        elif action == 'restart':
+            return current_sql
+
+        elif action == 'firstTurn':
+            return current_sql
+
+        elif action == 'del_focus':  # drop the history columns that the current SQL selects
+            pre_final_sql = history_sql
+            pre_sels = []
+            pre_aggs = []
+            for idx, seli in enumerate(pre_final_sql['sel']):
+                if seli not in current_sql['sel']:
+                    pre_sels.append(seli)
+                    pre_aggs.append(pre_final_sql['agg'][idx])
+
+            if len(pre_sels) < 1:  # nothing left: select index len(headers) with aggregation 0
+                pre_sels.append(len(headers))
+                pre_aggs.append(0)
+            pre_final_sql['sel'] = pre_sels
+            pre_final_sql['agg'] = pre_aggs
+
+            final_conds = []
+            for condi in pre_final_sql['conds']:
+                if condi[0] < len(headers):  # keep conditions whose column index is below len(headers)
+                    final_conds.append(condi)
+            if len(final_conds) < 1:  # never leave the list empty: add one placeholder condition
+                final_conds.append([len(headers), 2, 'Null'])
+            pre_final_sql['conds'] = final_conds
+
+            return pre_final_sql
+
+        elif action == 'change_agg_only':  # keep the history columns, update aggregations of shared ones
+            pre_final_sql = history_sql
+            pre_sels = []
+            pre_aggs = []
+            for idx, seli in enumerate(pre_final_sql['sel']):
+                if seli in current_sql['sel']:
+                    pre_sels.append(seli)
+                    changed_aggi = -1  # stays -1 if the scan below finds no matching position
+                    for idx_single, aggi in enumerate(current_sql['agg']):
+                        if current_sql['sel'][idx_single] == seli:
+                            changed_aggi = aggi  # the last match wins
+                    pre_aggs.append(changed_aggi)
+                else:
+                    pre_sels.append(seli)
+                    pre_aggs.append(pre_final_sql['agg'][idx])
+            pre_final_sql['sel'] = pre_sels
+            pre_final_sql['agg'] = pre_aggs
+
+            return pre_final_sql
+
+        elif action == 'change_focus_total':  # take sel/agg from the current SQL
+            pre_final_sql = history_sql
+            pre_sels = current_sql['sel']
+            pre_aggs = current_sql['agg']
+
+            pre_final_sql['sel'] = pre_sels
+            pre_final_sql['agg'] = pre_aggs
+            for pre_condi in current_sql['conds']:
+                if pre_condi[0] < len(headers):
+                    in_flag = False
+                    for history_condi in history_sql['conds']:
+                        if pre_condi[0] == history_condi[0]:
+                            in_flag = True
+                    if not in_flag:  # no history condition on this column yet
+                        pre_final_sql['conds'].append(pre_condi)
+
+            return pre_final_sql
+
+        elif action == 'del_cond':  # drop history conditions on columns the current SQL selects
+            pre_final_sql = history_sql
+
+            final_conds = []
+
+            for idx, condi in enumerate(pre_final_sql['conds']):
+                if condi[0] not in current_sql['sel']:
+                    final_conds.append(condi)
+            pre_final_sql['conds'] = final_conds
+
+            final_conds = []
+            for condi in pre_final_sql['conds']:
+                if condi[0] < len(headers):
+                    final_conds.append(condi)
+            if len(final_conds) < 1:
+                final_conds.append([len(headers), 2, 'Null'])
+            pre_final_sql['conds'] = final_conds
+
+            return pre_final_sql
+
+        elif action == 'change_cond':  # replace history conditions by current ones on the same column
+            pre_final_sql = history_sql
+            final_conds = []
+
+            for idx, condi in enumerate(pre_final_sql['conds']):
+                in_single_flag = False
+                for single_condi in current_sql['conds']:
+                    if condi[0] == single_condi[0]:
+                        in_single_flag = True
+                        final_conds.append(single_condi)
+                if not in_single_flag:  # no replacement: keep the history condition
+                    final_conds.append(condi)
+            pre_final_sql['conds'] = final_conds
+
+            final_conds = []
+            for condi in pre_final_sql['conds']:
+                if condi[0] < len(headers):
+                    final_conds.append(condi)
+            if len(final_conds) < 1:
+                final_conds.append([len(headers), 2, 'Null', 'Null'])  # NOTE(review): 4 items; other branches use 3
+            pre_final_sql['conds'] = final_conds
+
+            return pre_final_sql
+
+        elif action == 'add_cond':  # append the current conditions to the history conditions
+            pre_final_sql = history_sql
+            final_conds = pre_final_sql['conds']  # alias: the appends below modify history_sql['conds']
+            for idx, condi in enumerate(current_sql['conds']):
+                if condi[0] < len(headers):
+                    final_conds.append(condi)
+            pre_final_sql['conds'] = final_conds
+
+            final_conds = []
+            for condi in pre_final_sql['conds']:
+                if condi[0] < len(headers):
+                    final_conds.append(condi)
+            if len(final_conds) < 1:
+                final_conds.append([len(headers), 2, 'Null'])
+            pre_final_sql['conds'] = final_conds
+
+            return pre_final_sql
+
+        else:  # any other action name: use the current SQL as is
+            return current_sql
+
+    def sql_dict_to_str(self, result, table):
+        """Convert the SQL dict in `result['sql']` into an `SQLQuery` for `table`.
+        `string` is built from header names and the table name, `query` from backquoted table and header ids.
+        """
+        header_names = table['header_name'] + ['空列']  # '空列' means "empty column"; extra entry at index len(header_name)
+        header_ids = table['header_id'] + ['null']
+        sql = result['sql']
+
+        str_sel_list, sql_sel_list = [], []
+        for idx, sel in enumerate(sql['sel']):
+            header_name = header_names[sel]
+            header_id = '`%s`.`%s`' % (table['table_id'], header_ids[sel])
+            if sql['agg'][idx] == 0:  # aggregation 0: the column is emitted without an aggregation wrapper
+                str_sel_list.append(header_name)
+                sql_sel_list.append(header_id)
+            else:
+                str_sel_list.append(self.agg_ops[sql['agg'][idx]] + '( '
+                                    + header_name + ' )')
+                sql_sel_list.append(self.agg_ops[sql['agg'][idx]] + '( '
+                                    + header_id + ' )')
+
+        str_cond_list, sql_cond_list = [], []
+        for cond in sql['conds']:  # each cond is read as (column index, operator index, value, ...)
+            header_name = header_names[cond[0]]
+            header_id = '`%s`.`%s`' % (table['table_id'], header_ids[cond[0]])
+            op = self.cond_ops[cond[1]]
+            value = cond[2]  # must be a str: it is concatenated into the strings below
+            str_cond_list.append('( ' + header_name + ' ' + op + ' "' + value
+                                 + '" )')
+            sql_cond_list.append('( ' + header_id + ' ' + op + ' "' + value
+                                 + '" )')
+
+        cond = ' ' + self.cond_conn_ops[sql['cond_conn_op']] + ' '  # connector placed between the conditions
+
+        final_str = 'SELECT %s FROM %s WHERE %s' % (', '.join(str_sel_list),
+                                                    table['table_name'],
+                                                    cond.join(str_cond_list))
+        final_sql = 'SELECT %s FROM `%s` WHERE %s' % (', '.join(sql_sel_list),
+                                                      table['table_id'],
+                                                      cond.join(sql_cond_list))
+        sql = SQLQuery(
+            string=final_str, query=final_sql, sql_result=result['sql'])
+
+        return sql
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """Merge the predicted SQL with the dialogue history and render it as an SQLQuery.
+
+        Args:
+            inputs (Dict[str, Any]): must contain `result` (with `table_id`, `action` and `sql`) and `history_sql`.
+
+        Returns:
+            Dict[str, Any]: the SQLQuery under `OutputKeys.OUTPUT` and the merged SQL dict under `OutputKeys.HISTORY`.
+        """
+        result = inputs['result']
+        history_sql = inputs['history_sql']
+        result['sql'] = self.post_process_multi_turn(
+            history_sql=history_sql,
+            result=result,
+            table=self.db.tables[result['table_id']])
+        sql = self.sql_dict_to_str(
+            result=result, table=self.db.tables[result['table_id']])
+        output = {OutputKeys.OUTPUT: sql, OutputKeys.HISTORY: result['sql']}
+        return output
+
+    def _collate_fn(self, data):  # identity: the input is returned unchanged
+        return data
diff --git a/modelscope/pipelines/nlp/text2text_generation_pipeline.py b/modelscope/pipelines/nlp/text2text_generation_pipeline.py
new file mode 100644
index 00000000..9ccd00f4
--- /dev/null
+++ b/modelscope/pipelines/nlp/text2text_generation_pipeline.py
@@ -0,0 +1,87 @@
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.base import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import Text2TextGenerationPreprocessor
+from modelscope.utils.constant import Tasks
+
+__all__ = ['Text2TextGenerationPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.text2text_generation, module_name=Pipelines.text2text_generation)
+class Text2TextGenerationPipeline(Pipeline):
+
+    def __init__(
+            self,
+            model: Union[Model, str],
+            preprocessor: Optional[Text2TextGenerationPreprocessor] = None,
+            first_sequence='sentence',
+            **kwargs):
+        """Use `model` and `preprocessor` to create a text to text generation pipeline for prediction.
+
+        Args:
+            model (str or Model): Supply either a local model dir which supports the text generation task,
+            or a model id from the model hub, or a torch model instance.
+            preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for
+            the model if supplied.
+            first_sequence: The key to read the first sentence in. NOTE(review): not read by this constructor.
+            sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value.
+
+            NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence'
+            param will have no effect.
+
+            Example:
+            >>> from modelscope.pipelines import pipeline
+            >>> pipeline_ins = pipeline(task='text-generation',
+            >>>    model='damo/nlp_palm2.0_text-generation_chinese-base')
+            >>> sentence1 = '本文总结了十个可穿戴产品的设计原则，而这些原则，同样也是笔者认为是这个行业最吸引人的地方：'
+            >>>     '1.为人们解决重复性问题；2.从人开始，而不是从机器开始；3.要引起注意，但不要刻意；4.提升用户能力，而不是取代'
+            >>> print(pipeline_ins(sentence1))
+            >>> # Or use the dict input:
+            >>> print(pipeline_ins({'sentence': sentence1}))
+
+            To view other examples please check the tests/pipelines/test_text_generation.py.
+        """
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+        if preprocessor is None:
+            preprocessor = Text2TextGenerationPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 128))
+        self.tokenizer = preprocessor.tokenizer  # postprocess() decodes with it
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        """Run `self.model.generate` on `inputs` with gradients disabled.
+
+        `min_length` / `max_length` fall back to `self.model.config` when not given.
+        """
+        forward_params.setdefault('min_length', self.model.config.min_length)
+        forward_params.setdefault('max_length', self.model.config.max_length)
+        with torch.no_grad():
+            generated = self.model.generate(**inputs, **forward_params)
+        return {'output_ids': generated}
+
+    def postprocess(self, inputs: Dict[str, Tensor],
+                    **postprocess_params) -> Dict[str, str]:
+        """Decode the generated token ids into text.
+
+        Args:
+            inputs (Dict[str, Tensor]): must contain `output_ids`; only its first entry is decoded.
+
+        Returns:
+            Dict[str, str]: the decoded text, special tokens skipped, under `OutputKeys.TEXT`.
+        """
+        output = self.tokenizer.decode(
+            inputs['output_ids'][0],
+            skip_special_tokens=True,
+        )
+        return {OutputKeys.TEXT: output}
diff --git a/modelscope/pipelines/nlp/text_error_correction_pipeline.py b/modelscope/pipelines/nlp/text_error_correction_pipeline.py
index b63d8d36..8e9bf85d 100644
--- a/modelscope/pipelines/nlp/text_error_correction_pipeline.py
+++ b/modelscope/pipelines/nlp/text_error_correction_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Optional, Union
 
 import torch
diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py
index 3d27ffa9..ea35763f 100644
--- a/modelscope/pipelines/nlp/text_generation_pipeline.py
+++ b/modelscope/pipelines/nlp/text_generation_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Optional, Union
 
 import torch
diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py
new file mode 100644
index 00000000..5367c1a8
--- /dev/null
+++ b/modelscope/pipelines/nlp/token_classification_pipeline.py
@@ -0,0 +1,94 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import (Preprocessor,
+                                      TokenClassificationPreprocessor)
+from modelscope.utils.constant import Tasks
+
+__all__ = ['TokenClassificationPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.token_classification, module_name=Pipelines.part_of_speech)
+class TokenClassificationPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 **kwargs):
+        """Use `model` and `preprocessor` to create a token classification pipeline for prediction.
+
+        Args:
+            model (str or Model): A model instance or a model local dir or a model id in the model hub.
+            preprocessor (Preprocessor): Optional; built from `model.model_dir` when None (`sequence_length`: 128).
+        """
+        assert isinstance(model, (str, Model)), \
+            'model must be a single str or Model'
+        if isinstance(model, str):
+            model = Model.from_pretrained(model)
+        if preprocessor is None:
+            preprocessor = TokenClassificationPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 128))
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        self.id2label = getattr(model, 'id2label', None)  # None default, so the assertion below can report it
+        assert self.id2label is not None, 'Cannot convert id to the original label, please make sure the ' \
+                                          'model has the attribute `id2label`.'
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        # the text entry is taken out of the model inputs and re-attached to the outputs
+        raw_text = inputs.pop(OutputKeys.TEXT)
+        with torch.no_grad():
+            outputs = self.model(**inputs, **forward_params)
+            return {**outputs, OutputKeys.TEXT: raw_text}
+
+    def postprocess(self, inputs: Dict[str, Any],
+                    **postprocess_params) -> Dict[str, Any]:
+        """Turn the predicted label ids into chunks of text with one tag per chunk.
+
+        Args:
+            inputs (Dict[str, Any]): must contain `predictions` (label ids) and `text` (one item per kept label).
+
+        Returns:
+            Dict[str, Any]: space-joined chunks under OutputKeys.OUTPUT, word/label dicts under OutputKeys.LABELS.
+        """
+
+        pred_list = inputs['predictions']
+        labels = []
+        for pre in pred_list:
+            labels.append(self.id2label[pre])
+        labels = labels[1:-1]  # drop the first and last label; presumably [CLS]/[SEP] positions, TODO confirm
+        chunks = []
+        tags = []
+        chunk = ''
+        assert len(inputs['text']) == len(labels)
+        for token, label in zip(inputs['text'], labels):
+            if label[0] == 'B' or label[0] == 'I':  # the chunk continues
+                chunk += token
+            else:  # any other prefix closes the chunk; its tag is the part after the last '-'
+                chunk += token
+                chunks.append(chunk)
+                chunk = ''
+                tags.append(label.split('-')[-1])
+        if chunk:  # unterminated trailing chunk: tagged with the last label seen
+            chunks.append(chunk)
+            tags.append(label.split('-')[-1])
+        pos_result = []
+        seg_result = ' '.join(chunks)
+        for chunk, tag in zip(chunks, tags):
+            pos_result.append({OutputKeys.WORD: chunk, OutputKeys.LABEL: tag})
+        outputs = {
+            OutputKeys.OUTPUT: seg_result,
+            OutputKeys.LABELS: pos_result
+        }
+        return outputs
diff --git a/modelscope/pipelines/nlp/translation_pipeline.py b/modelscope/pipelines/nlp/translation_pipeline.py
index b9b74ce4..eb7f7f74 100644
--- a/modelscope/pipelines/nlp/translation_pipeline.py
+++ b/modelscope/pipelines/nlp/translation_pipeline.py
@@ -1,8 +1,13 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os.path as osp
 from typing import Any, Dict
 
+import jieba
 import numpy as np
 import tensorflow as tf
+from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer
+from subword_nmt import apply_bpe
 
 from modelscope.metainfo import Pipelines
 from modelscope.models.base import Model
@@ -59,6 +64,21 @@ class TranslationPipeline(Pipeline):
             dtype=tf.int64, shape=[None, None], name='input_wids')
         self.output = {}
 
+        # preprocess
+        self._src_lang = self.cfg['preprocessor']['src_lang']
+        self._tgt_lang = self.cfg['preprocessor']['tgt_lang']
+        self._src_bpe_path = osp.join(
+            model, self.cfg['preprocessor']['src_bpe']['file'])
+
+        if self._src_lang == 'zh':
+            self._tok = jieba
+        else:
+            self._punct_normalizer = MosesPunctNormalizer(lang=self._src_lang)
+            self._tok = MosesTokenizer(lang=self._src_lang)
+        self._detok = MosesDetokenizer(lang=self._tgt_lang)
+
+        self._bpe = apply_bpe.BPE(open(self._src_bpe_path))
+
         # model
         output = self.model(self.input_wids)
         self.output.update(output)
@@ -70,10 +90,19 @@ class TranslationPipeline(Pipeline):
             model_loader.restore(sess, model_path)
 
     def preprocess(self, input: str) -> Dict[str, Any]:
+        if self._src_lang == 'zh':
+            input_tok = self._tok.cut(input)
+            input_tok = ' '.join(list(input_tok))
+        else:
+            input = self._punct_normalizer.normalize(input)
+            input_tok = self._tok.tokenize(
+                input, return_str=True, aggressive_dash_splits=True)
+
+        input_bpe = self._bpe.process_line(input_tok)
         input_ids = np.array([[
             self._src_vocab[w]
             if w in self._src_vocab else self.cfg['model']['src_vocab_size']
-            for w in input.strip().split()
+            for w in input_bpe.strip().split()
         ]])
         result = {'input_ids': input_ids}
         return result
@@ -92,5 +121,6 @@ class TranslationPipeline(Pipeline):
             self._trg_rvocab[wid] if wid in self._trg_rvocab else '<unk>'
             for wid in wids
         ]).replace('@@ ', '').replace('@@', '')
+        translation_out = self._detok.detokenize(translation_out.split())
         result = {OutputKeys.TRANSLATION: translation_out}
         return result
diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
index 66a5c524..9d4bb67f 100644
--- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py
+++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Optional, Union
 
 import torch
@@ -62,7 +64,7 @@ class WordSegmentationPipeline(Pipeline):
         text = inputs.pop(OutputKeys.TEXT)
         with torch.no_grad():
             return {
-                **self.model(inputs, **forward_params), OutputKeys.TEXT: text
+                **self.model(**inputs, **forward_params), OutputKeys.TEXT: text
             }
 
     def postprocess(self, inputs: Dict[str, Any],
@@ -94,4 +96,4 @@ class WordSegmentationPipeline(Pipeline):
         if chunk:
             chunks.append(chunk)
         seg_result = ' '.join(chunks)
-        return {OutputKeys.OUTPUT: seg_result}
+        return {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []}
diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
index e39cb0e1..fc7051c7 100644
--- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from typing import Any, Dict, Union
 
 import torch
@@ -85,7 +87,7 @@ class ZeroShotClassificationPipeline(Pipeline):
     def forward(self, inputs: Dict[str, Any],
                 **forward_params) -> Dict[str, Any]:
         with torch.no_grad():
-            return self.model(inputs, **forward_params)
+            return self.model(**inputs, **forward_params)
 
     def postprocess(self,
                     inputs: Dict[str, Any],
diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index 0328b91a..90303b65 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -15,18 +15,30 @@ if TYPE_CHECKING:
                         ImageDenoisePreprocessor)
     from .kws import WavToLists
     from .multi_modal import (OfaPreprocessor, MPlugPreprocessor)
-    from .nlp import (Tokenize, SequenceClassificationPreprocessor,
-                      TextGenerationPreprocessor,
-                      TokenClassificationPreprocessor,
-                      SingleSentenceClassificationPreprocessor,
-                      PairSentenceClassificationPreprocessor,
-                      FillMaskPreprocessor, ZeroShotClassificationPreprocessor,
-                      NERPreprocessor, TextErrorCorrectionPreprocessor)
+    from .nlp import (
+        DocumentSegmentationPreprocessor,
+        FaqQuestionAnsweringPreprocessor,
+        FillMaskPoNetPreprocessor,
+        NLPPreprocessor,
+        NLPTokenizerPreprocessorBase,
+        PassageRankingPreprocessor,
+        RelationExtractionPreprocessor,
+        SentenceEmbeddingPreprocessor,
+        SequenceClassificationPreprocessor,
+        TokenClassificationPreprocessor,
+        TextErrorCorrectionPreprocessor,
+        TextGenerationPreprocessor,
+        Text2TextGenerationPreprocessor,
+        Tokenize,
+        WordSegmentationBlankSetToLabelPreprocessor,
+        ZeroShotClassificationPreprocessor,
+    )
     from .space import (DialogIntentPredictionPreprocessor,
                         DialogModelingPreprocessor,
                         DialogStateTrackingPreprocessor)
-    from .video import ReadVideoData
+    from .video import ReadVideoData, MovieSceneSegmentationPreprocessor
     from .star import ConversationalTextToSqlPreprocessor
+    from .star3 import TableQuestionAnsweringPreprocessor
 
 else:
     _import_structure = {
@@ -35,7 +47,7 @@ else:
         'common': ['Compose', 'ToTensor', 'Filter'],
         'audio': ['LinearAECAndFbank'],
         'asr': ['WavToScp'],
-        'video': ['ReadVideoData'],
+        'video': ['ReadVideoData', 'MovieSceneSegmentationPreprocessor'],
         'image': [
             'LoadImage', 'load_image', 'ImageColorEnhanceFinetunePreprocessor',
             'ImageInstanceSegmentationPreprocessor', 'ImageDenoisePreprocessor'
@@ -43,18 +55,29 @@ else:
         'kws': ['WavToLists'],
         'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'],
         'nlp': [
-            'Tokenize', 'SequenceClassificationPreprocessor',
-            'TextGenerationPreprocessor', 'TokenClassificationPreprocessor',
-            'SingleSentenceClassificationPreprocessor',
-            'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
-            'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
-            'TextErrorCorrectionPreprocessor'
+            'DocumentSegmentationPreprocessor',
+            'FaqQuestionAnsweringPreprocessor',
+            'FillMaskPoNetPreprocessor',
+            'NLPPreprocessor',
+            'NLPTokenizerPreprocessorBase',
+            'PassageRankingPreprocessor',
+            'RelationExtractionPreprocessor',
+            'SentenceEmbeddingPreprocessor',
+            'SequenceClassificationPreprocessor',
+            'TokenClassificationPreprocessor',
+            'TextErrorCorrectionPreprocessor',
+            'TextGenerationPreprocessor',
+            'Tokenize',
+            'Text2TextGenerationPreprocessor',
+            'WordSegmentationBlankSetToLabelPreprocessor',
+            'ZeroShotClassificationPreprocessor',
         ],
         'space': [
             'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor',
             'DialogStateTrackingPreprocessor', 'InputFeatures'
         ],
         'star': ['ConversationalTextToSqlPreprocessor'],
+        'star3': ['TableQuestionAnsweringPreprocessor'],
     }
 
     import sys
diff --git a/modelscope/preprocessors/audio.py b/modelscope/preprocessors/audio.py
index 10057034..1e659218 100644
--- a/modelscope/preprocessors/audio.py
+++ b/modelscope/preprocessors/audio.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import io
 import os
 from typing import Any, Dict, Tuple, Union
@@ -6,9 +8,10 @@ import numpy as np
 import scipy.io.wavfile as wav
 import torch
 
+from modelscope.fileio import File
+from modelscope.preprocessors import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
 from modelscope.utils.constant import Fields
-from . import Preprocessor
-from .builder import PREPROCESSORS
 
 
 def load_kaldi_feature_transform(filename):
@@ -201,7 +204,8 @@ class LinearAECAndFbank(Preprocessor):
         if isinstance(inputs, bytes):
             inputs = io.BytesIO(inputs)
         elif isinstance(inputs, str):
-            pass
+            file_bytes = File.read(inputs)
+            inputs = io.BytesIO(file_bytes)
         else:
             raise TypeError(f'Unsupported input type: {type(inputs)}.')
         sample_rate, data = wav.read(inputs)
diff --git a/modelscope/preprocessors/movie_scene_segmentation/__init__.py b/modelscope/preprocessors/movie_scene_segmentation/__init__.py
new file mode 100644
index 00000000..b28ccabc
--- /dev/null
+++ b/modelscope/preprocessors/movie_scene_segmentation/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:  # eager import, only evaluated by static type checkers
+    from .transforms import get_transform
+else:
+    _import_structure = {  # submodule name -> names exported from it
+        'transforms': ['get_transform'],
+    }
+
+    import sys
+    sys.modules[__name__] = LazyImportModule(  # replace this module object; presumably resolves names on access
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/preprocessors/movie_scene_segmentation/transforms.py b/modelscope/preprocessors/movie_scene_segmentation/transforms.py
new file mode 100644
index 00000000..5b84003c
--- /dev/null
+++ b/modelscope/preprocessors/movie_scene_segmentation/transforms.py
@@ -0,0 +1,308 @@
+# The implementation here is modified based on BaSSL,
+# originally Apache 2.0 License and publicly available at https://github.com/kakaobrain/bassl
+import numbers
+import os.path as osp
+import random
+from typing import List
+
+import numpy as np
+import torch
+import torchvision.transforms as TF
+import torchvision.transforms.functional as F
+from PIL import Image, ImageFilter
+
+
+def get_transform(lst):
+    """Build one composed transform from a non-empty list of config dicts.
+
+    Every entry goes through ``build_transform``; order is preserved.
+    """
+    assert len(lst) > 0
+    return TF.Compose([build_transform(item) for item in lst])
+
+
+def build_transform(cfg):
+    """Instantiate one video transform from a config dict.
+
+    ``cfg['type']`` selects the class and the remaining items become its
+    keyword arguments, except ``VideoRandomHFlip`` which is always built with
+    defaults. ``cfg`` itself is left unmodified. Unknown types raise
+    ``NotImplementedError``.
+    """
+    assert isinstance(cfg, dict)
+    kwargs = cfg.copy()
+    name = kwargs.pop('type')  # local name avoids shadowing builtin ``type``
+    if name == 'VideoRandomHFlip':
+        return VideoRandomHFlip()
+    known = (VideoResizedCenterCrop, VideoToTensor, VideoRandomResizedCrop,
+             VideoRandomColorJitter, VideoRandomGaussianBlur)
+    for klass in known:
+        if name == klass.__name__:
+            return klass(**kwargs)
+    raise NotImplementedError(f'Unsupported transform type: {name!r}')
+
+
+class VideoResizedCenterCrop(torch.nn.Module):
+    """Bicubic-resize, then center-crop, each frame in a list of images."""
+
+    def __init__(self, image_size, crop_size):
+        super().__init__()  # initialise nn.Module internals
+        resize = TF.Resize(size=image_size, interpolation=Image.BICUBIC)
+        self.tfm = TF.Compose([resize, TF.CenterCrop(crop_size)])
+
+    def __call__(self, imgmap):
+        assert isinstance(imgmap, list)
+        return [self.tfm(img) for img in imgmap]
+
+
+class VideoToTensor(torch.nn.Module):
+    """Turn each frame into a tensor, then normalize with ``mean``/``std``."""
+
+    def __init__(self, mean=None, std=None, inplace=False):
+        super().__init__()  # initialise nn.Module internals
+        assert mean is not None and std is not None
+        self.mean = mean
+        self.std = std
+        self.inplace = inplace
+
+    def __to_tensor__(self, img):
+        return F.to_tensor(img)
+
+    def __normalize__(self, img):
+        return F.normalize(img, self.mean, self.std, self.inplace)
+
+    def __call__(self, imgmap):
+        assert isinstance(imgmap, list)
+        return [self.__normalize__(self.__to_tensor__(img)) for img in imgmap]
+
+
+class VideoRandomResizedCrop(torch.nn.Module):
+    """Random-resized-crop a clip, using one crop window for all frames."""
+
+    def __init__(self, size, bottom_area=0.2):
+        super().__init__()  # initialise nn.Module internals
+        self.p = 1.0
+        # NOTE: stored but never passed to the crop/resize calls below.
+        self.interpolation = Image.BICUBIC
+        self.size = size
+        self.bottom_area = bottom_area
+
+    def __call__(self, imgmap):
+        assert isinstance(imgmap, list)
+        if random.random() >= self.p:  # skip the crop, plain resize only
+            return [
+                F.resize(img=img, size=[self.size, self.size])
+                for img in imgmap
+            ]
+        # One window, sampled from the first frame, is shared by every frame.
+        top, left, height, width = TF.RandomResizedCrop.get_params(
+            imgmap[0], scale=(self.bottom_area, 1.0), ratio=(3 / 4.0, 4 / 3.0))
+        return [
+            F.resized_crop(
+                img=img,
+                top=top,
+                left=left,
+                height=height,
+                width=width,
+                size=(self.size, self.size)) for img in imgmap
+        ]
+
+
+class VideoRandomHFlip(torch.nn.Module):
+    """Randomly mirror the frames of a clip left-to-right.
+
+    With ``consistent`` one draw decides for the whole clip; otherwise a new
+    draw is made every ``seq_len`` frames (every frame when ``seq_len`` is 0).
+    A non-zero ``seq_len`` forces ``consistent`` off. ``command`` pins the
+    flip probability: 'left' -> 0, 'right' -> 1, anything else -> 0.5.
+    """
+
+    def __init__(self, consistent=True, command=None, seq_len=0):
+        super().__init__()  # initialise nn.Module internals
+        self.consistent = False if seq_len != 0 else consistent
+        self.threshold = 0 if command == 'left' else (
+            1 if command == 'right' else 0.5)
+        self.seq_len = seq_len
+
+    def __call__(self, imgmap):
+        assert isinstance(imgmap, list)
+        if self.consistent:
+            if random.random() < self.threshold:
+                return [i.transpose(Image.FLIP_LEFT_RIGHT) for i in imgmap]
+            return imgmap
+        # Treat seq_len == 0 as per-frame draws (avoids a modulo by zero).
+        group = self.seq_len or 1
+        result = []
+        for idx, img in enumerate(imgmap):
+            if idx % group == 0:
+                th = random.random()
+            if th < self.threshold:
+                img = img.transpose(Image.FLIP_LEFT_RIGHT)
+            result.append(img)
+        return result
+
+
+class VideoRandomColorJitter(torch.nn.Module):
+    """Randomly change the brightness, contrast, saturation and hue of a clip.
+    Args:
+        brightness (float or tuple of float (min, max)): How much to jitter brightness.
+            brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness]
+            or the given [min, max]. Should be non negative numbers.
+        contrast (float or tuple of float (min, max)): How much to jitter contrast.
+            contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast]
+            or the given [min, max]. Should be non negative numbers.
+        saturation (float or tuple of float (min, max)): How much to jitter saturation.
+            saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation]
+            or the given [min, max]. Should be non negative numbers.
+        hue (float or tuple of float (min, max)): How much to jitter hue.
+            hue_factor is chosen uniformly from [-hue, hue] or the given [min, max].
+            Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5.
+        consistent (bool): If True, one sampled jitter is shared by all frames.
+        p (float): Probability that the clip is jittered at all.
+        seq_len (int): Used when ``consistent`` is False: resample every
+            ``seq_len`` frames, or for every frame when 0.
+    """
+
+    def __init__(
+        self,
+        brightness=0,
+        contrast=0,
+        saturation=0,
+        hue=0,
+        consistent=True,
+        p=1.0,
+        seq_len=0,
+    ):
+        super().__init__()  # initialise nn.Module internals
+        self.brightness = self._check_input(brightness, 'brightness')
+        self.contrast = self._check_input(contrast, 'contrast')
+        self.saturation = self._check_input(saturation, 'saturation')
+        self.hue = self._check_input(
+            hue, 'hue', center=0, bound=(-0.5, 0.5), clip_first_on_zero=False)
+        self.consistent = consistent
+        self.threshold = p
+        self.seq_len = seq_len
+
+    def _check_input(self,
+                     value,
+                     name,
+                     center=1,
+                     bound=(0, float('inf')),
+                     clip_first_on_zero=True):
+        """Turn a jitter argument into a [min, max] range; None means no-op."""
+        if isinstance(value, numbers.Number):
+            if value < 0:
+                raise ValueError(
+                    'If {} is a single number, it must be non negative.'.
+                    format(name))
+            value = [center - value, center + value]
+            if clip_first_on_zero:
+                value[0] = max(value[0], 0)
+        elif isinstance(value, (tuple, list)) and len(value) == 2:
+            if not bound[0] <= value[0] <= value[1] <= bound[1]:
+                raise ValueError('{} values should be between {}'.format(
+                    name, bound))
+        else:
+            raise TypeError(
+                '{} should be a single number or a list/tuple with length 2.'.
+                format(name))
+
+        # if value is 0 or (1., 1.) for brightness/contrast/saturation
+        # or (0., 0.) for hue, do nothing
+        if value[0] == value[1] == center:
+            value = None
+        return value
+
+    @staticmethod
+    def get_params(brightness, contrast, saturation, hue):
+        """Get a randomized transform to be applied on image.
+        Arguments are the ranges produced by ``_check_input`` (or None).
+        Returns:
+            Transform which randomly adjusts brightness, contrast,
+            saturation and hue in a random order.
+        """
+        transforms = []
+
+        if brightness is not None:
+            brightness_factor = random.uniform(brightness[0], brightness[1])
+            transforms.append(
+                TF.Lambda(
+                    lambda img: F.adjust_brightness(img, brightness_factor)))
+
+        if contrast is not None:
+            contrast_factor = random.uniform(contrast[0], contrast[1])
+            transforms.append(
+                TF.Lambda(lambda img: F.adjust_contrast(img, contrast_factor)))
+
+        if saturation is not None:
+            saturation_factor = random.uniform(saturation[0], saturation[1])
+            transforms.append(
+                TF.Lambda(
+                    lambda img: F.adjust_saturation(img, saturation_factor)))
+
+        if hue is not None:
+            hue_factor = random.uniform(hue[0], hue[1])
+            transforms.append(
+                TF.Lambda(lambda img: F.adjust_hue(img, hue_factor)))
+
+        random.shuffle(transforms)
+        transform = TF.Compose(transforms)
+
+        return transform
+
+    def __call__(self, imgmap):
+        assert isinstance(imgmap, list)
+        if random.random() >= self.threshold:  # skip ColorJitter
+            return imgmap
+        if self.consistent:
+            # One set of factors for the whole clip.
+            transform = self.get_params(self.brightness, self.contrast,
+                                        self.saturation, self.hue)
+            return [transform(i) for i in imgmap]
+        if self.seq_len == 0:
+            # Fresh factors for every frame.
+            return [
+                self.get_params(self.brightness, self.contrast,
+                                self.saturation, self.hue)(img)
+                for img in imgmap
+            ]
+        # Fresh factors at the start of each seq_len-frame group.
+        result = []
+        for idx, img in enumerate(imgmap):
+            if idx % self.seq_len == 0:
+                transform = self.get_params(self.brightness, self.contrast,
+                                            self.saturation, self.hue)
+            result.append(transform(img))
+        return result
+
+    def __repr__(self):
+        format_string = self.__class__.__name__ + '('
+        format_string += 'brightness={0}'.format(self.brightness)
+        format_string += ', contrast={0}'.format(self.contrast)
+        format_string += ', saturation={0}'.format(self.saturation)
+        format_string += ', hue={0})'.format(self.hue)
+        return format_string
+
+
+class VideoRandomGaussianBlur(torch.nn.Module):
+    """With probability ``p``, blur each frame with its own random radius."""
+
+    def __init__(self, radius_min=0.1, radius_max=2.0, p=0.5):
+        super().__init__()  # initialise nn.Module internals
+        self.radius_min = radius_min
+        self.radius_max = radius_max
+        self.p = p
+
+    def __call__(self, imgmap):
+        assert isinstance(imgmap, list)
+        if random.random() >= self.p:  # leave the clip untouched
+            return imgmap
+        result = []
+        for img in imgmap:
+            radius = random.uniform(self.radius_min, self.radius_max)
+            result.append(img.filter(ImageFilter.GaussianBlur(radius=radius)))
+        return result
+
+
+def apply_transform(images, trans):
+    return torch.stack(trans(images), dim=0)  # stack outputs on a new dim 0
diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py
index 56b10c3a..f38ff8ae 100644
--- a/modelscope/preprocessors/multi_modal.py
+++ b/modelscope/preprocessors/multi_modal.py
@@ -1,6 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Tuple, Union
 
 import torch
 from PIL import Image
@@ -8,8 +8,9 @@ from PIL import Image
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Preprocessors
 from modelscope.pipelines.base import Input
+from modelscope.preprocessors import load_image
 from modelscope.utils.config import Config
-from modelscope.utils.constant import Fields, ModelFile, Tasks
+from modelscope.utils.constant import Fields, ModeKeys, ModelFile, Tasks
 from .base import Preprocessor
 from .builder import PREPROCESSORS
 from .ofa import *  # noqa
@@ -91,12 +92,20 @@ class OfaPreprocessor(Preprocessor):
     Fields.multi_modal, module_name=Preprocessors.mplug_tasks_preprocessor)
 class MPlugPreprocessor(Preprocessor):
 
-    def __init__(self, model_dir: str, *args, **kwargs):
+    def __init__(self,
+                 model_dir: str,
+                 mode: str = ModeKeys.INFERENCE,
+                 tokenizer_max_length: int = 25,
+                 *args,
+                 **kwargs):
         super().__init__(*args, **kwargs)
         self.model_dir = model_dir
+        self.mode = mode  # selects the output layout in __call__
+        self.tokenizer_max_length = tokenizer_max_length  # pad/truncate length
 
         self._tokenizer = None
         self._patch_resize_transform = None
+        self._image_map = {}  # str -> (image, index), filled by image_open
 
     @property
     def tokenizer(self):
@@ -126,42 +135,57 @@ class MPlugPreprocessor(Preprocessor):
             ])
         return self._patch_resize_transform
 
-    def __call__(self, *args, **kwargs):
-        call_mapping = {
-            Tasks.visual_question_answering: self.vqa_call,
-            Tasks.image_captioning: self.caption_call
-        }
+    def image_open(self, path: str) -> Tuple[Image.Image, int]:
+        if path not in self._image_map:  # first time this path is seen
+            index = len(self._image_map)  # count of distinct paths so far
+            self._image_map[path] = (load_image(path), index)
+        return self._image_map[path]  # cached (image, index) pair
 
-        self.cfg = Config.from_file(
-            osp.join(self.model_dir, ModelFile.CONFIGURATION))
-        return call_mapping[self.cfg.task](*args, **kwargs)
-
-    def vqa_call(self, data: Union[tuple, Dict[str, Any]]) -> Dict[str, Any]:
-        image: Image.Image = data[0] if isinstance(data,
-                                                   tuple) else data['image']
-        question: str = data[1] if isinstance(data,
-                                              tuple) else data['question']
-        image = image.convert('RGB')
-        image = self.patch_resize_transform(image)
-        image = torch.stack([image], dim=0)
-        question = self.tokenizer([question.lower()],
-                                  padding='longest',
-                                  return_tensors='pt')
-
-        return {'image': image, 'question': question, 'train': False}
-
-    def caption_call(
+    def __call__(
             self, data: Union[Image.Image, tuple,
                               Dict[str, Any]]) -> Dict[str, Any]:
-        if isinstance(data, Image.Image):
+        self.cfg = Config.from_file(  # configuration is re-read on each call
+            osp.join(self.model_dir, ModelFile.CONFIGURATION))
+
+        if isinstance(data, (Image.Image, str)):
             image = data
         elif isinstance(data, tuple):
             image = data[0]
         else:
             image = data['image']
+        index = 0  # stays 0 unless the image is given as a str
+        if isinstance(image, str):
+            image, index = self.image_open(image)  # cached load
         image = image.convert('RGB')
         image = self.patch_resize_transform(image)
-        image = torch.stack([image], dim=0)
-        question = self.tokenizer('', return_tensors='pt')
+        question = '' if self.cfg.task == Tasks.image_captioning \
+            else data[1 if isinstance(data, tuple)  # tuple: item 1; else a key
+                      else ('text' if 'text' in data else 'question')]
+        question = self.tokenizer(
+            question.lower(),
+            padding='max_length',
+            truncation=True,
+            max_length=self.tokenizer_max_length,
+            return_tensors='pt')
 
-        return {'image': image, 'question': question, 'train': False}
+        if self.mode == ModeKeys.INFERENCE:
+            image = torch.stack([image], dim=0)  # add a leading dim of size 1
+            return {'image': image, 'question': question}
+        else:
+            answer = data['answer']  # NOTE(review): needs dict-style data
+            answer = self.tokenizer(
+                answer,
+                padding='max_length',
+                truncation=True,
+                max_length=self.tokenizer_max_length,
+                return_tensors='pt')
+            output = {
+                'image': image,
+                'question_input_ids': question.input_ids.squeeze(),
+                'question_attention_mask': question.attention_mask.squeeze(),
+                'answer_input_ids': answer.input_ids.squeeze(),
+                'answer_attention_mask': answer.attention_mask.squeeze(),
+            }
+            if self.cfg.task == Tasks.image_text_retrieval:
+                output['index'] = index  # index from image_open, or 0
+            return output
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
deleted file mode 100644
index 25576667..00000000
--- a/modelscope/preprocessors/nlp.py
+++ /dev/null
@@ -1,647 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import os.path as osp
-import uuid
-from typing import Any, Dict, Iterable, Optional, Tuple, Union
-
-import numpy as np
-from transformers import AutoTokenizer
-
-from modelscope.metainfo import Models, Preprocessors
-from modelscope.outputs import OutputKeys
-from modelscope.utils.constant import Fields, InputFields, ModeKeys
-from modelscope.utils.hub import get_model_type, parse_label_mapping
-from modelscope.utils.type_assert import type_assert
-from .base import Preprocessor
-from .builder import PREPROCESSORS
-
-__all__ = [
-    'Tokenize', 'SequenceClassificationPreprocessor',
-    'TextGenerationPreprocessor', 'TokenClassificationPreprocessor',
-    'PairSentenceClassificationPreprocessor',
-    'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor',
-    'ZeroShotClassificationPreprocessor', 'NERPreprocessor',
-    'TextErrorCorrectionPreprocessor'
-]
-
-
-@PREPROCESSORS.register_module(Fields.nlp)
-class Tokenize(Preprocessor):
-
-    def __init__(self, tokenizer_name) -> None:
-        self._tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-
-    def __call__(self, data: Union[str, Dict[str, Any]]) -> Dict[str, Any]:
-        if isinstance(data, str):
-            data = {InputFields.text: data}
-        token_dict = self._tokenizer(data[InputFields.text])
-        data.update(token_dict)
-        return data
-
-
-@PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer)
-class SequenceClassificationPreprocessor(Preprocessor):
-
-    def __init__(self, model_dir: str, *args, **kwargs):
-        """preprocess the data
-
-        Args:
-            model_dir (str): model path
-        """
-
-        super().__init__(*args, **kwargs)
-
-        from easynlp.modelzoo import AutoTokenizer
-        self.model_dir: str = model_dir
-        self.first_sequence: str = kwargs.pop('first_sequence',
-                                              'first_sequence')
-        self.second_sequence = kwargs.pop('second_sequence', 'second_sequence')
-        self.sequence_length = kwargs.pop('sequence_length', 128)
-
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
-        print(f'this is the tokenzier {self.tokenizer}')
-        self.label2id = parse_label_mapping(self.model_dir)
-
-    @type_assert(object, (str, tuple, Dict))
-    def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]:
-        feature = super().__call__(data)
-        if isinstance(data, str):
-            new_data = {self.first_sequence: data}
-        elif isinstance(data, tuple):
-            sentence1, sentence2 = data
-            new_data = {
-                self.first_sequence: sentence1,
-                self.second_sequence: sentence2
-            }
-        else:
-            new_data = data
-
-        # preprocess the data for the model input
-
-        rst = {
-            'id': [],
-            'input_ids': [],
-            'attention_mask': [],
-            'token_type_ids': [],
-        }
-
-        max_seq_length = self.sequence_length
-
-        text_a = new_data[self.first_sequence]
-        text_b = new_data.get(self.second_sequence, None)
-        feature = self.tokenizer(
-            text_a,
-            text_b,
-            padding='max_length',
-            truncation=True,
-            max_length=max_seq_length)
-
-        rst['id'].append(new_data.get('id', str(uuid.uuid4())))
-        rst['input_ids'].append(feature['input_ids'])
-        rst['attention_mask'].append(feature['attention_mask'])
-        rst['token_type_ids'].append(feature['token_type_ids'])
-
-        return rst
-
-
-class NLPTokenizerPreprocessorBase(Preprocessor):
-
-    def __init__(self, model_dir: str, pair: bool, mode: str, **kwargs):
-        """The NLP tokenizer preprocessor base class.
-
-        Any nlp preprocessor which uses the hf tokenizer can inherit from this class.
-
-        Args:
-            model_dir (str): The local model path
-            first_sequence: The key for the first sequence
-            second_sequence: The key for the second sequence
-            label: The label key
-            label2id: An optional label2id mapping, the class will try to call utils.parse_label_mapping
-                if this mapping is not supplied.
-            pair (bool): Pair sentence input or single sentence input.
-            mode: Run this preprocessor in either 'train'/'eval'/'inference' mode
-            kwargs: These kwargs will be directly fed into the tokenizer.
-        """
-
-        super().__init__(**kwargs)
-        self.model_dir: str = model_dir
-        self.first_sequence: str = kwargs.pop('first_sequence',
-                                              'first_sequence')
-        self.second_sequence = kwargs.pop('second_sequence', 'second_sequence')
-        self.pair = pair
-        self._mode = mode
-        self.label = kwargs.pop('label', OutputKeys.LABEL)
-        self.label2id = None
-        if 'label2id' in kwargs:
-            self.label2id = kwargs.pop('label2id')
-        if self.label2id is None:
-            self.label2id = parse_label_mapping(self.model_dir)
-
-        self.tokenize_kwargs = kwargs
-        self.tokenizer = self.build_tokenizer(model_dir)
-
-    @property
-    def id2label(self):
-        """Return the id2label mapping according to the label2id mapping.
-
-        @return: The id2label mapping if exists.
-        """
-        if self.label2id is not None:
-            return {id: label for label, id in self.label2id.items()}
-        return None
-
-    def build_tokenizer(self, model_dir):
-        """Build a tokenizer by the model type.
-
-        NOTE: This default implementation only returns slow tokenizer, because the fast tokenizers have a
-        multi-thread problem.
-
-        @param model_dir:  The local model dir.
-        @return: The initialized tokenizer.
-        """
-
-        model_type = get_model_type(model_dir)
-        if model_type in (Models.structbert, Models.gpt3, Models.palm):
-            from modelscope.models.nlp.structbert import SbertTokenizer
-            return SbertTokenizer.from_pretrained(model_dir, use_fast=False)
-        elif model_type == Models.veco:
-            from modelscope.models.nlp.veco import VecoTokenizer
-            return VecoTokenizer.from_pretrained(model_dir)
-        else:
-            return AutoTokenizer.from_pretrained(model_dir, use_fast=False)
-
-    def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]:
-        """process the raw input data
-
-        Args:
-            data (tuple): [sentence1, sentence2]
-                sentence1 (str): a sentence
-                    Example:
-                        'you are so handsome.'
-                sentence2 (str): a sentence
-                    Example:
-                        'you are so beautiful.'
-        Returns:
-            Dict[str, Any]: the preprocessed data
-        """
-
-        text_a, text_b, labels = self.parse_text_and_label(data)
-        output = self.tokenizer(
-            text_a,
-            text_b,
-            return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None,
-            **self.tokenize_kwargs)
-        output = {
-            k: np.array(v) if isinstance(v, list) else v
-            for k, v in output.items()
-        }
-        self.labels_to_id(labels, output)
-        return output
-
-    def parse_text_and_label(self, data):
-        """Parse the input and return the sentences and labels.
-
-        When input type is tuple or list and its size is 2:
-        If the pair param is False, data will be parsed as the first_sentence and the label,
-        else it will be parsed as the first_sentence and the second_sentence.
-
-        @param data: The input data.
-        @return: The sentences and labels tuple.
-        """
-        text_a, text_b, labels = None, None, None
-        if isinstance(data, str):
-            text_a = data
-        elif isinstance(data, tuple) or isinstance(data, list):
-            if len(data) == 3:
-                text_a, text_b, labels = data
-            elif len(data) == 2:
-                if self.pair:
-                    text_a, text_b = data
-                else:
-                    text_a, labels = data
-        elif isinstance(data, dict):
-            text_a = data.get(self.first_sequence)
-            text_b = data.get(self.second_sequence)
-            labels = data.get(self.label)
-
-        return text_a, text_b, labels
-
-    def labels_to_id(self, labels, output):
-        """Turn the labels to id with the type int or float.
-
-        If the original label's type is str or int, the label2id mapping will try to convert it to the final label.
-        If the original label's type is float, or the label2id mapping does not exist,
-        the original label will be returned.
-
-        @param labels: The input labels.
-        @param output: The label id.
-        @return: The final labels.
-        """
-
-        def label_can_be_mapped(label):
-            return isinstance(label, str) or isinstance(label, int)
-
-        if labels is not None:
-            if isinstance(labels, Iterable) and all([label_can_be_mapped(label) for label in labels]) \
-                    and self.label2id is not None:
-                output[OutputKeys.LABELS] = [
-                    self.label2id[str(label)] for label in labels
-                ]
-            elif label_can_be_mapped(labels) and self.label2id is not None:
-                output[OutputKeys.LABELS] = self.label2id[str(labels)]
-            else:
-                output[OutputKeys.LABELS] = labels
-
-
-@PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.nli_tokenizer)
-@PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer)
-class PairSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase):
-    """The tokenizer preprocessor used in pair sentence classification.
-    """
-
-    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
-        kwargs['truncation'] = kwargs.get('truncation', True)
-        kwargs['padding'] = kwargs.get(
-            'padding', False if mode == ModeKeys.INFERENCE else 'max_length')
-        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
-        super().__init__(model_dir, pair=True, mode=mode, **kwargs)
-
-
-@PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer)
-class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase):
-    """The tokenizer preprocessor used in single sentence classification.
-    """
-
-    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
-        kwargs['truncation'] = kwargs.get('truncation', True)
-        kwargs['padding'] = kwargs.get(
-            'padding', False if mode == ModeKeys.INFERENCE else 'max_length')
-        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
-        super().__init__(model_dir, pair=False, mode=mode, **kwargs)
-
-
-@PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer)
-class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase):
-    """The tokenizer preprocessor used in zero shot classification.
-    """
-
-    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
-        """preprocess the data
-
-        Args:
-            model_dir (str): model path
-        """
-        self.sequence_length = kwargs.pop('sequence_length', 512)
-        super().__init__(model_dir, pair=False, mode=mode, **kwargs)
-
-    def __call__(self, data: Union[str, Dict], hypothesis_template: str,
-                 candidate_labels: list) -> Dict[str, Any]:
-        """process the raw input data
-
-        Args:
-            data (str or dict): a sentence
-                Example:
-                    'you are so handsome.'
-
-        Returns:
-            Dict[str, Any]: the preprocessed data
-        """
-        if isinstance(data, dict):
-            data = data.get(self.first_sequence)
-
-        pairs = [[data, hypothesis_template.format(label)]
-                 for label in candidate_labels]
-
-        features = self.tokenizer(
-            pairs,
-            padding=True,
-            truncation=True,
-            max_length=self.sequence_length,
-            truncation_strategy='only_first',
-            return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None)
-        return features
-
-
-@PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.text_gen_tokenizer)
-class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase):
-    """The tokenizer preprocessor used in text generation.
-    """
-
-    def __init__(self,
-                 model_dir: str,
-                 tokenizer=None,
-                 mode=ModeKeys.INFERENCE,
-                 **kwargs):
-        self.tokenizer = self.build_tokenizer(
-            model_dir) if tokenizer is None else tokenizer
-        kwargs['truncation'] = kwargs.get('truncation', True)
-        kwargs['padding'] = kwargs.get('padding', 'max_length')
-        kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
-                                                     False)
-        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
-        super().__init__(model_dir, pair=False, mode=mode, **kwargs)
-
-    @staticmethod
-    def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]:
-        import os
-        for name in os.listdir(model_dir):
-            full_name = os.path.join(model_dir, name)
-            if 'roberta' in name and os.path.isdir(full_name):
-                return full_name
-
-    def build_tokenizer(self, model_dir: str):
-        roberta_tokenizer_dir = self.get_roberta_tokenizer_dir(model_dir)
-        if roberta_tokenizer_dir:
-            from transformers import RobertaTokenizer
-            return RobertaTokenizer.from_pretrained(
-                roberta_tokenizer_dir, do_lower_case=False)
-        return super().build_tokenizer(model_dir)
-
-    def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]:
-        if self._mode == ModeKeys.INFERENCE:
-            return super().__call__(data)
-        src_txt = data['src_txt']
-        tgt_txt = data['tgt_txt']
-        src_rst = super().__call__(src_txt)
-        tgt_rst = super().__call__(tgt_txt)
-
-        return {
-            'src': src_rst['input_ids'],
-            'tgt': tgt_rst['input_ids'],
-            'mask_src': src_rst['attention_mask']
-        }
-
-
-@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask)
-class FillMaskPreprocessor(NLPTokenizerPreprocessorBase):
-    """The tokenizer preprocessor used in MLM task.
-    """
-
-    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
-        kwargs['truncation'] = kwargs.get('truncation', True)
-        kwargs['padding'] = kwargs.get('padding', 'max_length')
-        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
-        kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
-                                                     True)
-        super().__init__(model_dir, pair=False, mode=mode, **kwargs)
-
-
-@PREPROCESSORS.register_module(
-    Fields.nlp,
-    module_name=Preprocessors.word_segment_text_to_label_preprocessor)
-class WordSegmentationBlankSetToLabelPreprocessor(Preprocessor):
-    """The preprocessor used to turn a single sentence to a labeled token-classification dict.
-    """
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self.first_sequence: str = kwargs.pop('first_sequence',
-                                              'first_sequence')
-        self.label = kwargs.pop('label', OutputKeys.LABELS)
-
-    def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]:
-        data = data.split(' ')
-        data = list(filter(lambda x: len(x) > 0, data))
-
-        def produce_train_sample(words):
-            chars = []
-            labels = []
-            for word in words:
-                chars.extend(list(word))
-                if len(word) == 1:
-                    labels.append('S-CWS')
-                else:
-                    labels.extend(['B-CWS'] + ['I-CWS'] * (len(word) - 2)
-                                  + ['E-CWS'])
-            assert len(chars) == len(labels)
-            return chars, labels
-
-        chars, labels = produce_train_sample(data)
-        return {
-            self.first_sequence: chars,
-            self.label: labels,
-        }
-
-
-@PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.token_cls_tokenizer)
-class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
-    """The tokenizer preprocessor used in normal token classification task.
-    """
-
-    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
-        kwargs['truncation'] = kwargs.get('truncation', True)
-        kwargs['padding'] = kwargs.get(
-            'padding', False if mode == ModeKeys.INFERENCE else 'max_length')
-        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
-        self.label_all_tokens = kwargs.pop('label_all_tokens', False)
-        super().__init__(model_dir, pair=False, mode=mode, **kwargs)
-
-    def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]:
-        """process the raw input data
-
-        Args:
-            data (str): a sentence
-                Example:
-                    'you are so handsome.'
-
-        Returns:
-            Dict[str, Any]: the preprocessed data
-        """
-
-        text_a = None
-        labels_list = None
-        if isinstance(data, str):
-            text_a = data
-        elif isinstance(data, dict):
-            text_a = data.get(self.first_sequence)
-            labels_list = data.get(self.label)
-
-        if isinstance(text_a, str):
-            text_a = text_a.replace(' ', '').strip()
-
-        tokenized_inputs = self.tokenizer(
-            [t for t in text_a],
-            return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None,
-            is_split_into_words=True,
-            **self.tokenize_kwargs)
-
-        if labels_list is not None:
-            assert self.label2id is not None
-            # Map that sends B-Xxx label to its I-Xxx counterpart
-            b_to_i_label = []
-            label_enumerate_values = [
-                k for k, v in sorted(
-                    self.label2id.items(), key=lambda item: item[1])
-            ]
-            for idx, label in enumerate(label_enumerate_values):
-                if label.startswith('B-') and label.replace(
-                        'B-', 'I-') in label_enumerate_values:
-                    b_to_i_label.append(
-                        label_enumerate_values.index(
-                            label.replace('B-', 'I-')))
-                else:
-                    b_to_i_label.append(idx)
-
-            label_row = [self.label2id[lb] for lb in labels_list]
-            word_ids = tokenized_inputs.word_ids()
-            previous_word_idx = None
-            label_ids = []
-            for word_idx in word_ids:
-                if word_idx is None:
-                    label_ids.append(-100)
-                elif word_idx != previous_word_idx:
-                    label_ids.append(label_row[word_idx])
-                else:
-                    if self.label_all_tokens:
-                        label_ids.append(b_to_i_label[label_row[word_idx]])
-                    else:
-                        label_ids.append(-100)
-                previous_word_idx = word_idx
-            labels = label_ids
-            tokenized_inputs['labels'] = labels
-            # new code end
-
-        if self._mode == ModeKeys.INFERENCE:
-            tokenized_inputs[OutputKeys.TEXT] = text_a
-        return tokenized_inputs
-
-
-@PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.ner_tokenizer)
-class NERPreprocessor(Preprocessor):
-    """The tokenizer preprocessor used in normal NER task.
-
-    NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition.
-    """
-
-    def __init__(self, model_dir: str, *args, **kwargs):
-        """preprocess the data
-
-        Args:
-            model_dir (str): model path
-        """
-
-        super().__init__(*args, **kwargs)
-
-        self.model_dir: str = model_dir
-        self.sequence_length = kwargs.pop('sequence_length', 512)
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            model_dir, use_fast=False)
-        self.is_split_into_words = self.tokenizer.init_kwargs.get(
-            'is_split_into_words', False)
-
-    @type_assert(object, str)
-    def __call__(self, data: str) -> Dict[str, Any]:
-        """process the raw input data
-
-        Args:
-            data (str): a sentence
-                Example:
-                    'you are so handsome.'
-
-        Returns:
-            Dict[str, Any]: the preprocessed data
-        """
-
-        # preprocess the data for the model input
-        text = data
-        if self.is_split_into_words:
-            input_ids = []
-            label_mask = []
-            offset_mapping = []
-            for offset, token in enumerate(list(data)):
-                subtoken_ids = self.tokenizer.encode(
-                    token, add_special_tokens=False)
-                if len(subtoken_ids) == 0:
-                    subtoken_ids = [self.tokenizer.unk_token_id]
-                input_ids.extend(subtoken_ids)
-                label_mask.extend([1] + [0] * (len(subtoken_ids) - 1))
-                offset_mapping.extend([(offset, offset + 1)]
-                                      + [(offset + 1, offset + 1)]
-                                      * (len(subtoken_ids) - 1))
-            if len(input_ids) >= self.sequence_length - 2:
-                input_ids = input_ids[:self.sequence_length - 2]
-                label_mask = label_mask[:self.sequence_length - 2]
-                offset_mapping = offset_mapping[:self.sequence_length - 2]
-            input_ids = [self.tokenizer.cls_token_id
-                         ] + input_ids + [self.tokenizer.sep_token_id]
-            label_mask = [0] + label_mask + [0]
-            attention_mask = [1] * len(input_ids)
-        else:
-            encodings = self.tokenizer(
-                text,
-                add_special_tokens=True,
-                padding=True,
-                truncation=True,
-                max_length=self.sequence_length,
-                return_offsets_mapping=True)
-            input_ids = encodings['input_ids']
-            attention_mask = encodings['attention_mask']
-            word_ids = encodings.word_ids()
-            label_mask = []
-            offset_mapping = []
-            for i in range(len(word_ids)):
-                if word_ids[i] is None:
-                    label_mask.append(0)
-                elif word_ids[i] == word_ids[i - 1]:
-                    label_mask.append(0)
-                    offset_mapping[-1] = (offset_mapping[-1][0],
-                                          encodings['offset_mapping'][i][1])
-                else:
-                    label_mask.append(1)
-                    offset_mapping.append(encodings['offset_mapping'][i])
-        return {
-            'text': text,
-            'input_ids': input_ids,
-            'attention_mask': attention_mask,
-            'label_mask': label_mask,
-            'offset_mapping': offset_mapping
-        }
-
-
-@PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.text_error_correction)
-class TextErrorCorrectionPreprocessor(Preprocessor):
-    """The preprocessor used in text correction task.
-    """
-
-    def __init__(self, model_dir: str, *args, **kwargs):
-        from fairseq.data import Dictionary
-        """preprocess the data via the vocab file from the `model_dir` path
-
-        Args:
-            model_dir (str): model path
-        """
-        super().__init__(*args, **kwargs)
-        self.vocab = Dictionary.load(osp.join(model_dir, 'dict.src.txt'))
-
-    def __call__(self, data: str) -> Dict[str, Any]:
-        """process the raw input data
-
-        Args:
-            data (str): a sentence
-                Example:
-                    '随着中国经济突飞猛近，建造工业与日俱增'
-        Returns:
-            Dict[str, Any]: the preprocessed data
-            Example:
-            {'net_input':
-                {'src_tokens':tensor([1,2,3,4]),
-                'src_lengths': tensor([4])}
-            }
-        """
-
-        text = ' '.join([x for x in data])
-        inputs = self.vocab.encode_line(
-            text, append_eos=True, add_if_not_exist=False)
-        lengths = inputs.size()
-        sample = dict()
-        sample['net_input'] = {'src_tokens': inputs, 'src_lengths': lengths}
-        return sample
diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py
new file mode 100644
index 00000000..dfbb5c81
--- /dev/null
+++ b/modelscope/preprocessors/nlp/__init__.py
@@ -0,0 +1,58 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:  # seen by static type checkers only; False at runtime
+    from .text_error_correction import TextErrorCorrectionPreprocessor
+    from .nlp_base import (
+        DocumentSegmentationPreprocessor,
+        FaqQuestionAnsweringPreprocessor,
+        FillMaskPoNetPreprocessor,
+        NLPPreprocessor,
+        NLPTokenizerPreprocessorBase,
+        PassageRankingPreprocessor,
+        RelationExtractionPreprocessor,
+        SentenceEmbeddingPreprocessor,
+        SequenceClassificationPreprocessor,
+        TokenClassificationPreprocessor,
+        TextGenerationPreprocessor,
+        Text2TextGenerationPreprocessor,
+        Tokenize,
+        WordSegmentationBlankSetToLabelPreprocessor,
+        ZeroShotClassificationPreprocessor,
+    )
+
+else:
+    _import_structure = {  # submodule name -> public names it contributes
+        'nlp_base': [
+            'DocumentSegmentationPreprocessor',
+            'FaqQuestionAnsweringPreprocessor',
+            'FillMaskPoNetPreprocessor',
+            'NLPPreprocessor',
+            'NLPTokenizerPreprocessorBase',
+            'PassageRankingPreprocessor',
+            'RelationExtractionPreprocessor',
+            'SentenceEmbeddingPreprocessor',
+            'SequenceClassificationPreprocessor',
+            'TokenClassificationPreprocessor',
+            'TextGenerationPreprocessor',
+            'Tokenize',
+            'Text2TextGenerationPreprocessor',
+            'WordSegmentationBlankSetToLabelPreprocessor',
+            'ZeroShotClassificationPreprocessor',
+        ],
+        'text_error_correction': [
+            'TextErrorCorrectionPreprocessor',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(  # replaces this module's entry
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py
new file mode 100644
index 00000000..6b559de9
--- /dev/null
+++ b/modelscope/preprocessors/nlp/nlp_base.py
@@ -0,0 +1,1163 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os.path as osp
+import re
+from typing import Any, Dict, Iterable, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer
+
+from modelscope.metainfo import Models, Preprocessors
+from modelscope.outputs import OutputKeys
+from modelscope.preprocessors.base import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
+from modelscope.utils.config import Config, ConfigFields
+from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile
+from modelscope.utils.hub import get_model_type, parse_label_mapping
+from modelscope.utils.logger import get_logger
+from modelscope.utils.nlp import import_external_nltk_data
+from modelscope.utils.type_assert import type_assert
+
+logger = get_logger()
+
+__all__ = [  # names exported by `from ... import *`
+    'DocumentSegmentationPreprocessor',
+    'FaqQuestionAnsweringPreprocessor',
+    'NLPPreprocessor',
+    'FillMaskPoNetPreprocessor',
+    'NLPTokenizerPreprocessorBase',
+    'PassageRankingPreprocessor',
+    'RelationExtractionPreprocessor',
+    'SentenceEmbeddingPreprocessor',
+    'SequenceClassificationPreprocessor',
+    'TokenClassificationPreprocessor',
+    'Text2TextGenerationPreprocessor',
+    'TextGenerationPreprocessor',
+    'Tokenize',
+    'WordSegmentationBlankSetToLabelPreprocessor',
+    'ZeroShotClassificationPreprocessor',
+]
+
+
+@PREPROCESSORS.register_module(Fields.nlp)
+class Tokenize(Preprocessor):  # thin wrapper around a transformers AutoTokenizer
+
+    def __init__(self, tokenizer_name) -> None:  # NOTE(review): does not call super().__init__()
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+
+    def __call__(self, data: Union[str, Dict[str, Any]]) -> Dict[str, Any]:
+        if isinstance(data, str):
+            data = {InputFields.text: data}  # wrap a bare string into a dict
+        token_dict = self.tokenizer(data[InputFields.text])
+        data.update(token_dict)  # merged in place: a dict passed by the caller is mutated
+        return data
+
+
+class NLPTokenizerPreprocessorBase(Preprocessor):
+
+    def __init__(self, model_dir: str, mode: str, **kwargs):
+        """The NLP tokenizer preprocessor base class.
+
+        Any nlp preprocessor which uses the hf tokenizer can inherit from this class.
+
+        Args:
+            model_dir (str): The local model path
+            mode: Run this preprocessor in either 'train'/'eval'/'inference' mode
+            first_sequence / second_sequence: The keys of the two sentences in dict inputs
+            sequence_length: Stored on the instance (default 128)
+            label: The label key (default OutputKeys.LABEL)
+            label2id: An optional label2id mapping; when it is absent or None the class calls
+                parse_label_mapping(model_dir) instead.
+            kwargs: Whatever remains once the keys above are popped is fed into the tokenizer.
+        """
+
+        super().__init__(**kwargs)
+        self.model_dir: str = model_dir
+        self.first_sequence: str = kwargs.pop('first_sequence',
+                                              'first_sequence')
+        self.second_sequence = kwargs.pop('second_sequence', 'second_sequence')
+        self.sequence_length = kwargs.pop('sequence_length', 128)
+
+        self._mode = mode
+        self.label = kwargs.pop('label', OutputKeys.LABEL)
+        self.label2id = None
+        if 'label2id' in kwargs:
+            self.label2id = kwargs.pop('label2id')
+        if self.label2id is None:
+            self.label2id = parse_label_mapping(self.model_dir)
+
+        self.tokenize_kwargs = kwargs  # the leftovers are forwarded to the tokenizer in __call__
+
+        self.tokenizer = self.build_tokenizer(model_dir)
+
+    @property
+    def id2label(self):
+        """Inverse view of label2id: each label id mapped back to its label.
+
+        @return: The id2label dict, or None when there is no label2id mapping.
+        """
+        if self.label2id is None:
+            return None
+        return {idx: name for name, idx in self.label2id.items()}
+
+    def build_tokenizer(self, model_dir):
+        """Build a tokenizer by the model type.
+
+        NOTE: In inference mode the slow tokenizer is returned, because the fast version makes
+        parallel inference fail; in any other mode the fast tokenizer is returned.
+
+        @param model_dir:  The local model dir.
+        @return: The initialized tokenizer.
+        """
+        self.is_transformer_based_model = 'lstm' not in model_dir  # judged from the path text only
+        # the fast tokenizers make parallel inference fail, so inference uses the slow ones
+        model_type = get_model_type(model_dir)
+        if model_type in (Models.structbert, Models.gpt3, Models.palm,
+                          Models.plug):
+            from modelscope.models.nlp.structbert import SbertTokenizer, SbertTokenizerFast
+            return SbertTokenizer.from_pretrained(
+                model_dir
+            ) if self._mode == ModeKeys.INFERENCE else SbertTokenizerFast.from_pretrained(
+                model_dir)
+        elif model_type == Models.veco:
+            from modelscope.models.nlp.veco import VecoTokenizer, VecoTokenizerFast
+            return VecoTokenizer.from_pretrained(
+                model_dir
+            ) if self._mode == ModeKeys.INFERENCE else VecoTokenizerFast.from_pretrained(
+                model_dir)
+        elif model_type == Models.deberta_v2:
+            from modelscope.models.nlp.deberta_v2 import DebertaV2Tokenizer, DebertaV2TokenizerFast
+            return DebertaV2Tokenizer.from_pretrained(
+                model_dir
+            ) if self._mode == ModeKeys.INFERENCE else DebertaV2TokenizerFast.from_pretrained(
+                model_dir)
+        elif not self.is_transformer_based_model:
+            from transformers import BertTokenizer, BertTokenizerFast
+            return BertTokenizer.from_pretrained(
+                model_dir
+            ) if self._mode == ModeKeys.INFERENCE else BertTokenizerFast.from_pretrained(
+                model_dir)
+        else:
+            return AutoTokenizer.from_pretrained(
+                model_dir,
+                use_fast=False if self._mode == ModeKeys.INFERENCE else True)
+
+    def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]:
+        """Tokenize the raw input and attach the label ids when labels are present.
+
+        Args:
+            data (str, tuple, list or dict): the input in any form accepted by
+                `parse_text_and_label`, for instance a sentence pair:
+                    sentence1 (str): a sentence
+                        Example: 'you are so handsome.'
+                    sentence2 (str): a sentence
+                        Example: 'you are so beautiful.'
+        Returns:
+            Dict[str, Any]: the tokenizer output, with list values converted to
+                numpy arrays and, if labels were supplied, an OutputKeys.LABELS entry.
+        """
+
+        text_a, text_b, labels = self.parse_text_and_label(data)
+        output = self.tokenizer(
+            text_a,
+            text_b,
+            return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None,
+            **self.tokenize_kwargs)
+        output = {
+            k: np.array(v) if isinstance(v, list) else v
+            for k, v in output.items()
+        }
+        self.labels_to_id(labels, output)
+        return output
+
+    def parse_text_and_label(self, data):
+        """Parse the input and return the sentences and labels.
+
+        When input type is tuple or list and its size is 2:
+        In inference mode the data is parsed as the first_sentence and the second_sentence,
+        in any other mode it is parsed as the first_sentence and the label.
+
+        @param data: The input data: a str, a tuple/list of 2 or 3 items, or a dict.
+        @return: The (text_a, text_b, labels) tuple; parts that are not provided are None.
+        """
+        text_a, text_b, labels = None, None, None
+        if isinstance(data, str):
+            text_a = data
+        elif isinstance(data, tuple) or isinstance(data, list):
+            if len(data) == 3:
+                text_a, text_b, labels = data
+            elif len(data) == 2:
+                if self._mode == ModeKeys.INFERENCE:
+                    text_a, text_b = data
+                else:
+                    text_a, labels = data
+        elif isinstance(data, dict):
+            text_a = data.get(self.first_sequence)
+            text_b = data.get(self.second_sequence)
+            labels = data.get(self.label)
+
+        return text_a, text_b, labels
+
+    def labels_to_id(self, labels, output):
+        """Turn the labels to id with the type int or float and store them in `output`.
+
+        If the original label's type is str or int, the label2id mapping will try to convert it to the final label.
+        If the original label's type is float, or the label2id mapping does not exist, the original label is kept.
+
+        @param labels: One label, an iterable of labels, or None (in which case nothing is stored).
+        @param output: The dict receiving the result under OutputKeys.LABELS; it is modified in place.
+        @return: None.
+        """
+
+        def label_can_be_mapped(label):
+            return isinstance(label, (str, int))
+
+        if labels is not None:
+            # A str is Iterable too; it must reach the elif branch as ONE label, not be mapped per character.
+            if isinstance(labels, Iterable) and not isinstance(labels, str) and self.label2id is not None \
+                    and all([label_can_be_mapped(label) for label in labels]):
+                output[OutputKeys.LABELS] = [
+                    self.label2id[str(label)] for label in labels
+                ]
+            elif label_can_be_mapped(labels) and self.label2id is not None:
+                output[OutputKeys.LABELS] = self.label2id[str(labels)]
+            else:
+                output[OutputKeys.LABELS] = labels
+
+
+@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask)
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.feature_extraction)
+class NLPPreprocessor(NLPTokenizerPreprocessorBase):
+    """The tokenizer preprocessor registered for the fill-mask (MLM) and feature-extraction tasks.
+    """
+
+    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
+        # Defaults only fill gaps; max_length always comes from sequence_length.
+        kwargs.setdefault('truncation', True)
+        kwargs.setdefault('padding', 'max_length')
+        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
+        kwargs.setdefault('return_token_type_ids', True)
+        super().__init__(model_dir, mode=mode, **kwargs)
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.passage_ranking)
+class PassageRankingPreprocessor(NLPTokenizerPreprocessorBase):
+    """The tokenizer preprocessor used in passage ranking model.
+    """
+
+    def __init__(self,
+                 model_dir: str,
+                 mode=ModeKeys.INFERENCE,
+                 *args,
+                 **kwargs):
+        """Set up the sentence keys, the sequence length and the tokenizer.
+
+        Args:
+            model_dir (str): model path; the tokenizer is loaded from it with AutoTokenizer
+        """
+        super().__init__(model_dir, pair=True, mode=mode, *args, **kwargs)
+        self.model_dir: str = model_dir
+        self.first_sequence: str = kwargs.pop('first_sequence',
+                                              'source_sentence')
+        self.second_sequence = kwargs.pop('second_sequence',
+                                          'sentences_to_compare')
+        self.sequence_length = kwargs.pop('sequence_length', 128)
+
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
+
+    @type_assert(object, (str, tuple, Dict))
+    def __call__(self, data: Union[tuple, Dict]) -> Dict[str, Any]:
+        # tuple -> (source, candidates); dict -> read through the configured keys.
+        if isinstance(data, tuple):
+            sentence1, sentence2 = data
+        elif isinstance(data, dict):
+            sentence1 = data.get(self.first_sequence)
+            sentence2 = data.get(self.second_sequence)
+        else:
+            raise TypeError('data must be a tuple or a dict, got '
+                            + type(data).__name__)
+        if isinstance(sentence2, str):
+            sentence2 = [sentence2]
+        if isinstance(sentence1, str):
+            sentence1 = [sentence1]
+        # Repeat the source side once per candidate so both lists line up.
+        sentence1 = sentence1 * len(sentence2)
+        feature = self.tokenizer(
+            sentence1,
+            sentence2,
+            padding='max_length',
+            truncation=True,
+            max_length=self.sequence_length,
+            return_tensors='pt')
+        for key in ('labels', 'qid'):  # only dict inputs carry these extras
+            if isinstance(data, dict) and key in data:
+                feature[key] = data[key]
+        return feature
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.nli_tokenizer)
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer)
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer)
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer)
+class SequenceClassificationPreprocessor(NLPTokenizerPreprocessorBase):
+    """Tokenizer preprocessor for sequence classification; padding is off by default in inference mode.
+    """
+
+    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
+        pad_default = False if mode == ModeKeys.INFERENCE else 'max_length'
+        kwargs.setdefault('truncation', True)
+        kwargs.setdefault('padding', pad_default)
+        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
+        super().__init__(model_dir, mode=mode, **kwargs)
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.sentence_embedding)
+class SentenceEmbeddingPreprocessor(NLPTokenizerPreprocessorBase):
+    """Tokenizer preprocessor which batches a source sentence with its candidates for sentence embedding.
+    """
+
+    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
+        pad_default = False if mode == ModeKeys.INFERENCE else 'max_length'
+        kwargs.setdefault('truncation', True)
+        kwargs.setdefault('padding', pad_default)
+        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
+        super().__init__(model_dir, pair=False, mode=mode, **kwargs)
+
+    def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]:
+        """Tokenize one source sentence together with its candidate sentences.
+
+        Args:
+            data Dict:
+                keys: "source_sentence" && "sentences_to_compare"
+                values: list of sentences (only source_sentence[0] is used)
+                Example:
+                    {"source_sentence": ["how long it take to get a master's degree"],
+                     "sentences_to_compare": ["On average, students take about 18 to 24 months
+                     to complete a master's degree.",
+                     "On the other hand, some students prefer to go at a slower pace
+                     and choose to take several years to complete their studies.",
+                     "It can take anywhere from two semesters"]}
+        Returns:
+            Dict[str, Any]: the tokenizer output for [source, *candidates]
+        """
+        source_sentence = data['source_sentence']
+        compare_sentences = data['sentences_to_compare']
+        # Batch layout: the first source sentence, then each candidate in order.
+        sentences = [source_sentence[0]]
+        sentences.extend(compare_sentences)
+
+        # Torch tensors are requested only in inference mode.
+        tensor_type = 'pt' if self._mode == ModeKeys.INFERENCE else None
+        return self.tokenizer(
+            sentences,
+            return_tensors=tensor_type,
+            padding=True,
+            truncation=True)
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer)
+class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase):
+    """The tokenizer preprocessor used in zero shot classification.
+    """
+
+    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
+        """preprocess the data
+
+        Args:
+            model_dir (str): model path; `sequence_length` in kwargs defaults to 512 (the base reads it)
+        """
+        kwargs.setdefault('sequence_length', 512)
+        super().__init__(model_dir, mode=mode, **kwargs)
+
+    def __call__(self, data: Union[str, Dict], hypothesis_template: str,
+                 candidate_labels: list) -> Dict[str, Any]:
+        """process the raw input data
+
+        Args:
+            data (str or dict): a sentence, or a dict holding it under `first_sequence`
+                Example: 'you are so handsome.'
+            hypothesis_template (str): format string which receives each label
+            candidate_labels (list): the labels; one hypothesis is built per label
+        Returns:
+            Dict[str, Any]: the tokenizer output for the [sentence, hypothesis] pairs
+        """
+        if isinstance(data, dict):
+            data = data.get(self.first_sequence)
+
+        pairs = [[data, hypothesis_template.format(label)]
+                 for label in candidate_labels]
+
+        features = self.tokenizer(
+            pairs,
+            padding=True,
+            truncation=True,
+            max_length=self.sequence_length,
+            truncation_strategy='only_first',  # NOTE(review): check it still applies with truncation=True
+            return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None)
+        return features
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.text2text_gen_preprocessor)
+class Text2TextGenerationPreprocessor(NLPTokenizerPreprocessorBase):
+    """The tokenizer preprocessor used in text-to-text generation; honours a caller-supplied tokenizer.
+    """
+
+    def __init__(self,
+                 model_dir: str,
+                 tokenizer=None,
+                 mode=ModeKeys.INFERENCE,
+                 **kwargs):
+        kwargs['truncation'] = kwargs.get('truncation', 'do_not_truncate')
+        kwargs['padding'] = kwargs.get('padding', False)
+        kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
+                                                     False)
+        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
+        super().__init__(model_dir, mode=mode, **kwargs)
+        if tokenizer is not None:  # prefer the caller's tokenizer
+            self.tokenizer = tokenizer
+
+    def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]:
+        text_a, _, _ = self.parse_text_and_label(data)
+
+        inputs = self.tokenizer(
+            text_a,
+            return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None,
+            **self.tokenize_kwargs)
+
+        # This is produced by tokenizers but is an invalid generate kwargs
+        if 'token_type_ids' in inputs:
+            del inputs['token_type_ids']
+        return inputs
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.text_gen_tokenizer)
+class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase):
+    """The tokenizer preprocessor used in text generation (prefers a 'roberta' sub-directory tokenizer).
+    """
+
+    def __init__(self,
+                 model_dir: str,
+                 tokenizer=None,  # NOTE(review): accepted but never used in this class
+                 mode=ModeKeys.INFERENCE,
+                 **kwargs):
+        kwargs['truncation'] = kwargs.get('truncation', True)
+        kwargs['padding'] = kwargs.get('padding', 'max_length')
+        kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
+                                                     False)
+        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
+        super().__init__(model_dir, mode=mode, **kwargs)
+
+    @staticmethod
+    def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]:
+        import os
+        for name in os.listdir(model_dir):  # first match wins; falls through to None
+            full_name = os.path.join(model_dir, name)
+            if 'roberta' in name and os.path.isdir(full_name):
+                return full_name
+
+    def build_tokenizer(self, model_dir: str):
+        roberta_tokenizer_dir = self.get_roberta_tokenizer_dir(model_dir)
+        if roberta_tokenizer_dir:
+            from transformers import RobertaTokenizer
+            return RobertaTokenizer.from_pretrained(
+                roberta_tokenizer_dir, do_lower_case=False)
+        return super().build_tokenizer(model_dir)  # no roberta dir: use the base-class choice
+
+    def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]:
+        if self._mode == ModeKeys.INFERENCE:
+            return super().__call__(data)
+        src_rst = super().__call__(data['src_txt'])  # outside inference, data is indexed by key
+        src_input_ids = src_rst['input_ids']
+        src_attention_mask = src_rst['attention_mask']
+        if 'tgt_txt' in data:
+            labels = super().__call__(data['tgt_txt'])['input_ids']
+        else:
+            labels = src_input_ids[1:]  # no target given: labels are the source ids shifted left by one
+            src_input_ids = src_input_ids[:-1]
+            src_attention_mask = src_attention_mask[:-1]
+
+        return {
+            'input_ids': src_input_ids,
+            'attention_mask': src_attention_mask,
+            'labels': labels,
+        }
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp,
+    module_name=Preprocessors.word_segment_text_to_label_preprocessor)
+class WordSegmentationBlankSetToLabelPreprocessor(Preprocessor):
+    """Turn a blank-separated sentence into characters plus CWS tags.
+
+    Single-character words are tagged 'S-CWS'; longer words are tagged
+    'B-CWS', then 'I-CWS' for inner characters, then 'E-CWS'.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # Output dict keys for the character list and the tag list.
+        self.first_sequence: str = kwargs.pop('first_sequence',
+                                              'first_sequence')
+        self.label = kwargs.pop('label', OutputKeys.LABELS)
+
+    def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]:
+        # Splitting on a single blank yields '' for repeated blanks; drop them.
+        words = [piece for piece in data.split(' ') if piece]
+
+        chars = []
+        labels = []
+        for word in words:
+            chars += list(word)
+            if len(word) == 1:
+                labels.append('S-CWS')
+                continue
+            inner = ['I-CWS'] * (len(word) - 2)
+            labels += ['B-CWS', *inner, 'E-CWS']
+        assert len(chars) == len(labels)
+        return {
+            self.first_sequence: chars,
+            self.label: labels,
+        }
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.ner_tokenizer)
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.token_cls_tokenizer)
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer)
+class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
+    """The tokenizer preprocessor used in normal NER task; marks word-start
+    tokens in `label_mask` and keeps one `offset_mapping` span per word."""
+
+    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
+        """Configure tokenizer defaults (truncation, padding, max_length).
+
+        Args:
+            model_dir (str): model path
+        """
+        kwargs['truncation'] = kwargs.get('truncation', True)
+        kwargs['padding'] = kwargs.get(
+            'padding', False if mode == ModeKeys.INFERENCE else 'max_length')
+        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
+        self.label_all_tokens = kwargs.pop('label_all_tokens', False)
+        super().__init__(model_dir, mode=mode, **kwargs)
+
+        if 'is_split_into_words' in kwargs:
+            self.is_split_into_words = kwargs.pop('is_split_into_words')
+        else:
+            self.is_split_into_words = self.tokenizer.init_kwargs.get(
+                'is_split_into_words', False)
+        if 'label2id' in kwargs:
+            kwargs.pop('label2id')
+        self.tokenize_kwargs = kwargs
+
+    @type_assert(object, str)
+    def __call__(self, data: str) -> Dict[str, Any]:
+        """Tokenize one sentence and mark which tokens start a word.
+
+        Args:
+            data (str): a sentence, e.g. 'you are so handsome.'
+
+        Returns:
+            Dict[str, Any]: 'text', 'input_ids', 'attention_mask',
+                'label_mask' (1 on the first sub-token of each word),
+                'offset_mapping', plus 'labels' when labels are supplied.
+        """
+
+        # preprocess the data for the model input
+        text = None
+        labels_list = None
+        if isinstance(data, str):
+            text = data
+        elif isinstance(data, dict):
+            text = data.get(self.first_sequence)
+            labels_list = data.get(self.label)
+
+        input_ids = []
+        label_mask = []
+        offset_mapping = []
+        if self.is_split_into_words:
+            for offset, token in enumerate(list(data)):
+                subtoken_ids = self.tokenizer.encode(
+                    token, add_special_tokens=False)
+                if len(subtoken_ids) == 0:
+                    subtoken_ids = [self.tokenizer.unk_token_id]
+                input_ids.extend(subtoken_ids)
+                label_mask.extend([1] + [0] * (len(subtoken_ids) - 1))
+                offset_mapping.extend([(offset, offset + 1)])
+        else:
+            if self.tokenizer.is_fast:
+                encodings = self.tokenizer(
+                    text,
+                    add_special_tokens=False,
+                    return_offsets_mapping=True,
+                    **self.tokenize_kwargs)
+                input_ids = encodings['input_ids']
+                word_ids = encodings.word_ids()
+                for i in range(len(word_ids)):
+                    if word_ids[i] is None:
+                        label_mask.append(0)
+                    # guard i == 0: word_ids[-1] is the *last* token's word
+                    elif i > 0 and word_ids[i] == word_ids[i - 1]:
+                        label_mask.append(0)
+                        word_end = encodings['offset_mapping'][i][1]
+                        offset_mapping[-1] = (offset_mapping[-1][0], word_end)
+                    else:
+                        label_mask.append(1)
+                        offset_mapping.append(encodings['offset_mapping'][i])
+            else:
+                encodings = self.tokenizer(
+                    text, add_special_tokens=False, **self.tokenize_kwargs)
+                input_ids = encodings['input_ids']
+                label_mask, offset_mapping = self.get_label_mask_and_offset_mapping(
+                    text)
+
+        if len(input_ids) >= self.sequence_length - 2:
+            input_ids = input_ids[:self.sequence_length - 2]
+            label_mask = label_mask[:self.sequence_length - 2]
+        input_ids = [self.tokenizer.cls_token_id
+                     ] + input_ids + [self.tokenizer.sep_token_id]
+        label_mask = [0] + label_mask + [0]
+        attention_mask = [1] * len(input_ids)
+        offset_mapping = offset_mapping[:sum(label_mask)]
+
+        if not self.is_transformer_based_model:
+            input_ids = input_ids[1:-1]
+            attention_mask = attention_mask[1:-1]
+            label_mask = label_mask[1:-1]
+
+        if self._mode == ModeKeys.INFERENCE:
+            input_ids = torch.tensor(input_ids).unsqueeze(0)
+            attention_mask = torch.tensor(attention_mask).unsqueeze(0)
+            label_mask = torch.tensor(
+                label_mask, dtype=torch.bool).unsqueeze(0)
+
+        # the token classification
+        output = {
+            'text': text,
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'label_mask': label_mask,
+            'offset_mapping': offset_mapping
+        }
+        # NOTE(review): `word_ids` below only exists on the fast-tokenizer path.
+        # align the labels with tokenized text
+        if labels_list is not None:
+            assert self.label2id is not None
+            # Map that sends B-Xxx label to its I-Xxx counterpart
+            b_to_i_label = []
+            label_enumerate_values = [
+                k for k, v in sorted(
+                    self.label2id.items(), key=lambda item: item[1])
+            ]
+            for idx, label in enumerate(label_enumerate_values):
+                if label.startswith('B-') and label.replace(
+                        'B-', 'I-') in label_enumerate_values:
+                    b_to_i_label.append(
+                        label_enumerate_values.index(
+                            label.replace('B-', 'I-')))
+                else:
+                    b_to_i_label.append(idx)
+
+            label_row = [self.label2id[lb] for lb in labels_list]
+            previous_word_idx = None
+            label_ids = []
+            for word_idx in word_ids:
+                if word_idx is None:
+                    label_ids.append(-100)
+                elif word_idx != previous_word_idx:
+                    label_ids.append(label_row[word_idx])
+                else:
+                    if self.label_all_tokens:
+                        label_ids.append(b_to_i_label[label_row[word_idx]])
+                    else:
+                        label_ids.append(-100)
+                previous_word_idx = word_idx
+            labels = label_ids
+            output['labels'] = labels
+        return output
+
+    def get_tokenizer_class(self):
+        """Class name of the tokenizer with a trailing 'Fast' stripped."""
+        name = self.tokenizer.__class__.__name__
+        if name.endswith('Fast') and name != 'PreTrainedTokenizerFast':
+            name = name[:-4]
+        return name
+
+    def get_label_mask_and_offset_mapping(self, text):
+        """Find each slow-tokenizer token in `text`; return flags and spans."""
+        label_mask, offset_mapping = [], []
+        tokens = self.tokenizer.tokenize(text)
+        offset = 0
+        if self.get_tokenizer_class() == 'BertTokenizer':
+            for token in tokens:
+                is_start = (token[:2] != '##')
+                if is_start:
+                    label_mask.append(True)
+                else:
+                    token = token[2:]
+                    label_mask.append(False)
+                start = offset + text[offset:].index(token)
+                end = start + len(token)
+                if is_start:
+                    offset_mapping.append((start, end))
+                else:
+                    offset_mapping[-1] = (offset_mapping[-1][0], end)
+                offset = end
+        elif self.get_tokenizer_class() == 'XLMRobertaTokenizer':
+            last_is_blank = False
+            for token in tokens:
+                is_start = (token[0] == '▁')
+                if is_start:
+                    token = token[1:]
+                    label_mask.append(True)
+                    if len(token) == 0:
+                        last_is_blank = True
+                        continue
+                else:
+                    label_mask.append(False)
+                start = offset + text[offset:].index(token)
+                end = start + len(token)
+                if last_is_blank or is_start:
+                    offset_mapping.append((start, end))
+                else:
+                    offset_mapping[-1] = (offset_mapping[-1][0], end)
+                offset = end
+                last_is_blank = False
+        else:
+            raise NotImplementedError
+
+        return label_mask, offset_mapping
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.re_tokenizer)
+class RelationExtractionPreprocessor(Preprocessor):
+    """Tokenize a single sentence for the relation extraction (RE) task.
+    """
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """Load the fast tokenizer stored under `model_dir`.
+
+        Args:
+            model_dir (str): model path
+        """
+
+        super().__init__(*args, **kwargs)
+
+        self.model_dir: str = model_dir
+        self.sequence_length = kwargs.pop('sequence_length', 512)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_dir, use_fast=True)
+
+    @type_assert(object, str)
+    def __call__(self, data: str) -> Dict[str, Any]:
+        """Encode one sentence as a one-item batch.
+
+        Args:
+            data (str): a sentence
+                Example:
+                    'you are so handsome.'
+
+        Returns:
+            Dict[str, Any]: 'text' (the input), 'input_ids' and
+                'attention_mask' (requested as 'pt'), 'offsets' of item 0.
+        """
+
+        # wrap the sentence in a list so it is encoded as a one-item batch
+        sentence = data
+        encoded = self.tokenizer([sentence], return_tensors='pt')
+        return dict(
+            text=sentence,
+            input_ids=encoded['input_ids'],
+            attention_mask=encoded['attention_mask'],
+            offsets=encoded[0].offsets)
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.faq_question_answering_preprocessor)
+class FaqQuestionAnsweringPreprocessor(Preprocessor):
+    """Turn a query set and a labelled support set into padded token ids."""
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        super().__init__(model_dir, mode=ModeKeys.INFERENCE, **kwargs)
+        import os
+        from transformers import BertTokenizer
+
+        from modelscope.utils.config import Config
+        from modelscope.utils.constant import ModelFile
+        self.tokenizer = BertTokenizer.from_pretrained(model_dir)
+        config_path = os.path.join(model_dir, ModelFile.CONFIGURATION)
+        preprocessor_config = Config.from_file(config_path).get(
+            ConfigFields.preprocessor, {})
+        self.MAX_LEN = preprocessor_config.get('max_seq_length', 50)
+        self.label_dict = None
+
+    def pad(self, samples, max_len):
+        """Cut every sample to max_len, then right-fill with the pad id."""
+        pad_id = self.tokenizer.pad_token_id
+        clipped = [ids[:max_len] for ids in samples]
+        return [
+            ids + [pad_id] * (max_len - len(ids)) for ids in clipped
+        ]
+
+    def set_label_dict(self, label_dict):
+        self.label_dict = label_dict
+
+    def get_label(self, label_id):
+        assert self.label_dict is not None and label_id < len(self.label_dict)
+        return self.label_dict[label_id]
+
+    def encode_plus(self, text):
+        """Token ids of `text` wrapped in the CLS and SEP ids."""
+        tok = self.tokenizer
+        ids = tok.convert_tokens_to_ids(tok.tokenize(text))
+        return [tok.cls_token_id] + ids + [tok.sep_token_id]
+
+    @type_assert(object, Dict)
+    def __call__(self, data: Dict[str, Any],
+                 **preprocessor_param) -> Dict[str, Any]:
+        """Encode data['query_set'] and data['support_set'] to equal length.
+
+        Support items are sorted by 'label'; label ids follow first
+        appearance in that order and are stored via `set_label_dict`.
+        """
+        length_cap = preprocessor_param.get('max_seq_length', self.MAX_LEN)
+        queries = data['query_set']
+        if not isinstance(queries, list):
+            queries = [queries]
+        support = sorted(data['support_set'], key=lambda d: d['label'])
+
+        query_ids = [self.encode_plus(text) for text in queries]
+        support_ids = [self.encode_plus(item['text']) for item in support]
+
+        longest = max(len(seq) for seq in query_ids + support_ids)
+        target_len = min(length_cap, longest)
+        padded_queries = self.pad(query_ids, target_len)
+        padded_support = self.pad(support_ids, target_len)
+
+        support_labels = [item['label'] for item in support]
+        label_dict = []
+        for label in support_labels:
+            if label not in label_dict:
+                label_dict.append(label)
+        self.set_label_dict(label_dict)
+        return {
+            'query': padded_queries,
+            'support': padded_support,
+            'support_labels': [
+                label_dict.index(label) for label in support_labels
+            ],
+        }
+
+    def batch_encode(self, sentence_list: list, max_length=None):
+        """Batch-encode sentences; a falsy max_length means self.MAX_LEN."""
+        limit = max_length or self.MAX_LEN
+        return self.tokenizer.batch_encode_plus(
+            sentence_list, padding=True, max_length=limit)
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.document_segmentation)
+class DocumentSegmentationPreprocessor(Preprocessor):
+    """Split sentence-level documents into fixed-length token windows."""
+
+    def __init__(self, model_dir: str, config, *args, **kwargs):
+        """preprocess the data
+
+        Args:
+            model_dir (str): model path
+            config: object whose `max_position_embeddings` is the window size
+        """
+
+        super().__init__(*args, **kwargs)
+        from transformers import BertTokenizerFast
+        self.tokenizer = BertTokenizerFast.from_pretrained(
+            model_dir,
+            use_fast=True,
+        )
+        self.question_column_name = 'labels'
+        self.context_column_name = 'sentences'
+        self.example_id_column_name = 'example_id'
+        self.label_to_id = {'B-EOP': 0, 'O': 1}
+        self.target_specical_ids = set()
+        self.target_specical_ids.add(self.tokenizer.eos_token_id)
+        self.max_seq_length = config.max_position_embeddings
+        self.label_list = ['B-EOP', 'O']
+
+    def __call__(self, examples) -> Dict[str, Any]:
+        """Tokenize documents and cut them into windows of max_seq_length.
+
+        Args:
+            examples: batch with parallel lists under 'labels' (one label
+                per sentence), 'sentences' and 'example_id'.
+
+        Returns:
+            Dict[str, Any]: per-window lists 'input_ids', 'token_type_ids',
+                'attention_mask', 'segment_ids', 'example_id', 'labels' and
+                'sentences'; an empty dict if tokenization fails.
+        """
+        questions = examples[self.question_column_name]
+        contexts = examples[self.context_column_name]
+        example_ids = examples[self.example_id_column_name]
+        num_examples = len(questions)
+
+        sentences = []
+        for sentence_list in contexts:
+            sentence_list = [_ + '[EOS]' for _ in sentence_list]
+            sentences.append(sentence_list)
+
+        try:
+            tokenized_examples = self.tokenizer(
+                sentences,
+                is_split_into_words=True,
+                add_special_tokens=False,
+                return_token_type_ids=True,
+                return_attention_mask=True,
+            )
+        except Exception as e:
+            logger.error(e)
+            return {}
+
+        segment_ids = []
+        token_seq_labels = []
+        for example_index in range(num_examples):
+            example_input_ids = tokenized_examples['input_ids'][example_index]
+            example_labels = questions[example_index]
+            example_labels = [
+                self.label_to_id[_] if _ in self.label_to_id else -100
+                for _ in example_labels
+            ]
+            example_token_labels = []
+            segment_id = []
+            cur_seg_id = 1
+            for token_index in range(len(example_input_ids)):
+                if example_input_ids[token_index] in self.target_specical_ids:
+                    example_token_labels.append(example_labels[cur_seg_id - 1])
+                    segment_id.append(cur_seg_id)
+                    cur_seg_id += 1
+                else:
+                    example_token_labels.append(-100)
+                    segment_id.append(cur_seg_id)
+
+            segment_ids.append(segment_id)
+            token_seq_labels.append(example_token_labels)
+
+        tokenized_examples['segment_ids'] = segment_ids
+        tokenized_examples['token_seq_labels'] = token_seq_labels
+
+        new_segment_ids = []
+        new_token_seq_labels = []
+        new_input_ids = []
+        new_token_type_ids = []
+        new_attention_mask = []
+        new_example_ids = []
+        new_sentences = []
+
+        for example_index in range(num_examples):
+            example_input_ids = tokenized_examples['input_ids'][example_index]
+            example_token_type_ids = tokenized_examples['token_type_ids'][
+                example_index]
+            example_attention_mask = tokenized_examples['attention_mask'][
+                example_index]
+            example_segment_ids = tokenized_examples['segment_ids'][
+                example_index]
+            example_token_seq_labels = tokenized_examples['token_seq_labels'][
+                example_index]
+            example_sentences = contexts[example_index]
+            example_id = example_ids[example_index]
+            example_total_num_sentences = len(questions[example_index])
+            example_total_num_tokens = len(
+                tokenized_examples['input_ids'][example_index])
+            accumulate_length = [
+                i for i, x in enumerate(tokenized_examples['input_ids']
+                                        [example_index])
+                if x == self.tokenizer.eos_token_id
+            ]
+            left_index = 0
+            sent_left_index = 0
+            sent_i = 0
+
+            # sent_i is not advanced when a sentence must open the next window
+            while sent_i < len(accumulate_length):
+                length = accumulate_length[sent_i]
+                right_index = length + 1
+                sent_right_index = sent_i + 1
+                if right_index - left_index >= self.max_seq_length - 1 or right_index == example_total_num_tokens:
+                    # build one window: [CLS] + tokens, cut to max_seq_length
+                    sample_input_ids = [
+                        self.tokenizer.cls_token_id
+                    ] + example_input_ids[left_index:right_index]
+                    sample_input_ids = sample_input_ids[:self.max_seq_length]
+
+                    sample_token_type_ids = [
+                        0
+                    ] + example_token_type_ids[left_index:right_index]
+                    sample_token_type_ids = sample_token_type_ids[:self.
+                                                                  max_seq_length]
+
+                    sample_attention_mask = [
+                        1
+                    ] + example_attention_mask[left_index:right_index]
+                    sample_attention_mask = sample_attention_mask[:self.
+                                                                  max_seq_length]
+
+                    sample_segment_ids = [
+                        0
+                    ] + example_segment_ids[left_index:right_index]
+                    sample_segment_ids = sample_segment_ids[:self.
+                                                            max_seq_length]
+
+                    sample_token_seq_labels = [
+                        -100
+                    ] + example_token_seq_labels[left_index:right_index]
+                    sample_token_seq_labels = sample_token_seq_labels[:self.
+                                                                      max_seq_length]
+
+                    if sent_right_index - 1 == sent_left_index:
+                        left_index = right_index
+                        sample_input_ids[-1] = self.tokenizer.eos_token_id
+                        sample_token_seq_labels[-1] = -100
+                    else:
+                        left_index = accumulate_length[sent_i - 1] + 1
+                        if sample_token_seq_labels[-1] != -100:
+                            sample_token_seq_labels[-1] = -100
+
+                    if sent_right_index - 1 == sent_left_index or right_index == example_total_num_tokens:
+                        sample_sentences = example_sentences[
+                            sent_left_index:sent_right_index]
+                        sent_left_index = sent_right_index
+                        sent_i += 1
+                    else:
+                        sample_sentences = example_sentences[
+                            sent_left_index:sent_right_index - 1]
+                        sent_left_index = sent_right_index - 1
+
+                    while len(sample_input_ids) < self.max_seq_length:
+                        sample_input_ids.append(self.tokenizer.pad_token_id)
+                        sample_token_type_ids.append(0)
+                        sample_attention_mask.append(0)
+                        sample_segment_ids.append(example_total_num_sentences
+                                                  + 1)
+                        sample_token_seq_labels.append(-100)
+
+                    new_input_ids.append(sample_input_ids)
+                    new_token_type_ids.append(sample_token_type_ids)
+                    new_attention_mask.append(sample_attention_mask)
+                    new_segment_ids.append(sample_segment_ids)
+                    new_token_seq_labels.append(sample_token_seq_labels)
+                    new_example_ids.append(example_id)
+                    new_sentences.append(sample_sentences)
+                else:
+                    sent_i += 1
+                    continue
+
+        output_samples = {}
+
+        output_samples['input_ids'] = new_input_ids
+        output_samples['token_type_ids'] = new_token_type_ids
+        output_samples['attention_mask'] = new_attention_mask
+
+        output_samples['segment_ids'] = new_segment_ids
+        output_samples['example_id'] = new_example_ids
+        output_samples['labels'] = new_token_seq_labels
+        output_samples['sentences'] = new_sentences
+
+        return output_samples
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.fill_mask_ponet)
+class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase):
+    """Tokenizer preprocessor for PoNet MLM; also emits sentence segment ids.
+    """
+
+    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
+        kwargs.setdefault('truncation', True)
+        kwargs.setdefault('padding', 'max_length')
+        kwargs['max_length'] = kwargs.pop('sequence_length', 512)
+        kwargs.setdefault('return_token_type_ids', True)
+        super().__init__(model_dir, pair=False, mode=mode, **kwargs)
+
+        self.cfg = Config.from_file(
+            osp.join(model_dir, ModelFile.CONFIGURATION))
+        self.language = self.cfg.model.get('language', 'en')
+        if self.language == 'en':
+            from nltk.tokenize import sent_tokenize
+            import_external_nltk_data(
+                osp.join(model_dir, 'nltk_data'), 'tokenizers/punkt')
+        elif self.language in ['zh', 'cn']:
+
+            def sent_tokenize(para):
+                # Insert a newline after each sentence-ending mark, then split.
+                boundary_rules = (
+                    r'([。！!？\?])([^”’])',
+                    r'(\.{6})([^”’])',
+                    r'(\…{2})([^”’])',
+                    r'([。！？\?][”’])([^，。！？\?])',
+                )
+                for rule in boundary_rules:
+                    para = re.sub(rule, r'\1\n\2', para)
+                return [s for s in para.rstrip().split('\n') if s]
+        else:
+            raise NotImplementedError
+
+        self.sent_tokenize = sent_tokenize
+        self.max_length = kwargs['max_length']
+
+    def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]:
+        """Tokenize the input and, for single texts, add 'segment_ids'.
+
+        Args:
+            data: raw input handed to `parse_text_and_label`, e.g.
+                'you are so handsome.'
+        Returns:
+            Dict[str, Any]: the tokenizer output (lists become numpy
+                arrays); without a second text it also holds 'segment_ids',
+                numbering the tokens of each sentence from 1 (0 is the
+                first position) and padded to max_length.
+        """
+
+        text_a, text_b, labels = self.parse_text_and_label(data)
+        output = self.tokenizer(
+            text_a,
+            text_b,
+            return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None,
+            **self.tokenize_kwargs)
+        max_seq_length = self.max_length
+
+        if text_b is None:
+            sentence_batch = self.tokenizer(
+                self.sent_tokenize(text_a),
+                add_special_tokens=False,
+                truncation=True)
+            seg_lens = [len(ids) for ids in sentence_batch['input_ids']]
+            # Position 0 gets id 0; tokens of sentence k get id k.
+            segment_id = [0] + [
+                idx for idx, sl in enumerate(seg_lens, start=1)
+                for _ in range(sl)
+            ]
+            segment_id = segment_id[:max_seq_length - 1]
+            pad_count = max_seq_length - len(segment_id)
+            segment_id = segment_id + [segment_id[-1] + 1] * pad_count
+            output['segment_ids'] = [segment_id]
+
+        output = {
+            key: np.array(val) if isinstance(val, list) else val
+            for key, val in output.items()
+        }
+
+        self.labels_to_id(labels, output)
+        return output
diff --git a/modelscope/preprocessors/nlp/text_error_correction.py b/modelscope/preprocessors/nlp/text_error_correction.py
new file mode 100644
index 00000000..357a946f
--- /dev/null
+++ b/modelscope/preprocessors/nlp/text_error_correction.py
@@ -0,0 +1,50 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os.path as osp
+from typing import Any, Dict
+
+from modelscope.metainfo import Preprocessors
+from modelscope.preprocessors.base import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
+from modelscope.utils.constant import Fields
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.text_error_correction)
+class TextErrorCorrectionPreprocessor(Preprocessor):
+    """The preprocessor used in text correction task.
+    """
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """preprocess the data via the vocab file from the `model_dir` path
+
+        Args:
+            model_dir (str): model path
+        """
+        from fairseq.data import Dictionary
+        super().__init__(*args, **kwargs)
+        self.vocab = Dictionary.load(osp.join(model_dir, 'dict.src.txt'))
+
+    def __call__(self, data: str) -> Dict[str, Any]:
+        """process the raw input data
+
+        Args:
+            data (str): a sentence
+                Example:
+                    '随着中国经济突飞猛近，建造工业与日俱增'
+        Returns:
+            Dict[str, Any]: the preprocessed data
+            Example:
+            {'net_input':
+                {'src_tokens':tensor([1,2,3,4]),
+                'src_lengths': src_tokens.size()}
+            }
+        """
+        # encode_line gets the sentence as blank-separated single characters
+        text = ' '.join(data)
+        inputs = self.vocab.encode_line(
+            text, append_eos=True, add_if_not_exist=False)
+        lengths = inputs.size()
+        sample = dict()
+        sample['net_input'] = {'src_tokens': inputs, 'src_lengths': lengths}
+        return sample
diff --git a/modelscope/preprocessors/ofa/text_to_image_synthesis.py b/modelscope/preprocessors/ofa/text_to_image_synthesis.py
index 938f50de..e10de82c 100644
--- a/modelscope/preprocessors/ofa/text_to_image_synthesis.py
+++ b/modelscope/preprocessors/ofa/text_to_image_synthesis.py
@@ -19,7 +19,8 @@ class OfaTextToImageSynthesisPreprocessor(OfaBasePreprocessor):
         self.max_src_length = 64
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
-        source = data['text'].lower().strip().split()[:self.max_src_length]
+        source = ' '.join(
+            data['text'].lower().strip().split()[:self.max_src_length])
         source = 'what is the complete image? caption: {}'.format(source)
         inputs = self.get_inputs(source)
         sample = {
diff --git a/modelscope/preprocessors/space/__init__.py b/modelscope/preprocessors/space/__init__.py
index f216287b..b484dabe 100644
--- a/modelscope/preprocessors/space/__init__.py
+++ b/modelscope/preprocessors/space/__init__.py
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
 from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
+    from .data_loader import DataLoader
     from .dialog_intent_prediction_preprocessor import \
         DialogIntentPredictionPreprocessor
     from .dialog_modeling_preprocessor import DialogModelingPreprocessor
@@ -13,6 +14,7 @@ if TYPE_CHECKING:
 
 else:
     _import_structure = {
+        'data_loader': ['DataLoader'],
         'dialog_intent_prediction_preprocessor':
         ['DialogIntentPredictionPreprocessor'],
         'dialog_modeling_preprocessor': ['DialogModelingPreprocessor'],
diff --git a/modelscope/preprocessors/space/args.py b/modelscope/preprocessors/space/args.py
new file mode 100644
index 00000000..d9e91e74
--- /dev/null
+++ b/modelscope/preprocessors/space/args.py
@@ -0,0 +1,105 @@
+"""
+Parse argument.
+"""
+
+import argparse
+
+import json
+
+
+def str2bool(v):
+    """Convert a command-line string to a bool (for argparse ``type=``).
+
+    Matching is case-insensitive.
+
+    Raises:
+        argparse.ArgumentTypeError: if ``v`` is not one of the accepted
+            spellings listed below.
+    """
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Unsupported value encountered.')
+
+
+class HParams(dict):
+    """ Hyper-parameters class
+
+    Store hyper-parameters in training / infer / ... scripts.
+
+    Keys are also readable and writable as attributes. An attribute read
+    that misses at the top level falls back to the first nested HParams
+    value that contains the key.
+    """
+
+    def __getattr__(self, name):
+        # Only invoked when normal attribute lookup fails.
+        if name in self.keys():
+            return self[name]
+        # One-level search through nested HParams groups.
+        for v in self.values():
+            if isinstance(v, HParams):
+                if name in v:
+                    return v[name]
+        raise AttributeError(f"'HParams' object has no attribute '{name}'")
+
+    def __setattr__(self, name, value):
+        # Attribute assignment stores a dict item, not an instance attribute.
+        self[name] = value
+
+    def save(self, filename):
+        """Write the parameters to ``filename`` as indented UTF-8 JSON."""
+        with open(filename, 'w', encoding='utf-8') as fp:
+            json.dump(self, fp, ensure_ascii=False, indent=4, sort_keys=False)
+
+    def load(self, filename):
+        """Merge parameters from the JSON file ``filename`` into this object.
+
+        A dict value is merged into an existing dict entry with the same key;
+        if there is no such entry it is stored as a new HParams. Any other
+        value replaces the current entry.
+        """
+        with open(filename, 'r', encoding='utf-8') as fp:
+            params_dict = json.load(fp)
+        for k, v in params_dict.items():
+            if isinstance(v, dict):
+                if isinstance(self.get(k), dict):
+                    self[k].update(HParams(v))
+                else:
+                    # Nothing to merge into (indexing self[k] here used to
+                    # raise KeyError); store the loaded group directly.
+                    self[k] = HParams(v)
+            else:
+                self[k] = v
+
+
+def parse_args(parser):
+    """ Parse hyper-parameters from cmdline.
+
+    Args:
+        parser: an ``argparse.ArgumentParser``; its private
+            ``_action_groups`` / ``_group_actions`` lists are read directly.
+
+    Returns:
+        HParams: actions of the parser's second action group (all but the
+            first one) as top-level keys, plus one nested HParams per later
+            non-empty group, keyed by the group title.
+    """
+    parsed = parser.parse_args()
+    args = HParams()
+    # NOTE(review): group 1 is presumably argparse's default optional group
+    # and its first action the built-in help flag -- confirm.
+    optional_args = parser._action_groups[1]
+    for action in optional_args._group_actions[1:]:
+        arg_name = action.dest
+        args[arg_name] = getattr(parsed, arg_name)
+    for group in parser._action_groups[2:]:
+        group_args = HParams()
+        for action in group._group_actions:
+            arg_name = action.dest
+            group_args[arg_name] = getattr(parsed, arg_name)
+        if len(group_args) > 0:
+            args[group.title] = group_args
+    return args
diff --git a/modelscope/preprocessors/space/batch.py b/modelscope/preprocessors/space/batch.py
new file mode 100644
index 00000000..fe0ad0ec
--- /dev/null
+++ b/modelscope/preprocessors/space/batch.py
@@ -0,0 +1,60 @@
+def batch(reader, batch_size, drop_last=False):
+    """
+    This operator creates a batched reader which combines the data from the
+    input reader to batched data.
+
+    Args:
+        reader(callable): zero-argument callable returning an iterable of
+            instances; it is called once per pass over the batched reader.
+        batch_size(int): size of each mini-batch.
+        drop_last(bool, optional): If set to True, the last batch is dropped when
+            the size of last batch is not equal to batch_size, if set to False,
+            it will not. Default: False.
+    Returns:
+        The batched reader: a callable returning a generator of lists.
+
+    Return Type:
+        generator
+
+    Raises:
+        ValueError: if batch_size is not positive after conversion to int.
+
+    Examples:
+        .. code-block:: python
+
+            def reader():
+                for i in range(10):
+                    yield i
+            batch_reader = batch(reader, batch_size=2)
+
+            for data in batch_reader():
+                print(data)
+
+            # Output is
+            # [0, 1]
+            # [2, 3]
+            # [4, 5]
+            # [6, 7]
+            # [8, 9]
+    """
+
+    def batch_reader():
+        r = reader()
+        b = []
+        for instance in r:
+            b.append(instance)
+            if len(b) == batch_size:
+                yield b
+                b = []
+        # Truthiness test: falsy non-bool values such as 0 or None keep the
+        # trailing partial batch, exactly like False does.
+        if not drop_last and len(b) != 0:
+            yield b
+
+    # Batch size check
+    batch_size = int(batch_size)
+    if batch_size <= 0:
+        raise ValueError('batch_size should be a positive integeral value, '
+                         'but got batch_size={}'.format(batch_size))
+
+    return batch_reader
diff --git a/modelscope/preprocessors/space/data_loader.py b/modelscope/preprocessors/space/data_loader.py
new file mode 100644
index 00000000..bd04a79c
--- /dev/null
+++ b/modelscope/preprocessors/space/data_loader.py
@@ -0,0 +1,154 @@
+"""
+DataLoader class
+"""
+
+import math
+import os
+
+import numpy as np
+
+from modelscope.preprocessors.space.args import str2bool
+from modelscope.preprocessors.space.batch import batch
+from modelscope.preprocessors.space.lazy_dataset import LazyDataset
+from modelscope.preprocessors.space.sampler import (RandomSampler,
+                                                    SequentialSampler,
+                                                    SortedSampler)
+
+
+def get_data_loader(batch_size, reader, hparams, file, collate_fn, is_test):
+    """Build a DataLoader over the LazyDataset stored in ``file``.
+
+    Loader options are read from ``hparams.Trainer``. An assertion fails
+    if ``file`` does not exist.
+    """
+    assert os.path.exists(file), f"{file} doesn't exist"
+    dataset = LazyDataset(file, reader=reader)
+    data_loader = DataLoader(
+        dataset,
+        batch_size,
+        hparams.Trainer,
+        collate_fn=collate_fn,
+        is_test=is_test)
+    return data_loader
+
+
+def get_sequential_data_loader(batch_size, reader, hparams, data_paths,
+                               collate_fn, data_type):
+    """Build one DataLoader per directory and chain them in order.
+
+    Each directory in ``data_paths`` is expected to contain
+    ``{data_type}.{tokenizer_type}.jsonl``; every ``data_type`` other than
+    'train' is loaded in test mode.
+    """
+    data_loaders = []
+    for data_path in data_paths:
+        file = os.path.join(
+            data_path,
+            f'{data_type}.{hparams.BPETextField.tokenizer_type}.jsonl')
+        data_loaders.append(
+            get_data_loader(
+                batch_size=batch_size,
+                reader=reader,
+                hparams=hparams,
+                file=file,
+                collate_fn=collate_fn,
+                is_test=(data_type != 'train')))
+    data_loader = SequentialDataLoaderWrapper(data_loaders)
+    return data_loader
+
+
+class DataLoader(object):
+    """ Implement of DataLoader.
+
+    Iterating yields batches of dataset samples; index order comes from a
+    sampler and grouping from ``batch``.
+    """
+
+    @classmethod
+    def add_cmdline_argument(cls, group):
+        """Register the --shuffle and --sort_pool_size options on group."""
+        group.add_argument('--shuffle', type=str2bool, default=True)
+        group.add_argument('--sort_pool_size', type=int, default=0)
+        return group
+
+    def __init__(self,
+                 dataset,
+                 batch_size,
+                 hparams,
+                 collate_fn=None,
+                 sampler=None,
+                 is_test=False):
+        """
+        Args:
+            dataset: indexable collection with a length.
+            batch_size: number of samples per batch.
+            hparams: object exposing ``gpu``, ``sort_pool_size`` and
+                ``shuffle``.
+            collate_fn: callable turning a list of samples into a batch;
+                when None the list of samples is yielded unchanged.
+            sampler: iterable of dataset indices; when None a random
+                (shuffled training) or sequential sampler is created.
+            is_test: disables shuffling, sorted sampling and drop_last.
+        """
+        self.dataset = dataset
+        self.collate_fn = collate_fn
+        self.gpu = hparams.gpu
+        self.sort_pool_size = hparams.sort_pool_size
+
+        if sampler is None:
+            if hparams.shuffle and not is_test:
+                sampler = RandomSampler(dataset)
+            else:
+                sampler = SequentialSampler(dataset)
+
+        if self.sort_pool_size > 0 and not is_test:
+            sampler = SortedSampler(sampler, self.sort_pool_size)
+
+        def reader():
+            for idx in sampler:
+                yield idx
+
+        # The trailing partial batch is dropped only when gpu > 1 and training.
+        drop_last = False if self.gpu <= 1 or is_test else True
+        self.reader = batch(reader, batch_size=batch_size, drop_last=drop_last)
+        self.num_batches = math.floor(len(dataset) / batch_size) if drop_last \
+            else math.ceil(len(dataset) / batch_size)
+
+    def __len__(self):
+        return self.num_batches
+
+    def __iter__(self):
+        for batch_indices in self.reader():
+            samples = [self.dataset[idx] for idx in batch_indices]
+            # collate_fn defaults to None; calling it unconditionally would
+            # raise TypeError, so pass the raw sample list through instead.
+            if self.collate_fn is None:
+                yield samples
+            else:
+                yield self.collate_fn(samples)
+
+
+class SequentialDataLoaderWrapper:
+    """Iterate several data loaders one after another.
+
+    Each yielded item is a ``(data_file, batch)`` pair naming the dataset
+    file the batch came from.
+    """
+
+    def __init__(self, data_loaders):
+        self.data_loaders = data_loaders
+        self.data_file_to_dataset = {
+            data_loader.dataset.data_file: data_loader.dataset
+            for data_loader in self.data_loaders
+        }
+
+    def __iter__(self):
+        for data_loader in self.data_loaders:
+            for tmp_batch in data_loader:
+                yield data_loader.dataset.data_file, tmp_batch
+
+    def __len__(self):
+        # np.sum([]) is the float 0.0, which len() rejects; convert so an
+        # empty wrapper reports 0 and the result is always a built-in int.
+        lengths = [len(data_loader) for data_loader in self.data_loaders]
+        return int(np.sum(lengths))
diff --git a/modelscope/preprocessors/space/dialog_modeling_preprocessor.py b/modelscope/preprocessors/space/dialog_modeling_preprocessor.py
index a2157c2b..c461ade1 100644
--- a/modelscope/preprocessors/space/dialog_modeling_preprocessor.py
+++ b/modelscope/preprocessors/space/dialog_modeling_preprocessor.py
@@ -35,7 +35,7 @@ class DialogModelingPreprocessor(Preprocessor):
         self.config.use_gpu = self.config.use_gpu and torch.cuda.is_available()
 
         self.text_field = MultiWOZBPETextField(
-            self.model_dir, config=self.config)
+            config=self.config, model_dir=self.model_dir)
 
     @type_assert(object, Dict)
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
diff --git a/modelscope/preprocessors/space/fields/gen_field.py b/modelscope/preprocessors/space/fields/gen_field.py
index 5bff360f..32346bd5 100644
--- a/modelscope/preprocessors/space/fields/gen_field.py
+++ b/modelscope/preprocessors/space/fields/gen_field.py
@@ -2,9 +2,11 @@
 
 import os
 import random
+from asyncio import constants
 from collections import OrderedDict
 from itertools import chain
 
+import json
 import numpy as np
 
 from modelscope.preprocessors.space.tokenizer import Tokenizer
@@ -117,7 +119,8 @@ class BPETextField(object):
         return self.tokenizer.convert_tokens_to_ids([self.eos_d_token])[0]
 
     def __init__(self, config):
-        self.gpu = 0
+        self.train, self.dev, self.test = [], [], []
+        self.gpu = config.Trainer.gpu
         self.tokenizer = None
         self.vocab = None
         self.db = None
@@ -249,13 +252,9 @@ class BPETextField(object):
         for dial in data:
             batch.append(dial)
             if len(batch) == self.batch_size:
-                # print('batch size: %d, batch num +1'%(len(batch)))
                 all_batches.append(batch)
                 batch = []
-        # if remainder > 1/2 batch_size, just put them in the previous batch, otherwise form a new batch
-        # print('last batch size: %d, batch num +1'%(len(batch)))
-        # if (len(batch) % len(cfg.cuda_device)) != 0:
-        #     batch = batch[:-(len(batch) % len(cfg.cuda_device))]
+
         # TODO deal with deleted data
         if self.gpu <= 1:
             if len(batch) > 0.5 * self.batch_size:
@@ -308,7 +307,7 @@ class BPETextField(object):
 
 class MultiWOZBPETextField(BPETextField):
 
-    def __init__(self, model_dir, config):
+    def __init__(self, config, **kwargs):
         super(MultiWOZBPETextField, self).__init__(config)
 
         import spacy
@@ -327,8 +326,12 @@ class MultiWOZBPETextField(BPETextField):
                 )
         self.nlp = spacy.load('en_core_web_sm')
 
+        if config.do_train:
+            db_dir = kwargs['data_dir']
+        else:
+            db_dir = kwargs['model_dir']
         self.db = MultiWozDB(
-            model_dir, {
+            db_dir, {
                 'attraction': 'db/attraction_db_processed.json',
                 'hospital': 'db/hospital_db_processed.json',
                 'hotel': 'db/hotel_db_processed.json',
@@ -337,14 +340,14 @@ class MultiWOZBPETextField(BPETextField):
                 'taxi': 'db/taxi_db_processed.json',
                 'train': 'db/train_db_processed.json',
             })
-        self._build_vocab(model_dir)
+        self._build_vocab(db_dir)
 
         special_tokens = [
             self.pad_token, self.bos_token, self.eos_token, self.unk_token
         ]
         special_tokens.extend(self.add_sepcial_tokens())
         self.tokenizer = Tokenizer(
-            vocab_path=os.path.join(model_dir, ModelFile.VOCAB_FILE),
+            vocab_path=os.path.join(kwargs['model_dir'], ModelFile.VOCAB_FILE),
             special_tokens=special_tokens,
             tokenizer_type=config.BPETextField.tokenizer_type)
         self.understand_ids = self.tokenizer.convert_tokens_to_ids(
@@ -352,6 +355,26 @@ class MultiWOZBPETextField(BPETextField):
         self.policy_ids = self.tokenizer.convert_tokens_to_ids(
             self.policy_tokens)
 
+        if config.do_train:
+            test_list = [
+                line.strip().lower() for line in open(
+                    os.path.join(kwargs['data_dir'], 'testListFile.json'),
+                    'r').readlines()
+            ]
+            dev_list = [
+                line.strip().lower() for line in open(
+                    os.path.join(kwargs['data_dir'], 'valListFile.json'),
+                    'r').readlines()
+            ]
+
+            self.dev_files, self.test_files = {}, {}
+            for fn in test_list:
+                self.test_files[fn.replace('.json', '')] = 1
+            for fn in dev_list:
+                self.dev_files[fn.replace('.json', '')] = 1
+
+            self._load_data(kwargs['data_dir'])
+
         return
 
     def get_ids(self, data: str):
@@ -414,7 +437,6 @@ class MultiWOZBPETextField(BPETextField):
         name_to_set = {'train': self.train, 'test': self.test, 'dev': self.dev}
         dial = name_to_set[set_name]
         turn_bucket = self._bucket_by_turn(dial)
-        # self._shuffle_turn_bucket(turn_bucket)
         all_batches = []
 
         if set_name not in self.set_stats:
@@ -433,19 +455,13 @@ class MultiWOZBPETextField(BPETextField):
             except Exception:
                 log_str += 'turn num:%d, dial num: %d, batch num: %d last batch len: %d\n' % (
                     k, len(turn_bucket[k]), len(batches), 0.0)
-            # print("turn num:%d, dial num:v%d, batch num: %d, "%(k, len(turn_bucket[k]), len(batches)))
+
             num_training_steps += k * len(batches)
             num_turns += k * len(turn_bucket[k])
             num_dials += len(turn_bucket[k])
             all_batches += batches
         log_str += 'total batch num: %d\n' % len(all_batches)
-        # print('total batch num: %d'%len(all_batches))
-        # print('dialog count: %d'%dia_count)
-        # return all_batches
 
-        # log stats
-        # logging.info(log_str)
-        # cfg.num_training_steps = num_training_steps * cfg.epoch_num
         self.set_stats[set_name][
             'num_training_steps_per_epoch'] = num_training_steps  # turn-level steps
         self.set_stats[set_name]['num_turns'] = num_turns
@@ -484,6 +500,81 @@ class MultiWOZBPETextField(BPETextField):
         self.vocab.load_vocab(vp)
         return self.vocab.vocab_size
 
+    def _load_data(self, data_dir, save_temp=True):
+        """
+        load processed data and encode, or load already encoded data
+
+        Fills self.train / self.dev / self.test (plus self.data when the
+        encoded cache is reused) and shuffles self.train with a fixed seed.
+
+        Args:
+            data_dir: directory containing 'data_for_damd.json' and, when
+                save_temp is True, the encoded cache file.
+            save_temp: if True, reuse the encoded cache when present and
+                write it otherwise; if False, always encode from scratch.
+        """
+
+        def read_json(path, lowercase=False):
+            # Read the whole file inside `with` so the handle is closed
+            # promptly instead of being left to the garbage collector.
+            with open(path, 'r', encoding='utf-8') as f:
+                text = f.read()
+            return json.loads(text.lower() if lowercase else text)
+
+        def load_data_from_resource(data_resource):
+            data = read_json(
+                os.path.join(data_dir, data_resource), lowercase=True)
+            train, dev, test = [], [], []
+            for fn, dial in data.items():
+                if '.json' in fn:
+                    fn = fn.replace('.json', '')
+                if self.dev_files.get(fn):
+                    dev.append(self._get_encoded_data(fn, dial))
+                elif self.test_files.get(fn):
+                    test.append(self._get_encoded_data(fn, dial))
+                else:
+                    train.append(self._get_encoded_data(fn, dial))
+            return train, dev, test
+
+        data_processed = 'new_db_se_blank_encoded_domain.data.json'
+        data_resource = 'data_for_damd.json'
+        if save_temp:  # save encoded data
+            # encoded: no sos, se_encoded: sos and eos
+            encoded_file = os.path.join(data_dir, data_processed)
+
+            if os.path.exists(encoded_file):
+                logger.info(
+                    'Reading encoded data from {}'.format(encoded_file))
+                self.data = read_json(
+                    os.path.join(data_dir, data_resource), lowercase=True)
+                encoded_data = read_json(encoded_file)
+                self.train = encoded_data['train']
+                self.dev = encoded_data['dev']
+                self.test = encoded_data['test']
+            else:
+                logger.info(
+                    'Encoding data now and save the encoded data in {}'.format(
+                        encoded_file))
+                # not exists, encode data and save
+                self.train, self.dev, self.test = load_data_from_resource(
+                    data_resource)
+                # save encoded data
+                encoded_data = {
+                    'train': self.train,
+                    'dev': self.dev,
+                    'test': self.test
+                }
+                with open(encoded_file, 'w') as f:
+                    json.dump(encoded_data, f, indent=2)
+        else:  # directly read processed data and encode
+            self.train, self.dev, self.test = load_data_from_resource(
+                data_resource)
+
+        random.seed(10)
+        random.shuffle(self.train)
+        logger.info('train size:{}, dev size:{}, test size:{}'.format(
+            len(self.train), len(self.dev), len(self.test)))
+
     def _get_convert_str(self, sent):
         assert isinstance(sent, str)
         return ' '.join([
@@ -491,14 +572,65 @@ class MultiWOZBPETextField(BPETextField):
             for tok in sent.split()
         ])
 
+    def _get_encoded_data(self, fn, dial):  # encode one dialog, turn by turn
+        encoded_dial = []  # one dict of id lists per turn
+        for idx, t in enumerate(dial['log']):  # tokenize to list of ids
+            enc = {}
+            enc['dial_id'] = fn
+
+            enc_info_list = [  # (output key, start id, turn key, end id)
+                ('user', self.sos_u_id, 'user', self.eos_u_id),
+                ('usdx', self.sos_u_id, 'user', self.eos_u_id),  # NOTE(review): same source as 'user' -- confirm
+                ('resp', self.sos_r_id, 'resp', self.eos_r_id),
+                ('bspn', self.sos_b_id, 'constraint', self.eos_b_id),
+                ('bsdx', self.sos_b_id, 'cons_delex', self.eos_b_id),
+                ('aspn', self.sos_a_id, 'sys_act', self.eos_a_id)
+            ]
+            for enc_key, start_token, item_key, end_token in enc_info_list:
+                enc[enc_key] = [
+                    start_token
+                ] + self.tokenizer.convert_tokens_to_ids(
+                    self.tokenizer.tokenize(
+                        self._get_convert_str(t[item_key]))) + [end_token]
+
+            enc['turn_num'] = t['turn_num']
+
+            if idx > 0 and t['turn_domain'] == '[general]':
+                enc['dspn'] = encoded_dial[idx - 1]['dspn']  # reuse previous turn
+                enc['pointer'] = encoded_dial[idx - 1]['pointer'][:4] + [
+                    int(i) for i in t['pointer'].split(',')
+                ][-2:]  # previous turn's first 4 values + this turn's last 2
+                enc['turn_domain'] = encoded_dial[idx - 1]['turn_domain']
+                enc['db'] = encoded_dial[idx - 1]['db']
+            else:
+                if t['turn_domain'] == '[general]':
+                    assert not t['constraint'], f'{fn}-{idx}'  # message: dialog id and turn index
+                enc['dspn'] = [
+                    self.sos_d_id
+                ] + self.tokenizer.convert_tokens_to_ids(
+                    self.tokenizer.tokenize(
+                        self._get_convert_str(
+                            t['turn_domain']))) + [self.eos_d_id]
+                enc['pointer'] = [int(i) for i in t['pointer'].split(',')]
+                enc['turn_domain'] = t['turn_domain'].split()
+                db_pointer = self.bspan_to_DBpointer(t['constraint'],
+                                                     t['turn_domain'].split())
+                enc['db'] = [
+                    self.sos_db_id
+                ] + self.tokenizer.convert_tokens_to_ids(
+                    self.tokenizer.tokenize(
+                        self._get_convert_str(db_pointer))) + [self.eos_db_id]
+
+            encoded_dial.append(enc)
+        return encoded_dial  # list of per-turn dicts, in turn order
+
     def bspan_to_DBpointer(self, bspan, turn_domain):
         constraint_dict = self.bspan_to_constraint_dict(bspan)
-        # print(constraint_dict)
         matnums = self.db.get_match_num(constraint_dict)
         match_dom = turn_domain[0] if len(turn_domain) == 1 else turn_domain[1]
         match_dom = match_dom[1:-1] if match_dom.startswith('[') else match_dom
         match = matnums[match_dom]
-        # vector = self.db.addDBPointer(match_dom, match)
+
         vector = self.db.addDBIndicator(match_dom, match)
         return vector
 
@@ -691,3 +823,84 @@ class MultiWOZBPETextField(BPETextField):
                 inputs['labels'] = [context]  # use previous turn
 
         return inputs, prompt_id
+
+    def restore(self, resp, domain, constraint_dict, mat_ents):
+        """Replace delexicalized ``[value_*]`` placeholders in ``resp``.
+
+        Args:
+            resp: response string containing placeholders.
+            domain: sequence of domain names; its last item selects the
+                matched entity used for slot filling.
+            constraint_dict: maps a domain to its slot/value constraints.
+            mat_ents: maps a domain to its list of matched entities.
+
+        Returns:
+            The response with placeholders substituted; reference, car,
+            phone, postcode and address use fixed dummy values.
+        """
+        restored = resp
+
+        restored = restored.replace('[value_reference]', '53022')
+        restored = restored.replace('[value_car]', 'BMW')
+
+        for d in domain:
+            constraint = constraint_dict.get(d, None)
+            if constraint:
+                replace_res_list = [('stay', '[value_stay]'),
+                                    ('day', '[value_day]'),
+                                    ('people', '[value_people]'),
+                                    ('time', '[value_time]'),
+                                    ('type', '[value_type]')]
+                for key, value_key in replace_res_list:
+                    if key in constraint:
+                        restored = restored.replace(value_key, constraint[key])
+
+                if d in mat_ents and len(mat_ents[d]) == 0:
+                    for s in constraint:
+                        if s == 'pricerange' and d in [
+                                'hotel', 'restaurant'
+                        ] and 'price]' in restored:
+                            restored = restored.replace(
+                                '[value_price]', constraint['pricerange'])
+                        if s + ']' in restored:
+                            restored = restored.replace(
+                                '[value_%s]' % s, constraint[s])
+
+            if '[value_choice' in restored and mat_ents.get(d):
+                restored = restored.replace('[value_choice]',
+                                            str(len(mat_ents[d])))
+        if '[value_choice' in restored:
+            restored = restored.replace('[value_choice]', '3')
+
+        try:
+            ent = mat_ents.get(domain[-1], [])
+            if ent:
+                ent = ent[0]
+
+                for t in restored.split():
+                    if '[value' in t:
+                        slot = t[7:-1]  # drop leading '[value_' and trailing ']'
+                        if ent.get(slot):
+                            if domain[-1] == 'hotel' and slot == 'price':
+                                slot = 'pricerange'
+                            restored = restored.replace(t, ent[slot])
+                        elif slot == 'price':
+                            if ent.get('pricerange'):
+                                restored = restored.replace(
+                                    t, ent['pricerange'])
+                            else:
+                                # Use explicit placeholders: extra positional
+                                # args without them break log formatting.
+                                logger.info('%s %s', restored, domain)
+        except Exception:
+            logger.error(resp)
+            logger.error(restored)
+            # NOTE(review): terminates the whole process from library code
+            # (and relies on the `site` builtin) -- consider raising instead.
+            quit()
+
+        restored = restored.replace('[value_phone]', '62781111')
+        restored = restored.replace('[value_postcode]', 'CG9566')
+        restored = restored.replace('[value_address]', 'Parkside, Cambridge')
+
+        return restored
diff --git a/modelscope/preprocessors/space/fields/intent_field.py b/modelscope/preprocessors/space/fields/intent_field.py
index dc00e677..6d3b5fff 100644
--- a/modelscope/preprocessors/space/fields/intent_field.py
+++ b/modelscope/preprocessors/space/fields/intent_field.py
@@ -791,7 +791,6 @@ class BPETextField(object):
                                 user_or_sys = [self.sos_r_id]
                             tmp = [self.sos_u_id
                                    ] + self.numericalize(s) + user_or_sys
-                            tmp = tmp + self.numericalize(s) + [self.eos_r_id]
                             new_src.append(tmp)
 
                         src_span_mask = [[0] + list(map(int, s)) + [0]
diff --git a/modelscope/preprocessors/space/lazy_dataset.py b/modelscope/preprocessors/space/lazy_dataset.py
new file mode 100644
index 00000000..8da21db7
--- /dev/null
+++ b/modelscope/preprocessors/space/lazy_dataset.py
@@ -0,0 +1,65 @@
+"""
+Dataset class
+"""
+
+import json
+
+from modelscope.preprocessors.space.args import str2bool
+
+
+class LazyDataset(object):
+    """
+    Lazy load dataset from disk.
+
+    Each line of data file is a preprocessed example.
+
+    The constructor records the offset of every line and keeps one read
+    handle open; call close() (or use the instance as a context manager)
+    to release it.
+    """
+
+    def __init__(self, data_file, reader, transform=lambda s: json.loads(s)):
+        """
+        Initialize lazy dataset.
+
+        By default, loading .jsonl format.
+
+        :param data_file
+        :type str
+
+        :param reader: object exposing ``with_mlm`` and
+            ``create_token_masked_lm_predictions``
+
+        :param transform
+        :type callable
+        """
+        self.data_file = data_file
+        self.transform = transform
+        self.reader = reader
+        self.offsets = [0]
+        with open(data_file, 'r', encoding='utf-8') as fp:
+            while fp.readline() != '':
+                self.offsets.append(fp.tell())
+        # The last recorded offset is end-of-file, not the start of a line.
+        self.offsets.pop()
+        self.fp = open(data_file, 'r', encoding='utf-8')
+
+    def __len__(self):
+        return len(self.offsets)
+
+    def __getitem__(self, idx):
+        self.fp.seek(self.offsets[idx], 0)
+        sample = self.transform(self.fp.readline().strip())
+        if self.reader.with_mlm:
+            sample = self.reader.create_token_masked_lm_predictions(sample)
+        return sample
+
+    def close(self):
+        """Close the read handle; safe to call more than once."""
+        self.fp.close()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
diff --git a/modelscope/preprocessors/space/preprocess.py b/modelscope/preprocessors/space/preprocess.py
new file mode 100644
index 00000000..bd8d64d1
--- /dev/null
+++ b/modelscope/preprocessors/space/preprocess.py
@@ -0,0 +1,65 @@
+"""
+Preprocess script.
+"""
+
+import glob
+import os
+
+from modelscope.preprocessors.space.args import parse_args
+from modelscope.preprocessors.space.fields.intent_field import \
+    IntentBPETextField
+
+FILE_NAME = 'train.json'
+
+
+def intent_preprocess(path, cfg):
+    """Create missing example files and score matrices under the data dir.
+
+    For every directory below ``cfg.Dataset.data_dir`` that holds a
+    ``train.json`` (after ``filter_data_path``), and for each of the
+    train / valid / test splits, the tokenized ``.jsonl`` file is built if
+    its ``.json`` input exists and the output does not. A ``.Score.npy``
+    file is then built for 'AnPreDial' paths unless it already exists or
+    ``dynamic_score`` is set.
+    """
+    field = IntentBPETextField(path, cfg)
+    dataset_cfg = cfg.Dataset
+    if dataset_cfg.trigger_role == 'system':
+        make_examples = field.build_examples_multi_turn
+    else:
+        make_examples = field.build_examples_single_turn
+    score_single = field.build_score_matrix
+    score_multi = field.build_score_matrix_multiprocessing
+    pattern = dataset_cfg.data_dir + '/**/' + FILE_NAME
+    candidates = [
+        os.path.dirname(hit)
+        for hit in sorted(glob.glob(pattern, recursive=True))
+    ]
+    data_paths = field.filter_data_path(data_paths=candidates)
+
+    for mode in ('train', 'valid', 'test'):
+        for data_path in data_paths:
+            input_file = os.path.join(data_path, f'{mode}.json')
+            output_file = os.path.join(
+                data_path, f'{mode}.{field.tokenizer_type}.jsonl')
+            output_score_file = os.path.join(data_path, f'{mode}.Score.npy')
+
+            # Step 1: tokenize the raw split unless already done.
+            if os.path.exists(input_file) and not os.path.exists(output_file):
+                examples = make_examples(input_file, data_type=mode)
+                if not examples:
+                    continue
+                field.save_examples(examples, output_file)
+
+            # Step 2: precompute the score matrix where applicable.
+            needs_scores = (
+                os.path.exists(output_file) and
+                not os.path.exists(output_score_file) and
+                not dataset_cfg.dynamic_score and
+                'AnPreDial' in data_path)
+            if not needs_scores:
+                continue
+            examples = field.load_examples(output_file)
+            score_fn = score_multi if dataset_cfg.num_process >= 2 \
+                else score_single
+            field.save_examples(score_fn(examples), output_score_file)
diff --git a/modelscope/preprocessors/space/sampler.py b/modelscope/preprocessors/space/sampler.py
new file mode 100644
index 00000000..49a216d1
--- /dev/null
+++ b/modelscope/preprocessors/space/sampler.py
@@ -0,0 +1,86 @@
+"""
+Sampler class.
+"""
+
+import numpy as np
+
+
+class Sampler(object):
+    """Abstract index sampler; subclasses supply __len__ and __iter__."""
+
+    def __init__(self):
+        pass
+
+    def __len__(self):
+        raise NotImplementedError
+
+    def __iter__(self):
+        raise NotImplementedError
+
+
+class SequentialSampler(Sampler):
+    """Yield the indices 0 .. len(dataset) - 1 in order."""
+
+    def __init__(self, dataset):
+        self.dataset = dataset
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def __iter__(self):
+        total = len(self)
+        return iter(range(total))
+
+
+class RandomSampler(Sampler):
+    """Yield a permutation of the dataset indices.
+
+    The global NumPy RNG is re-seeded with an epoch counter on every
+    iteration, so for a given length the n-th pass repeats the same order.
+    """
+
+    def __init__(self, dataset):
+        self.dataset = dataset
+        self.epoch = 0
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def __iter__(self):
+        seed = self.epoch
+        np.random.seed(seed)
+        self.epoch = seed + 1
+        return iter(np.random.permutation(len(self)))
+
+
+class SortedSampler(Sampler):
+    """ Sorted Sampler.
+    Sort each block of examples by key.
+
+    Indices are drawn from the wrapped sampler in pools of
+    ``sort_pool_size`` and each pool is emitted in ascending order of
+    ``len(dataset[idx][key])``.
+    """
+
+    def __init__(self, sampler, sort_pool_size, key='src'):
+        self.sampler = sampler
+        self.sort_pool_size = sort_pool_size
+
+        def sample_length(idx):
+            return len(self.sampler.dataset[idx][key])
+
+        self.key = sample_length
+
+    def __len__(self):
+        return len(self.sampler)
+
+    def __iter__(self):
+        pending = []
+        for idx in self.sampler:
+            pending.append(idx)
+            if len(pending) == self.sort_pool_size:
+                yield from sorted(pending, key=self.key)
+                pending = []
+        # Flush the final, possibly shorter, pool.
+        if pending:
+            yield from sorted(pending, key=self.key)
diff --git a/modelscope/preprocessors/star/__init__.py b/modelscope/preprocessors/star/__init__.py
index 5a4bcea9..cef8f074 100644
--- a/modelscope/preprocessors/star/__init__.py
+++ b/modelscope/preprocessors/star/__init__.py
@@ -6,7 +6,8 @@ from modelscope.utils.import_utils import LazyImportModule
 if TYPE_CHECKING:
     from .conversational_text_to_sql_preprocessor import \
         ConversationalTextToSqlPreprocessor
-    from .fields import MultiWOZBPETextField, IntentBPETextField
+    from .fields import (get_label, SubPreprocessor, preprocess_dataset,
+                         process_dataset)
 
 else:
     _import_structure = {
diff --git a/modelscope/preprocessors/star/fields/__init__.py b/modelscope/preprocessors/star/fields/__init__.py
index 1e95a998..7049c43b 100644
--- a/modelscope/preprocessors/star/fields/__init__.py
+++ b/modelscope/preprocessors/star/fields/__init__.py
@@ -1,6 +1,30 @@
-from modelscope.preprocessors.star.fields.common_utils import SubPreprocessor
-from modelscope.preprocessors.star.fields.parse import get_label
-from modelscope.preprocessors.star.fields.preprocess_dataset import \
-    preprocess_dataset
-from modelscope.preprocessors.star.fields.process_dataset import \
-    process_dataset
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:  # eager imports, seen by static type checkers only
+    from .common_utils import SubPreprocessor
+    from .parse import get_label
+    from .preprocess_dataset import \
+        preprocess_dataset
+    from .process_dataset import \
+        process_dataset, process_tables
+
+else:
+    _import_structure = {  # submodule name -> names it provides
+        'common_utils': ['SubPreprocessor'],
+        'parse': ['get_label'],
+        'preprocess_dataset': ['preprocess_dataset'],
+        'process_dataset': ['process_dataset', 'process_tables'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(  # swap in the lazy module object
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )  # NOTE(review): presumably defers submodule imports until first use -- confirm
diff --git a/modelscope/preprocessors/star3/__init__.py b/modelscope/preprocessors/star3/__init__.py
new file mode 100644
index 00000000..9aa562d7
--- /dev/null
+++ b/modelscope/preprocessors/star3/__init__.py
@@ -0,0 +1,24 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:  # imports for static type checkers only
+    from .table_question_answering_preprocessor import \
+        TableQuestionAnsweringPreprocessor
+
+else:
+    _import_structure = {  # submodule name -> names it exports
+        'table_question_answering_preprocessor':
+        ['TableQuestionAnsweringPreprocessor'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/preprocessors/star3/fields/__init__.py b/modelscope/preprocessors/star3/fields/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/preprocessors/star3/fields/database.py b/modelscope/preprocessors/star3/fields/database.py
new file mode 100644
index 00000000..a99800cf
--- /dev/null
+++ b/modelscope/preprocessors/star3/fields/database.py
@@ -0,0 +1,77 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import json
+import tqdm
+
+from modelscope.preprocessors.star3.fields.struct import Trie
+
+
+class Database:
+
+    def __init__(self, tokenizer, table_file_path, syn_dict_file_path):
+        self.tokenizer = tokenizer
+        self.tables = self.init_tables(table_file_path=table_file_path)
+        self.syn_dict = self.init_syn_dict(
+            syn_dict_file_path=syn_dict_file_path)
+
+    def init_tables(self, table_file_path):
+        """Load tables from a JSON-lines file, keyed by ``table_id``.
+
+        Adds ``header_tok``, ``tablelen`` and ``value_trie`` to each table.
+        """
+        tables = {}
+        # Decode as UTF-8 explicitly, like init_syn_dict: the platform default
+        # encoding cannot be relied on for table text that may be non-ASCII.
+        with open(table_file_path, 'r', encoding='utf-8') as fo:
+            lines = fo.readlines()
+
+        for line in tqdm.tqdm(lines, desc='Load Tables'):
+            table = json.loads(line.strip())
+
+            headers_tokens = [
+                self.tokenizer.tokenize(header)
+                for header in table['header_name']
+            ]
+            headers_tokens.append(self.tokenizer.tokenize('空列'))
+            table['tablelen'] = sum(len(toks) for toks in headers_tokens)
+            table['header_tok'] = headers_tokens
+
+            table['header_types'].append('null')
+            table['header_units'] = [
+                self.tokenizer.tokenize(unit) for unit in table['header_units']
+            ] + [[]]
+
+            trie_set = [Trie() for _ in table['header_name']]
+            for row in table['rows']:
+                for ii, cell in enumerate(row):
+                    col_type = table['header_types'][ii].lower()
+                    if 'real' in col_type or 'number' in col_type \
+                            or 'duration' in col_type:
+                        continue
+                    word = str(cell).strip().lower()
+                    trie_set[ii].insert(word, word)
+
+            table['value_trie'] = trie_set
+            tables[table['table_id']] = table
+
+        return tables
+
+    def init_syn_dict(self, syn_dict_file_path):
+        lines = []
+        with open(syn_dict_file_path, encoding='utf-8') as fo:
+            for line in fo:
+                lines.append(line)
+
+        syn_dict = {}
+        for line in tqdm.tqdm(lines, desc='Load Synonym Dict'):
+            tokens = line.strip().split('\t')
+            if len(tokens) != 2:
+                continue
+            keys = tokens[0].strip().split('|')
+            values = tokens[1].strip().split('|')
+            for key in keys:
+                key = key.lower().strip()
+                syn_dict.setdefault(key, [])
+                for value in values:
+                    syn_dict[key].append(value.lower().strip())
+
+        return syn_dict
diff --git a/modelscope/preprocessors/star3/fields/schema_link.py b/modelscope/preprocessors/star3/fields/schema_link.py
new file mode 100644
index 00000000..40613f78
--- /dev/null
+++ b/modelscope/preprocessors/star3/fields/schema_link.py
@@ -0,0 +1,423 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import re
+
+from modelscope.preprocessors.star3.fields.struct import TypeInfo
+
+
+class SchemaLinker:
+
+    def __init__(self):
+        pass
+
+    def find_in_list(self, comlist, words):
+        """Tell whether ``words`` occurs inside any entry of ``comlist``.
+
+        Containment uses ``in``, so for string entries this is a substring
+        test; the scan stops at the first hit.
+        """
+        return any(words in com for com in comlist)
+
+    def get_continue_score(self, pstr, tstr):
+        comlist = []
+        minlen = min(len(pstr), len(tstr))
+        for slen in range(minlen, 1, -1):
+            for ts in range(0, len(tstr), 1):
+                if ts + slen > len(tstr):
+                    continue
+                words = tstr[ts:ts + slen]
+                if words in pstr and not self.find_in_list(comlist, words):
+                    comlist.append(words)
+
+        comlen = 0
+        for com in comlist:
+            comlen += len(com) * len(com)
+        weight = comlen / (len(tstr) * len(tstr) + 0.001)
+        if weight > 1.0:
+            weight = 1.0
+
+        return weight
+
+    def get_match_score(self, ptokens, ttokens):
+        pset = set(ptokens)
+        tset = set(ttokens)
+        comset = pset & tset
+        allset = pset | tset
+        weight2 = len(comset) / (len(allset) + 0.001)
+        weight3 = self.get_continue_score(''.join(ptokens), ''.join(ttokens))
+        return 0.4 * weight2 + 0.6 * weight3
+
+    def is_number(self, s):
+        try:
+            float(s)
+            return True
+        except ValueError:
+            pass
+
+        try:
+            import unicodedata
+            unicodedata.numeric(s)
+            return True
+        except (TypeError, ValueError):
+            pass
+
+        return False
+
+    def get_match_phrase(self, query, target):
+        if target in query:
+            return target, 1.0
+
+        qtokens = []
+        for i in range(0, len(query), 1):
+            qtokens.append(query[i:i + 1])
+        ttokens = []
+        for i in range(0, len(target), 1):
+            ttokens.append(target[i:i + 1])
+        ttok_set = set(ttokens)
+
+        phrase = ''
+        score = 0.0
+        for qidx, qword in enumerate(qtokens):
+            if qword not in ttok_set:
+                continue
+
+            eidx = (qidx + 2 * len(ttokens)) if (
+                len(qtokens) > qidx + 2 * len(ttokens)) else len(qtokens)
+            while eidx > qidx:
+                ptokens = qtokens[qidx:eidx]
+                weight = self.get_match_score(ptokens, ttokens)
+                if weight + 0.001 > score:
+                    score = weight
+                    phrase = ''.join(ptokens)
+                eidx -= 1
+
+        if self.is_number(target) and phrase != target:
+            score = 0.0
+        if len(phrase) > 1 and phrase in target:
+            score *= (1.0 + 0.05 * len(phrase))
+
+        return phrase, score
+
+    def allfindpairidx(self, que_tok, value_tok, weight):
+        idxs = []
+        for i in range(0, len(que_tok) - len(value_tok) + 1, 1):
+            s = i
+            e = i
+            matched = True
+            for j in range(0, len(value_tok), 1):
+                if value_tok[j].lower() == que_tok[i + j].lower():
+                    e = i + j
+                else:
+                    matched = False
+                    break
+            if matched:
+                idxs.append([s, e, weight])
+
+        return idxs
+
+    def findnear(self, ps1, pe1, ps2, pe2):
+        """True if one span starts within 2 positions of the other's end."""
+        gap = min(abs(ps1 - pe2), abs(pe1 - ps2))
+        return gap <= 2
+
+    def get_column_type(self, col_idx, table):
+        colType = table['header_types'][col_idx]
+        if 'number' in colType or 'duration' in colType or 'real' in colType:
+            colType = 'real'
+        elif 'date' in colType:
+            colType = 'date'
+        elif 'bool' in colType:
+            colType = 'bool'
+        else:
+            colType = 'text'
+
+        return colType
+
+    def add_type_all(self, typeinfos, index, idxs, label, linktype, value,
+                     orgvalue):
+        for idx in idxs:
+            info = TypeInfo(label, index, linktype, value, orgvalue, idx[0],
+                            idx[1], idx[2])
+            flag = True
+            for i, typeinfo in enumerate(typeinfos):
+                if info.pstart < typeinfo.pstart:
+                    typeinfos.insert(i, info)
+                    flag = False
+                    break
+
+            if flag:
+                typeinfos.append(info)
+
+        return typeinfos
+
+    def save_info(self, tinfo, sinfo):
+        flag = True
+        if tinfo.pstart > sinfo.pend or tinfo.pend < sinfo.pstart:
+            pass
+        elif tinfo.pstart >= sinfo.pstart and \
+                tinfo.pend <= sinfo.pend and tinfo.index == -1:
+            flag = False
+        elif tinfo.pstart == sinfo.pstart and sinfo.pend == tinfo.pend and \
+                abs(tinfo.weight - sinfo.weight) < 0.01:
+            pass
+        else:
+            if sinfo.label == 'col' or sinfo.label == 'val':
+                if tinfo.label == 'col' or tinfo.label == 'val':
+                    if (sinfo.pend
+                            - sinfo.pstart) > (tinfo.pend - tinfo.pstart) or (
+                                sinfo.weight > tinfo.weight
+                                and sinfo.index != -1):
+                        flag = False
+                else:
+                    flag = False
+            else:
+                if (tinfo.label == 'op' or tinfo.label == 'agg'):
+                    if (sinfo.pend - sinfo.pstart) > (
+                            tinfo.pend
+                            - tinfo.pstart) or sinfo.weight > tinfo.weight:
+                        flag = False
+
+        return flag
+
+    def normal_type_infos(self, infos):
+        typeinfos = []
+        for info in infos:
+            typeinfos = [x for x in typeinfos if self.save_info(x, info)]
+            flag = True
+            for i, typeinfo in enumerate(typeinfos):
+                if not self.save_info(info, typeinfo):
+                    flag = False
+                    break
+                if info.pstart < typeinfo.pstart:
+                    typeinfos.insert(i, info)
+                    flag = False
+                    break
+            if flag:
+                typeinfos.append(info)
+        return typeinfos
+
+    def findnear_typeinfo(self, info1, info2):
+        return self.findnear(info1.pstart, info1.pend, info2.pstart,
+                             info2.pend)
+
+    def find_real_column(self, infos, table):
+        for i, vinfo in enumerate(infos):
+            if vinfo.index != -1 or vinfo.label != 'val':
+                continue
+            eoidx = -1
+            for j, oinfo in enumerate(infos):
+                if oinfo.label != 'op':
+                    continue
+                if self.findnear_typeinfo(vinfo, oinfo):
+                    eoidx = j
+                    break
+            for j, cinfo in enumerate(infos):
+                if cinfo.label != 'col' or table['header_types'][
+                        cinfo.index] != 'real':
+                    continue
+                if self.findnear_typeinfo(cinfo, vinfo) or (
+                        eoidx != -1
+                        and self.findnear_typeinfo(cinfo, infos[eoidx])):
+                    infos[i].index = cinfo.index
+                    break
+
+        return infos
+
+    def filter_column_infos(self, infos):
+        delid = []
+        for i, info in enumerate(infos):
+            if info.label != 'col':
+                continue
+            for j in range(i + 1, len(infos), 1):
+                if infos[j].label == 'col' and \
+                        info.pstart == infos[j].pstart and \
+                        info.pend == infos[j].pend:
+                    delid.append(i)
+                    delid.append(j)
+                    break
+
+        typeinfos = []
+        for idx, info in enumerate(infos):
+            if idx in set(delid):
+                continue
+            typeinfos.append(info)
+
+        return typeinfos
+
+    def filter_type_infos(self, infos, table):
+        infos = self.filter_column_infos(infos)
+        infos = self.find_real_column(infos, table)
+
+        colvalMp = {}
+        for info in infos:
+            if info.label == 'col':
+                colvalMp[info.index] = []
+        for info in infos:
+            if info.label == 'val' and info.index in colvalMp:
+                colvalMp[info.index].append(info)
+
+        delid = []
+        for idx, info in enumerate(infos):
+            if info.label != 'val' or info.index in colvalMp:
+                continue
+            for index in colvalMp.keys():
+                valinfos = colvalMp[index]
+                for valinfo in valinfos:
+                    if valinfo.pstart <= info.pstart and \
+                            valinfo.pend >= info.pend:
+                        delid.append(idx)
+                        break
+
+        typeinfos = []
+        for idx, info in enumerate(infos):
+            if idx in set(delid):
+                continue
+            typeinfos.append(info)
+
+        return typeinfos
+
+    def get_table_match_score(self, nlu_t, schema_link):
+        """Weighted matched length of ``schema_link`` over question length.
+
+        Column-bound links with a non-empty span weigh 1.0, others 0.5.
+        """
+        match_len = 0
+        for info in schema_link:
+            linked = info['question_len'] > 0 and info['column_index'] != -1
+            scale = 1.0 if linked else 0.5
+            match_len += scale * info['question_len'] * info['weight']
+        return match_len / (len(nlu_t) + 0.1)
+
+    def get_entity_linking(self, tokenizer, nlu, nlu_t, tables, col_syn_dict):
+        """
+        Link question spans to the columns and cell values of every table and
+        return the (at most) four best-scoring per-table results, best first.
+        """
+        numbers = re.findall(r'[-]?\d*\.\d+|[-]?\d+|\d+', nlu)
+
+        # search schema link in every table
+        search_result_list = []
+        for tablename in tables:
+            table = tables[tablename]
+            trie_set = []  # stays empty when the table has no value trie
+            if 'value_trie' in table:
+                trie_set = table['value_trie']
+
+            typeinfos = []
+            for ii, column in enumerate(table['header_name']):
+                column = column.lower()
+                # drop parenthesised notes: ASCII parens, then full-width ones
+                column_new = re.sub(r'\(.*?\)', '', column)
+                column_new = re.sub('（.*?）', '', column_new)
+                cphrase, cscore = self.get_match_phrase(
+                    nlu.lower(), column_new)
+                if cscore > 0.3 and cphrase.strip() != '':
+                    phrase_tok = tokenizer.tokenize(cphrase)
+                    cidxs = self.allfindpairidx(nlu_t, phrase_tok, cscore)
+                    typeinfos = self.add_type_all(typeinfos, ii, cidxs, 'col',
+                                                  'column', cphrase, column)
+                if cscore < 0.8 and column_new in col_syn_dict:
+                    columns = list(set(col_syn_dict[column_new]))
+                    for syn_col in columns:
+                        if syn_col not in nlu.lower() or syn_col == '':
+                            continue
+                        phrase_tok = tokenizer.tokenize(syn_col)
+                        cidxs = self.allfindpairidx(nlu_t, phrase_tok, 1.0)
+                        typeinfos = self.add_type_all(typeinfos, ii, cidxs,
+                                                      'col', 'column', syn_col,
+                                                      column)
+
+            for ii, trie in enumerate(trie_set):
+                ans = trie.match(nlu.lower())
+                for cell in ans.keys():
+                    vphrase = cell
+                    vscore = 1.0
+                    phrase_tok = tokenizer.tokenize(vphrase)
+                    if len(phrase_tok) == 0 or len(vphrase) < 2:
+                        continue
+                    vidxs = self.allfindpairidx(nlu_t, phrase_tok, vscore)
+                    linktype = self.get_column_type(ii, table)
+                    typeinfos = self.add_type_all(typeinfos, ii, vidxs, 'val',
+                                                  linktype, vphrase, ans[cell])
+
+            for number in set(numbers):
+                number_tok = tokenizer.tokenize(number.lower())
+                if len(number_tok) == 0:
+                    continue
+                nidxs = self.allfindpairidx(nlu_t, number_tok, 1.0)
+                typeinfos = self.add_type_all(typeinfos, -1, nidxs, 'val',
+                                              'real', number, number)
+
+            newtypeinfos = self.normal_type_infos(typeinfos)
+
+            newtypeinfos = self.filter_type_infos(newtypeinfos, table)
+
+            final_question = [0] * len(nlu_t)
+            final_header = [0] * len(table['header_name'])
+            for typeinfo in newtypeinfos:
+                pstart = typeinfo.pstart
+                pend = typeinfo.pend + 1
+                if typeinfo.label == 'op' or typeinfo.label == 'agg':
+                    score = int(typeinfo.linktype[-1])
+                    if typeinfo.label == 'op':
+                        score += 6
+                    else:
+                        score += 11
+                    for i in range(pstart, pend, 1):
+                        final_question[i] = score
+
+                elif typeinfo.label == 'col':
+                    for i in range(pstart, pend, 1):
+                        final_question[i] = 4
+                    if final_header[typeinfo.index] % 2 == 0:
+                        final_header[typeinfo.index] += 1
+
+                elif typeinfo.label == 'val':
+                    if typeinfo.index == -1:
+                        for i in range(pstart, pend, 1):
+                            final_question[i] = 5
+                    else:
+                        for i in range(pstart, pend, 1):
+                            final_question[i] = 2
+                        final_question[pstart] = 1
+                        final_question[pend - 1] = 3
+                        if final_header[typeinfo.index] < 2:
+                            final_header[typeinfo.index] += 2
+
+            # collect schema_link
+            schema_link = []
+            for sl in newtypeinfos:
+                if sl.label in ['val', 'col']:
+                    schema_link.append({
+                        'question_len':
+                        max(0, sl.pend - sl.pstart + 1),
+                        'question_index': [sl.pstart, sl.pend],
+                        'question_span':
+                        ''.join(nlu_t[sl.pstart:sl.pend + 1]),
+                        'column_index':
+                        sl.index,
+                        'column_span':
+                        table['header_name'][sl.index]
+                        if sl.index != -1 else '空列',
+                        'label':
+                        sl.label,
+                        'weight':
+                        round(sl.weight, 4)
+                    })
+
+            # get the match score of each table
+            match_score = self.get_table_match_score(nlu_t, schema_link)
+
+            search_result = {
+                'table_id': table['table_id'],
+                'question_knowledge': final_question,
+                'header_knowledge': final_header,
+                'schema_link': schema_link,
+                'match_score': match_score
+            }
+            search_result_list.append(search_result)
+
+        search_result_list = sorted(
+            search_result_list, key=lambda x: x['match_score'],
+            reverse=True)[0:4]
+
+        return search_result_list
diff --git a/modelscope/preprocessors/star3/fields/struct.py b/modelscope/preprocessors/star3/fields/struct.py
new file mode 100644
index 00000000..3c2e664b
--- /dev/null
+++ b/modelscope/preprocessors/star3/fields/struct.py
@@ -0,0 +1,181 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+cond_ops = ['>', '<', '==', '!=', 'ASC', 'DESC']
+agg_ops = [
+    '', 'AVG', 'MAX', 'MIN', 'COUNT', 'SUM', 'COMPARE', 'GROUP BY', 'SAME'
+]
+conn_ops = ['', 'AND', 'OR']
+
+
+class Context:
+
+    def __init__(self):
+        self.history_sql = None
+
+    def set_history_sql(self, sql):
+        self.history_sql = sql
+
+
+class SQLQuery:
+
+    def __init__(self, string, query, sql_result):
+        self.string = string
+        self.query = query
+        self.sql_result = sql_result
+
+
+class TrieNode(object):
+
+    def __init__(self):
+        """
+        Initialize your data structure here.
+        """
+        self.data = {}
+        self.is_word = False
+        self.term = None
+
+
+class Trie(object):
+
+    def __init__(self):
+        self.root = TrieNode()
+
+    def insert(self, word, term):
+        """
+        Inserts a word into the trie.
+        :type word: str
+        :rtype: void
+        """
+        node = self.root
+        for letter in word:
+            child = node.data.get(letter)
+            if not child:
+                node.data[letter] = TrieNode()
+            node = node.data[letter]
+        node.is_word = True
+        node.term = term
+
+    def search(self, word):
+        """
+        Walk ``word`` from the root; found is True for any stored prefix.
+        :type word: str
+        :rtype: tuple (term, found); term is the node's term, None if unset
+        """
+        node = self.root
+        for letter in word:
+            node = node.data.get(letter)
+            if not node:
+                return None, False
+        return node.term, True
+
+    def match(self, query):
+        """Return {substring: term} for inserted words found in ``query``."""
+        start, end, ans = 0, 1, {}
+        length = len(query)
+        # ``end`` is an exclusive slice bound, so it must be able to reach length
+        while start < length and end <= length:
+            sub = query[start:end]
+            term, flag = self.search(sub)
+            if flag:
+                if term is not None:
+                    ans[sub] = term
+                end += 1
+            else:
+                start += 1
+                end = start + 1
+        return ans
+
+    def starts_with(self, prefix):
+        """
+        Returns if there is any word in the trie
+        that starts with the given prefix.
+        :type prefix: str
+        :rtype: bool
+        """
+        node = self.root
+        for letter in prefix:
+            node = node.data.get(letter)
+            if not node:
+                return False
+        return True
+
+    def get_start(self, prefix):
+        """
+        Return inserted words starting with ``prefix`` (just ``[prefix]`` if
+        ``prefix`` itself was inserted).
+        :return: words (list); empty if nothing starts with ``prefix``
+        """
+
+        def _get_key(pre, pre_node):
+            words_list = []
+            if pre_node.is_word:
+                words_list.append(pre)
+            for x in pre_node.data.keys():
+                words_list.extend(_get_key(pre + str(x), pre_node.data.get(x)))
+            return words_list
+
+        node = self.root
+        for letter in prefix:
+            node = node.data.get(letter)
+            if not node:
+                return []
+        # ``search`` returns a (term, found) tuple, which is always truthy;
+        # test the node's own flag so the subtree is actually enumerated.
+        if node.is_word:
+            return [prefix]
+        return _get_key(prefix, node)
+
+
+class TypeInfo:
+
+    def __init__(self, label, index, linktype, value, orgvalue, pstart, pend,
+                 weight):
+        self.label = label
+        self.index = index
+        self.linktype = linktype
+        self.value = value
+        self.orgvalue = orgvalue
+        self.pstart = pstart
+        self.pend = pend
+        self.weight = weight
+
+
+class Constant:
+
+    def __init__(self):
+        self.action_ops = [
+            'add_cond', 'change_cond', 'del_cond', 'change_focus_total',
+            'change_agg_only', 'del_focus', 'restart', 'switch_table',
+            'out_of_scripts', 'repeat', 'firstTurn'
+        ]
+
+        self.agg_ops = [
+            '', 'AVG', 'MAX', 'MIN', 'COUNT', 'SUM', 'COMPARE', 'GROUP BY',
+            'SAME'
+        ]
+
+        self.cond_ops = ['>', '<', '==', '!=', 'ASC', 'DESC']
+
+        self.cond_conn_ops = ['', 'AND', 'OR']
+
+        self.col_type_dict = {
+            'null': 0,
+            'text': 1,
+            'number': 2,
+            'duration': 3,
+            'bool': 4,
+            'date': 5
+        }
+
+        self.schema_link_dict = {
+            'col_start': 1,
+            'col_middle': 2,
+            'col_end': 3,
+            'val_start': 4,
+            'val_middle': 5,
+            'val_end': 6
+        }
+
+        self.max_select_num = 4
+
+        self.max_where_num = 6
diff --git a/modelscope/preprocessors/star3/table_question_answering_preprocessor.py b/modelscope/preprocessors/star3/table_question_answering_preprocessor.py
new file mode 100644
index 00000000..163759a1
--- /dev/null
+++ b/modelscope/preprocessors/star3/table_question_answering_preprocessor.py
@@ -0,0 +1,118 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import Any, Dict
+
+import torch
+from transformers import BertTokenizer
+
+from modelscope.metainfo import Preprocessors
+from modelscope.preprocessors.base import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
+from modelscope.preprocessors.star3.fields.database import Database
+from modelscope.preprocessors.star3.fields.schema_link import SchemaLinker
+from modelscope.utils.config import Config
+from modelscope.utils.constant import Fields, ModelFile
+from modelscope.utils.type_assert import type_assert
+
+__all__ = ['TableQuestionAnsweringPreprocessor']
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp,
+    module_name=Preprocessors.table_question_answering_preprocessor)
+class TableQuestionAnsweringPreprocessor(Preprocessor):
+
+    def __init__(self, model_dir: str, db: Database = None, *args, **kwargs):
+        """preprocess the data
+
+        Args:
+            model_dir (str): model path
+            db (Database): database instance
+        """
+        super().__init__(*args, **kwargs)
+
+        self.model_dir: str = model_dir
+        self.config = Config.from_file(
+            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
+
+        # read tokenizer
+        self.tokenizer = BertTokenizer(
+            os.path.join(self.model_dir, ModelFile.VOCAB_FILE))
+
+        # read database
+        if db is None:
+            self.db = Database(
+                tokenizer=self.tokenizer,
+                table_file_path=os.path.join(self.model_dir, 'table.json'),
+                syn_dict_file_path=os.path.join(self.model_dir, 'synonym.txt'))
+        else:
+            self.db = db
+
+        # get schema linker
+        self.schema_linker = SchemaLinker()
+
+        # set device
+        self.device = 'cuda' if \
+            ('device' not in kwargs or kwargs['device'] == 'gpu') \
+            and torch.cuda.is_available() else 'cpu'
+
+    def construct_data(self, search_result_list, nlu, nlu_t, db, history_sql):
+        datas = []
+        for search_result in search_result_list:
+            data = {}
+            data['table_id'] = search_result['table_id']
+            data['question'] = nlu
+            data['question_tok'] = nlu_t
+            data['header_tok'] = db.tables[data['table_id']]['header_tok']
+            data['types'] = db.tables[data['table_id']]['header_types']
+            data['units'] = db.tables[data['table_id']]['header_units']
+            data['action'] = 0
+            data['sql'] = None
+            data['history_sql'] = history_sql
+            data['wvi_corenlp'] = []
+            data['bertindex_knowledge'] = search_result['question_knowledge']
+            data['header_knowledge'] = search_result['header_knowledge']
+            data['schema_link'] = search_result['schema_link']
+            datas.append(data)
+
+        return datas
+
+    @type_assert(object, dict)
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        """process the raw input data
+
+        Args:
+            data (dict):
+                question: a sentence; lower-cased and tokenized here
+                history_sql: passed through unchanged into each result
+                    (presumably the SQL of the previous turn -- confirm)
+                Example:
+                    question: 'Which of these are hiring?'
+
+        Returns:
+            Dict[str, Any]: {'datas': [...]} built from the top-ranked table
+        """
+
+        # tokenize question
+        question = data['question']
+        history_sql = data['history_sql']
+        nlu = question.lower()
+        nlu_t = self.tokenizer.tokenize(nlu)
+
+        # get linking
+        search_result_list = self.schema_linker.get_entity_linking(
+            tokenizer=self.tokenizer,
+            nlu=nlu,
+            nlu_t=nlu_t,
+            tables=self.db.tables,
+            col_syn_dict=self.db.syn_dict)
+
+        # collect data
+        datas = self.construct_data(
+            search_result_list=search_result_list[0:1],
+            nlu=nlu,
+            nlu_t=nlu_t,
+            db=self.db,
+            history_sql=history_sql)
+
+        return {'datas': datas}
diff --git a/modelscope/preprocessors/video.py b/modelscope/preprocessors/video.py
index 36110d1b..f693cd9e 100644
--- a/modelscope/preprocessors/video.py
+++ b/modelscope/preprocessors/video.py
@@ -9,35 +9,56 @@ import torchvision.transforms._transforms_video as transforms
 from decord import VideoReader
 from torchvision.transforms import Compose
 
+from modelscope.metainfo import Preprocessors
+from modelscope.utils.constant import Fields, ModeKeys
+from modelscope.utils.type_assert import type_assert
+from .base import Preprocessor
+from .builder import PREPROCESSORS
 
-def ReadVideoData(cfg, video_path):
+
+def ReadVideoData(cfg,
+                  video_path,
+                  num_spatial_crops_override=None,
+                  num_temporal_views_override=None):
     """ simple interface to load video frames from file
 
     Args:
         cfg (Config): The global config object.
         video_path (str): video file path
+        num_spatial_crops_override (int): the spatial crops per clip
+        num_temporal_views_override (int): the temporal clips per video
+    Returns:
+        data (Tensor): the normalized video clips for model inputs
     """
-    data = _decode_video(cfg, video_path)
-    transform = kinetics400_tranform(cfg)
+    data = _decode_video(cfg, video_path, num_temporal_views_override)
+    if num_spatial_crops_override is not None:
+        num_spatial_crops = num_spatial_crops_override
+        transform = kinetics400_tranform(cfg, num_spatial_crops_override)
+    else:
+        num_spatial_crops = cfg.TEST.NUM_SPATIAL_CROPS
+        transform = kinetics400_tranform(cfg, cfg.TEST.NUM_SPATIAL_CROPS)
     data_list = []
     for i in range(data.size(0)):
-        for j in range(cfg.TEST.NUM_SPATIAL_CROPS):
+        for j in range(num_spatial_crops):
             transform.transforms[1].set_spatial_index(j)
             data_list.append(transform(data[i]))
     return torch.stack(data_list, dim=0)
 
 
-def kinetics400_tranform(cfg):
+def kinetics400_tranform(cfg, num_spatial_crops):
     """
     Configs the transform for the kinetics-400 dataset.
     We apply controlled spatial cropping and normalization.
     Args:
         cfg (Config): The global config object.
+        num_spatial_crops (int): the spatial crops per clip
+    Returns:
+        transform_function (Compose): the transform function for input clips
     """
     resize_video = KineticsResizedCrop(
         short_side_range=[cfg.DATA.TEST_SCALE, cfg.DATA.TEST_SCALE],
         crop_size=cfg.DATA.TEST_CROP_SIZE,
-        num_spatial_crops=cfg.TEST.NUM_SPATIAL_CROPS)
+        num_spatial_crops=num_spatial_crops)
     std_transform_list = [
         transforms.ToTensorVideo(), resize_video,
         transforms.NormalizeVideo(
@@ -54,17 +75,17 @@ def _interval_based_sampling(vid_length, vid_fps, target_fps, clip_idx,
             vid_length  (int): the length of the whole video (valid selection range).
             vid_fps     (int): the original video fps
             target_fps  (int): the normalized video fps
-            clip_idx    (int): -1 for random temporal sampling, and positive values for
-                                sampling specific clip from the video
+            clip_idx    (int): -1 for random temporal sampling, and positive values for sampling specific
+                                clip from the video
             num_clips   (int): the total clips to be sampled from each video.
-                                combined with clip_idx, the sampled video is the "clip_idx-th"
-                                 video from "num_clips" videos.
+                                combined with clip_idx, the sampled video is the "clip_idx-th" video from
+                                "num_clips" videos.
             num_frames  (int): number of frames in each sampled clips.
             interval    (int): the interval to sample each frame.
             minus_interval (bool): control the end index
         Returns:
             index (tensor): the sampled frame indexes
-        """
+    """
     if num_frames == 1:
         index = [random.randint(0, vid_length - 1)]
     else:
@@ -72,7 +93,10 @@ def _interval_based_sampling(vid_length, vid_fps, target_fps, clip_idx,
         clip_length = num_frames * interval * vid_fps / target_fps
 
         max_idx = max(vid_length - clip_length, 0)
-        start_idx = clip_idx * math.floor(max_idx / (num_clips - 1))
+        if num_clips == 1:
+            start_idx = max_idx / 2
+        else:
+            start_idx = clip_idx * math.floor(max_idx / (num_clips - 1))
         if minus_interval:
             end_idx = start_idx + clip_length - interval
         else:
@@ -84,59 +108,79 @@ def _interval_based_sampling(vid_length, vid_fps, target_fps, clip_idx,
     return index
 
 
-def _decode_video_frames_list(cfg, frames_list, vid_fps):
+def _decode_video_frames_list(cfg,
+                              frames_list,
+                              vid_fps,
+                              num_temporal_views_override=None):
     """
         Decodes the video given the numpy frames.
         Args:
             cfg          (Config): The global config object.
             frames_list  (list):  all frames for a video, the frames should be numpy array.
             vid_fps      (int):  the fps of this video.
+            num_temporal_views_override (int): the temporal clips per video
         Returns:
-            frames            (Tensor): video tensor data
+            frames            (Tensor): stacked clips, dim 0 is the clip index (kept for a single clip too)
     """
     assert isinstance(frames_list, list)
-    num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS
+    if num_temporal_views_override is not None:
+        num_clips_per_video = num_temporal_views_override
+    else:
+        num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS
 
     frame_list = []
     for clip_idx in range(num_clips_per_video):
         # for each clip in the video,
         # a list is generated before decoding the specified frames from the video
         list_ = _interval_based_sampling(
-            len(frames_list), vid_fps, cfg.DATA.TARGET_FPS, clip_idx,
-            num_clips_per_video, cfg.DATA.NUM_INPUT_FRAMES,
-            cfg.DATA.SAMPLING_RATE, cfg.DATA.MINUS_INTERVAL)
+            len(frames_list),
+            vid_fps,
+            cfg.DATA.TARGET_FPS,
+            clip_idx,
+            num_clips_per_video,
+            cfg.DATA.NUM_INPUT_FRAMES,
+            cfg.DATA.SAMPLING_RATE,
+            cfg.DATA.MINUS_INTERVAL,
+        )
         frames = None
         frames = torch.from_numpy(
-            np.stack([frames_list[l_index] for l_index in list_.tolist()],
-                     axis=0))
+            np.stack([frames_list[index] for index in list_.tolist()], axis=0))
         frame_list.append(frames)
     frames = torch.stack(frame_list)
-    if num_clips_per_video == 1:
-        frames = frames.squeeze(0)
 
     return frames
 
 
-def _decode_video(cfg, path):
+def _decode_video(cfg, path, num_temporal_views_override=None):
     """
         Decodes the video given the numpy frames.
         Args:
+            cfg          (Config): The global config object.
             path          (str): video file path.
+            num_temporal_views_override (int): the temporal clips per video
         Returns:
             frames            (Tensor): video tensor data
     """
     vr = VideoReader(path)
-
-    num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS
+    if num_temporal_views_override is not None:
+        num_clips_per_video = num_temporal_views_override
+    else:
+        num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS
 
     frame_list = []
     for clip_idx in range(num_clips_per_video):
         # for each clip in the video,
         # a list is generated before decoding the specified frames from the video
         list_ = _interval_based_sampling(
-            len(vr), vr.get_avg_fps(), cfg.DATA.TARGET_FPS, clip_idx,
-            num_clips_per_video, cfg.DATA.NUM_INPUT_FRAMES,
-            cfg.DATA.SAMPLING_RATE, cfg.DATA.MINUS_INTERVAL)
+            len(vr),
+            vr.get_avg_fps(),
+            cfg.DATA.TARGET_FPS,
+            clip_idx,
+            num_clips_per_video,
+            cfg.DATA.NUM_INPUT_FRAMES,
+            cfg.DATA.SAMPLING_RATE,
+            cfg.DATA.MINUS_INTERVAL,
+        )
         frames = None
         if path.endswith('.avi'):
             append_list = torch.arange(0, list_[0], 4)
@@ -149,8 +193,6 @@ def _decode_video(cfg, path):
                 vr.get_batch(list_).to_dlpack()).clone()
         frame_list.append(frames)
     frames = torch.stack(frame_list)
-    if num_clips_per_video == 1:
-        frames = frames.squeeze(0)
     del vr
     return frames
 
@@ -218,6 +260,29 @@ class KineticsResizedCrop(object):
                     y = y_max // 2
         return new_clip[:, :, y:y + self.crop_size, x:x + self.crop_size]
 
+    def _get_random_crop(self, clip):
+        """Resize ``clip`` to a random short side, then cut a random crop.
+
+        ``clip`` is unpacked as 4-D with height and width last; the result is a
+        ``self.crop_size`` x ``self.crop_size`` window of the resized clip.
+        """
+        _, _, height, width = clip.shape
+        # the short side is sampled; the long side follows the aspect ratio
+        new_short = int(random.uniform(*self.short_side_range))
+        new_long = int(max(height, width) / min(height, width) * new_short)
+        if height < width:
+            out_h, out_w = new_short, new_long
+        else:
+            out_h, out_w = new_long, new_short
+
+        resized = torch.nn.functional.interpolate(
+            clip, size=(out_h, out_w), mode='bilinear')
+
+        # draw the x offset before the y offset (fixed RNG call sequence)
+        x0 = int(random.uniform(0, int(out_w - self.crop_size)))
+        y0 = int(random.uniform(0, int(out_h - self.crop_size)))
+        return resized[:, :, y0:y0 + self.crop_size, x0:x0 + self.crop_size]
+
     def set_spatial_index(self, idx):
         """Set the spatial cropping index for controlled cropping..
         Args:
@@ -227,3 +292,42 @@ class KineticsResizedCrop(object):
 
     def __call__(self, clip):
         return self._get_controlled_crop(clip)
+
+
+@PREPROCESSORS.register_module(
+    Fields.cv, module_name=Preprocessors.movie_scene_segmentation_preprocessor)
+class MovieSceneSegmentationPreprocessor(Preprocessor):
+    """Train/test transform wrapper for movie scene segmentation."""
+
+    def __init__(self, *args, **kwargs):
+        """Build the train and test transforms from keyword arguments."""
+        super().__init__(*args, **kwargs)
+
+        self.is_train = kwargs.get('is_train', True)
+        self.preprocessor_train_cfg = kwargs.get(ModeKeys.TRAIN, None)
+        self.preprocessor_test_cfg = kwargs.get(ModeKeys.EVAL, None)
+        self.num_keyframe = kwargs.get('num_keyframe', 3)
+
+        from .movie_scene_segmentation import get_transform
+        self.train_transform = get_transform(self.preprocessor_train_cfg)
+        self.test_transform = get_transform(self.preprocessor_test_cfg)
+
+    def train(self):
+        """Switch ``__call__`` to the training transform."""
+        self.is_train = True
+
+    def eval(self):
+        """Switch ``__call__`` to the test transform."""
+        self.is_train = False
+
+    @type_assert(object, object)
+    def __call__(self, results):
+        """Run the transform of the current mode and batch its outputs.
+
+        Returns:
+            Tensor viewed as (-1, num_keyframe, 3, 224, 224).
+        """
+        pipeline = (
+            self.train_transform if self.is_train else self.test_transform)
+        stacked = torch.stack(pipeline(results), dim=0)
+        return stacked.view(-1, self.num_keyframe, 3, 224, 224)
diff --git a/modelscope/trainers/__init__.py b/modelscope/trainers/__init__.py
index 17ed7f3c..a632642a 100644
--- a/modelscope/trainers/__init__.py
+++ b/modelscope/trainers/__init__.py
@@ -4,25 +4,28 @@ from typing import TYPE_CHECKING
 from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
+    from .audio.ans_trainer import ANSTrainer
     from .base import DummyTrainer
     from .builder import build_trainer
     from .cv import (ImageInstanceSegmentationTrainer,
-                     ImagePortraitEnhancementTrainer)
+                     ImagePortraitEnhancementTrainer,
+                     MovieSceneSegmentationTrainer)
     from .multi_modal import CLIPTrainer
-    from .nlp import SequenceClassificationTrainer
+    from .nlp import SequenceClassificationTrainer, PassageRankingTrainer
     from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer
     from .trainer import EpochBasedTrainer
 
 else:
     _import_structure = {
+        'audio.ans_trainer': ['ANSTrainer'],
         'base': ['DummyTrainer'],
         'builder': ['build_trainer'],
         'cv': [
             'ImageInstanceSegmentationTrainer',
-            'ImagePortraitEnhancementTrainer'
+            'ImagePortraitEnhancementTrainer', 'MovieSceneSegmentationTrainer'
         ],
         'multi_modal': ['CLIPTrainer'],
-        'nlp': ['SequenceClassificationTrainer'],
+        'nlp': ['SequenceClassificationTrainer', 'PassageRankingTrainer'],
         'nlp_trainer': ['NlpEpochBasedTrainer', 'VecoTrainer'],
         'trainer': ['EpochBasedTrainer']
     }
diff --git a/modelscope/trainers/audio/__init__.py b/modelscope/trainers/audio/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/trainers/audio/ans_trainer.py b/modelscope/trainers/audio/ans_trainer.py
new file mode 100644
index 00000000..37b201ce
--- /dev/null
+++ b/modelscope/trainers/audio/ans_trainer.py
@@ -0,0 +1,52 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from modelscope.metainfo import Trainers
+from modelscope.trainers import EpochBasedTrainer
+from modelscope.trainers.builder import TRAINERS
+from modelscope.utils.constant import TrainerStages
+from modelscope.utils.data_utils import to_device
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@TRAINERS.register_module(module_name=Trainers.speech_frcrn_ans_cirm_16k)
+class ANSTrainer(EpochBasedTrainer):
+    """Trainer for acoustic noise suppression (ANS).
+
+    ``train_loop()`` is overridden so the dataset is consumed only once.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def train_loop(self, data_loader):
+        """Run training, closing an epoch after ``iters_per_epoch`` steps.
+
+        One loader iterator is shared by all epochs (no restart per epoch).
+        """
+        self.invoke_hook(TrainerStages.before_run)
+        self._epoch = 0
+        self.model.train()
+        batches = iter(data_loader)
+        for _ in range(self._epoch, self._max_epochs):
+            self.invoke_hook(TrainerStages.before_train_epoch)
+            self._inner_iter = 0
+            for batch in batches:
+                batch = to_device(batch, self.device)
+                self.data_batch = batch
+                self._inner_iter += 1
+                self.invoke_hook(TrainerStages.before_train_iter)
+                self.train_step(self.model, batch)
+                self.invoke_hook(TrainerStages.after_train_iter)
+                del self.data_batch
+                self._iter += 1
+                if self._inner_iter >= self.iters_per_epoch:
+                    break
+
+            self.invoke_hook(TrainerStages.after_train_epoch)
+            self._epoch += 1
+
+        self.invoke_hook(TrainerStages.after_run)
+
+    def prediction_step(self, model, inputs):
+        """Prediction is not implemented for this trainer."""
diff --git a/modelscope/trainers/cv/__init__.py b/modelscope/trainers/cv/__init__.py
index 99c2aea5..4c65870e 100644
--- a/modelscope/trainers/cv/__init__.py
+++ b/modelscope/trainers/cv/__init__.py
@@ -7,6 +7,7 @@ if TYPE_CHECKING:
     from .image_instance_segmentation_trainer import \
         ImageInstanceSegmentationTrainer
     from .image_portrait_enhancement_trainer import ImagePortraitEnhancementTrainer
+    from .movie_scene_segmentation_trainer import MovieSceneSegmentationTrainer
 
 else:
     _import_structure = {
@@ -14,6 +15,7 @@ else:
         ['ImageInstanceSegmentationTrainer'],
         'image_portrait_enhancement_trainer':
         ['ImagePortraitEnhancementTrainer'],
+        'movie_scene_segmentation_trainer': ['MovieSceneSegmentationTrainer']
     }
 
     import sys
diff --git a/modelscope/trainers/cv/image_instance_segmentation_trainer.py b/modelscope/trainers/cv/image_instance_segmentation_trainer.py
index 2e2415dc..a777bde1 100644
--- a/modelscope/trainers/cv/image_instance_segmentation_trainer.py
+++ b/modelscope/trainers/cv/image_instance_segmentation_trainer.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from modelscope.metainfo import Trainers
 from modelscope.trainers.builder import TRAINERS
 from modelscope.trainers.trainer import EpochBasedTrainer
diff --git a/modelscope/trainers/cv/movie_scene_segmentation_trainer.py b/modelscope/trainers/cv/movie_scene_segmentation_trainer.py
new file mode 100644
index 00000000..7645f9f3
--- /dev/null
+++ b/modelscope/trainers/cv/movie_scene_segmentation_trainer.py
@@ -0,0 +1,21 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from modelscope.metainfo import Trainers
+from modelscope.trainers.builder import TRAINERS
+from modelscope.trainers.trainer import EpochBasedTrainer
+
+
+@TRAINERS.register_module(module_name=Trainers.movie_scene_segmentation)
+class MovieSceneSegmentationTrainer(EpochBasedTrainer):
+    """Movie scene segmentation trainer; defers to ``EpochBasedTrainer``."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def train(self, *args, **kwargs):
+        super().train(*args, **kwargs)
+
+    def evaluate(self, *args, **kwargs):
+        return super().evaluate(*args, **kwargs)
+
+    def prediction_step(self, model, inputs):
+        """Prediction is not implemented for this trainer."""
diff --git a/modelscope/trainers/default_config.py b/modelscope/trainers/default_config.py
index 69fdd400..c8f0c7b0 100644
--- a/modelscope/trainers/default_config.py
+++ b/modelscope/trainers/default_config.py
@@ -1,4 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+
+from modelscope.utils.config import Config
+
 DEFAULT_CONFIG = {
     'train': {
         'hooks': [{
@@ -12,3 +15,19 @@ DEFAULT_CONFIG = {
         }]
     }
 }
+
+
+def merge_cfg(cfg: Config):
+    """Merge ``DEFAULT_CONFIG`` into ``cfg`` (called with ``force=False``).
+
+    When a ``BestCkptSaverHook`` is present in ``cfg.train.hooks`` after the
+    merge, all ``CheckpointHook`` entries are dropped as duplicates.
+
+    Args:
+        cfg: The input cfg to be merged into; modified in place.
+    """
+    cfg.merge_from_dict(DEFAULT_CONFIG, force=False)
+    hook_types = [hook['type'] for hook in cfg.train.hooks]
+    if 'BestCkptSaverHook' in hook_types:
+        cfg.train.hooks = [
+            h for h in cfg.train.hooks if h['type'] != 'CheckpointHook']
diff --git a/modelscope/trainers/easycv/__init__.py b/modelscope/trainers/easycv/__init__.py
new file mode 100644
index 00000000..b1b8fc15
--- /dev/null
+++ b/modelscope/trainers/easycv/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .utils import AddLrLogHook, EasyCVMetric
+else:
+    _import_structure = {'utils': ['AddLrLogHook', 'EasyCVMetric']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/trainers/easycv/trainer.py b/modelscope/trainers/easycv/trainer.py
new file mode 100644
index 00000000..3c869495
--- /dev/null
+++ b/modelscope/trainers/easycv/trainer.py
@@ -0,0 +1,167 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from functools import partial
+from typing import Callable, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.utils.data import Dataset
+
+from modelscope.metainfo import Trainers
+from modelscope.models.base import TorchModel
+from modelscope.msdatasets import MsDataset
+from modelscope.preprocessors import Preprocessor
+from modelscope.trainers import EpochBasedTrainer
+from modelscope.trainers.base import TRAINERS
+from modelscope.trainers.easycv.utils import register_util
+from modelscope.trainers.hooks import HOOKS
+from modelscope.trainers.parallel.builder import build_parallel
+from modelscope.trainers.parallel.utils import is_parallel
+from modelscope.utils.config import Config
+from modelscope.utils.constant import DEFAULT_MODEL_REVISION
+from modelscope.utils.import_utils import LazyImportModule
+from modelscope.utils.registry import default_group
+
+
+@TRAINERS.register_module(module_name=Trainers.easycv)
+class EasyCVEpochBasedTrainer(EpochBasedTrainer):
+    """Epoch based Trainer for EasyCV.
+
+    Args:
+        cfg_file(str): The config file of EasyCV.
+        model (:obj:`torch.nn.Module` or :obj:`TorchModel` or `str`): The model to be run, or a valid model dir
+            or a model id. If model is None, build_model method will be called.
+        train_dataset (`MsDataset` or `torch.utils.data.Dataset`, *optional*):
+            The dataset to use for training.
+            Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a
+            distributed fashion, your iterable dataset should either use an internal attribute `generator` that is a
+            `torch.Generator` for the randomization that must be identical on all processes (and the Trainer will
+            manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally
+            sets the seed of the RNGs used.
+        eval_dataset (`MsDataset` or `torch.utils.data.Dataset`, *optional*): The dataset to use for evaluation.
+        preprocessor (:obj:`Preprocessor`, *optional*): The optional preprocessor.
+            NOTE: If the preprocessor has been called before the dataset fed into this trainer by user's custom code,
+            this parameter should be None, meanwhile remove the 'preprocessor' key from the cfg_file.
+            Else the preprocessor will be instantiated from the cfg_file or assigned from this parameter and
+            this preprocessing action will be executed every time the dataset's __getitem__ is called.
+        optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler._LRScheduler]`, *optional*): A tuple
+            containing the optimizer and the scheduler to use.
+        max_epochs: (int, optional): Total training epochs.
+    """
+
+    def __init__(
+            self,
+            cfg_file: Optional[str] = None,
+            model: Optional[Union[TorchModel, nn.Module, str]] = None,
+            arg_parse_fn: Optional[Callable] = None,  # passed through to EpochBasedTrainer.__init__
+            train_dataset: Optional[Union[MsDataset, Dataset]] = None,
+            eval_dataset: Optional[Union[MsDataset, Dataset]] = None,
+            preprocessor: Optional[Preprocessor] = None,
+            optimizers: Tuple[torch.optim.Optimizer,
+                              torch.optim.lr_scheduler._LRScheduler] = (None,
+                                                                        None),
+            model_revision: Optional[str] = DEFAULT_MODEL_REVISION,  # passed through to EpochBasedTrainer.__init__
+            **kwargs):
+
+        register_util.register_parallel()  # adds the mmcv parallel wrappers to PARALLEL
+        register_util.register_part_mmcv_hooks_to_ms()  # adds mmcv / EasyCV lr updater hooks to HOOKS
+
+        super(EasyCVEpochBasedTrainer, self).__init__(
+            model=model,
+            cfg_file=cfg_file,
+            arg_parse_fn=arg_parse_fn,
+            preprocessor=preprocessor,
+            optimizers=optimizers,
+            model_revision=model_revision,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            **kwargs)
+
+        # reset data_collator: use mmcv's collate with the per-GPU batch sizes from cfg
+        from mmcv.parallel import collate
+
+        self.train_data_collator = partial(
+            collate,
+            samples_per_gpu=self.cfg.train.dataloader.batch_size_per_gpu)
+        self.eval_data_collator = partial(
+            collate,
+            samples_per_gpu=self.cfg.evaluation.dataloader.batch_size_per_gpu)
+
+        # Register easycv hooks dynamically. If the hook already exists in modelscope,
+        # the hook in modelscope will be used, otherwise register easycv hook into ms.
+        # We must manually trigger lazy import to detect whether the hook is in modelscope.
+        # TODO: use ast index to detect whether the hook is in modelscope
+        for h_i in self.cfg.train.get('hooks', []):
+            sig = ('HOOKS', default_group, h_i['type'])
+            LazyImportModule.import_module(sig)
+            if h_i['type'] not in HOOKS._modules[default_group]:
+                if h_i['type'] in [
+                        'TensorboardLoggerHookV2', 'WandbLoggerHookV2'
+                ]:
+                    raise ValueError(
+                        'Not support hook %s now, we will support it in the future!'
+                        % h_i['type'])
+                register_util.register_hook_to_ms(h_i['type'], self.logger)
+
+        # reset parallel: when not distributed, wrap the model in MMDataParallel
+        if not self._dist:
+            assert not is_parallel(
+                self.model
+            ), 'Not support model wrapped by custom parallel if not in distributed mode!'
+            dp_cfg = dict(
+                type='MMDataParallel',
+                module=self.model,
+                device_ids=[torch.cuda.current_device()])  # NOTE(review): queries CUDA here - confirm CPU-only is unsupported
+            self.model = build_parallel(dp_cfg)
+
+    def create_optimizer_and_scheduler(self):
+        """Build the optimizer from cfg if none was given and turn the lr scheduler cfg into an mmcv
+        ``*LrUpdaterHook`` cfg. Returns (optimizer, lr_scheduler, optim_options, lr_options)."""
+        optimizer, lr_scheduler = self.optimizers
+        if optimizer is None:
+            optimizer_cfg = self.cfg.train.get('optimizer', None)
+        else:
+            optimizer_cfg = None
+
+        optim_options = {}
+        if optimizer_cfg is not None:
+            optim_options = optimizer_cfg.pop('options', {})  # NOTE(review): pop() mutates optimizer_cfg in place
+            from easycv.apis.train import build_optimizer
+            optimizer = build_optimizer(self.model, optimizer_cfg)
+
+        if lr_scheduler is None:
+            lr_scheduler_cfg = self.cfg.train.get('lr_scheduler', None)
+        else:
+            lr_scheduler_cfg = None
+
+        lr_options = {}
+        # Adapt to mmcv lr scheduler hook.
+        # Please refer to: https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py
+        if lr_scheduler_cfg is not None:
+            assert optimizer is not None
+            lr_options = lr_scheduler_cfg.pop('options', {})
+            assert 'policy' in lr_scheduler_cfg
+            policy_type = lr_scheduler_cfg.pop('policy')
+            if policy_type == policy_type.lower():  # all-lowercase policy name, e.g. 'step' -> 'Step'
+                policy_type = policy_type.title()
+            hook_type = policy_type + 'LrUpdaterHook'
+            lr_scheduler_cfg['type'] = hook_type
+
+            self.cfg.train.lr_scheduler_hook = lr_scheduler_cfg  # stored as a hook cfg; self.lr_scheduler is not built
+
+        self.optimizer = optimizer
+        self.lr_scheduler = lr_scheduler
+
+        return self.optimizer, self.lr_scheduler, optim_options, lr_options
+
+    def to_parallel(self, model) -> Union[nn.Module, TorchModel]:  # cfg.parallel if set, else MMDistributedDataParallel
+        if self.cfg.get('parallel', None) is not None:
+            self.cfg.parallel.update(
+                dict(module=model, device_ids=[torch.cuda.current_device()]))
+            return build_parallel(self.cfg.parallel)
+
+        dp_cfg = dict(
+            type='MMDistributedDataParallel',
+            module=model,
+            device_ids=[torch.cuda.current_device()])
+
+        return build_parallel(dp_cfg)
diff --git a/modelscope/trainers/easycv/utils/__init__.py b/modelscope/trainers/easycv/utils/__init__.py
new file mode 100644
index 00000000..23cfa36a
--- /dev/null
+++ b/modelscope/trainers/easycv/utils/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .hooks import AddLrLogHook
+    from .metric import EasyCVMetric
+
+else:
+    _import_structure = {'hooks': ['AddLrLogHook'], 'metric': ['EasyCVMetric']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/trainers/easycv/utils/hooks.py b/modelscope/trainers/easycv/utils/hooks.py
new file mode 100644
index 00000000..62bc6d1e
--- /dev/null
+++ b/modelscope/trainers/easycv/utils/hooks.py
@@ -0,0 +1,29 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from modelscope.trainers.hooks import HOOKS, Priority
+from modelscope.trainers.hooks.lr_scheduler_hook import LrSchedulerHook
+from modelscope.utils.constant import LogKeys
+
+
+@HOOKS.register_module(module_name='AddLrLogHook')
+class AddLrLogHook(LrSchedulerHook):
+    """Writes the lr into the log buffer for EasyCV trainers running on ModelScope.
+    EasyCV logs the lr in its own trainer, while ModelScope logs it in its lr scheduler
+    hook. EasyCV uses the mmcv lr scheduler hooks, which do not log the lr at all.
+    It will be deleted in the future.
+    """
+    PRIORITY = Priority.NORMAL
+
+    def __init__(self):
+        """Takes no arguments and does not call ``LrSchedulerHook.__init__``."""
+
+    def before_run(self, trainer):
+        """Does nothing."""
+
+    def before_train_iter(self, trainer):
+        trainer.log_buffer.output[LogKeys.LR] = self._get_log_lr(trainer)
+
+    def before_train_epoch(self, trainer):
+        trainer.log_buffer.output[LogKeys.LR] = self._get_log_lr(trainer)
+
+    def after_train_epoch(self, trainer):
+        """Does nothing."""
diff --git a/modelscope/trainers/easycv/utils/metric.py b/modelscope/trainers/easycv/utils/metric.py
new file mode 100644
index 00000000..53937b67
--- /dev/null
+++ b/modelscope/trainers/easycv/utils/metric.py
@@ -0,0 +1,52 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import itertools
+from typing import Dict
+
+import numpy as np
+import torch
+
+from modelscope.metrics.base import Metric
+from modelscope.metrics.builder import METRICS
+
+
+@METRICS.register_module(module_name='EasyCVMetric')
+class EasyCVMetric(Metric):
+    """ModelScope ``Metric`` adapter that gathers batch predictions and scores
+    them with ``trainer.eval_dataset.evaluate`` and the EasyCV evaluators."""
+
+    def __init__(self, trainer=None, evaluators=None, *args, **kwargs):
+        """Build the evaluators via EasyCV ``build_evaluator(evaluators)``."""
+        from easycv.core.evaluation.builder import build_evaluator
+
+        self.trainer = trainer
+        self.evaluators = build_evaluator(evaluators)
+        self.preds = []
+        self.grountruths = []  # (sic) never filled in this class; name kept
+
+    def add(self, outputs: Dict, inputs: Dict):
+        """Store one batch of predictions; ``inputs`` is not used."""
+        self.preds.append(outputs)
+
+    def evaluate(self):
+        """Merge the stored batches per key and evaluate them on the dataset.
+
+        Tensors are concatenated on dim 0; lists/ndarrays are chained into a list.
+        Raises ``ValueError`` for any other value type.
+        """
+        results = {}
+        for batch in self.preds:
+            for k, v in batch.items():
+                results.setdefault(k, []).append(v)
+
+        for k, v in results.items():
+            if isinstance(v[0], torch.Tensor):
+                results[k] = torch.cat(v, 0)
+            elif isinstance(v[0], (list, np.ndarray)):
+                results[k] = list(itertools.chain.from_iterable(v))
+            else:
+                # report the offending type, not the raw value
+                raise ValueError(
+                    'value of batch prediction dict should only be tensor '
+                    f'or list, {k} type is {type(v[0])}')
+
+        return self.trainer.eval_dataset.evaluate(results, self.evaluators)
diff --git a/modelscope/trainers/easycv/utils/register_util.py b/modelscope/trainers/easycv/utils/register_util.py
new file mode 100644
index 00000000..04bf719b
--- /dev/null
+++ b/modelscope/trainers/easycv/utils/register_util.py
@@ -0,0 +1,97 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import inspect
+import logging
+
+from modelscope.trainers.hooks import HOOKS
+from modelscope.trainers.parallel.builder import PARALLEL
+from modelscope.utils.registry import default_group
+
+
+class _RegisterManager:
+    """Records which names were registered per registry and group.
+
+    ``registries`` maps ``module.name`` -> group key -> list of names.
+    """
+
+    def __init__(self):
+        self.registries = {}
+
+    def add(self, module, name, group_key=default_group):
+        """Record ``name`` under ``module.name`` and ``group_key``."""
+        groups = self.registries.setdefault(module.name, {})
+        groups.setdefault(group_key, []).append(name)
+
+    def exists(self, module, name, group_key=default_group):
+        """Return True if ``name`` was recorded for this module and group."""
+        groups = self.registries.get(module.name, None)
+        if groups is None:
+            return False
+        names = groups.get(group_key, None)
+        # a missing group means nothing was recorded for it
+        return names is not None and name in names
+
+
+_dynamic_register = _RegisterManager()
+
+
+def register_parallel():
+    """Register mmcv's two data-parallel wrappers in ``PARALLEL`` once."""
+    from mmcv.parallel import MMDistributedDataParallel, MMDataParallel
+
+    wrappers = (
+        ('MMDistributedDataParallel', MMDistributedDataParallel),
+        ('MMDataParallel', MMDataParallel),
+    )
+    for name, wrapper in wrappers:
+        if _dynamic_register.exists(PARALLEL, name):
+            continue
+        _dynamic_register.add(PARALLEL, name)
+        PARALLEL.register_module(module_name=name, module_cls=wrapper)
+
+
+def register_hook_to_ms(hook_name, logger=None):
+    """Register the EasyCV hook ``hook_name`` to ModelScope ``HOOKS``, once.
+    Raises ``ValueError`` if ``hook_name`` is not an EasyCV hook.
+    """
+    from easycv.hooks import HOOKS as _EV_HOOKS
+
+    easycv_hooks = _EV_HOOKS._module_dict
+    if hook_name not in easycv_hooks:
+        raise ValueError(
+            f'Not found hook "{hook_name}" in EasyCV hook registries!')
+    if not _dynamic_register.exists(HOOKS, hook_name):
+        _dynamic_register.add(HOOKS, hook_name)
+        HOOKS.register_module(
+            module_name=hook_name, module_cls=easycv_hooks[hook_name])
+        # without a logger, fall back to the ``logging`` module functions
+        reporter = logging if logger is None else logger
+        reporter.info(f'Register hook "{hook_name}" to modelscope hooks.')
+
+
+def register_part_mmcv_hooks_to_ms():
+    """Register the lr scheduler hooks of mmcv and EasyCV to ModelScope.
+
+    Candidates are the members of mmcv's ``lr_updater`` module plus two EasyCV
+    hooks; only names found in the mmcv hook registry are registered, once each.
+    Please refer to:
+        EasyCV: https://github.com/alibaba/EasyCV/blob/master/easycv/hooks/lr_update_hook.py
+        mmcv: https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py
+    """
+    from mmcv.runner.hooks import lr_updater
+    from mmcv.runner.hooks import HOOKS as _MMCV_HOOKS
+    from easycv.hooks import StepFixCosineAnnealingLrUpdaterHook, YOLOXLrUpdaterHook
+
+    candidates = inspect.getmembers(lr_updater) + [
+        ('StepFixCosineAnnealingLrUpdaterHook',
+         StepFixCosineAnnealingLrUpdaterHook),
+        ('YOLOXLrUpdaterHook', YOLOXLrUpdaterHook),
+    ]
+
+    # members that are not registered mmcv hooks are skipped
+    for hook_name, hook_cls in candidates:
+        if hook_name not in _MMCV_HOOKS._module_dict:
+            continue
+        if _dynamic_register.exists(HOOKS, hook_name):
+            continue
+        _dynamic_register.add(HOOKS, hook_name)
+        HOOKS.register_module(module_name=hook_name, module_cls=hook_cls)
diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py
index fc0281a1..220929b8 100644
--- a/modelscope/trainers/hooks/checkpoint_hook.py
+++ b/modelscope/trainers/hooks/checkpoint_hook.py
@@ -1,12 +1,16 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
+import random
+
+import numpy as np
+import torch
 
 from modelscope import __version__
 from modelscope.metainfo import Hooks
-from modelscope.utils.checkpoint import save_checkpoint
-from modelscope.utils.constant import LogKeys
+from modelscope.utils.checkpoint import load_checkpoint, save_checkpoint
+from modelscope.utils.constant import LogKeys, ModelFile
 from modelscope.utils.logger import get_logger
-from modelscope.utils.torch_utils import is_master
+from modelscope.utils.torch_utils import get_dist_info, is_master
 from .builder import HOOKS
 from .hook import Hook
 from .priority import Priority
@@ -23,21 +27,26 @@ class CheckpointHook(Hook):
         save_optimizer (bool): Whether to save optimizer state dict.  Default: True.
         save_dir (str): The directory to save checkpoints. If is None, use `trainer.work_dir`
         save_last (bool): Whether to save the last checkpoint. Default: True.
+        checkpoint_file (str): The checkpoint file to be loaded.
     """
 
-    PRIORITY = Priority.NORMAL
+    PRIORITY = Priority.LOW
 
     def __init__(self,
                  interval=0,
                  by_epoch=True,
                  save_optimizer=True,
                  save_dir=None,
-                 save_last=True):
+                 save_last=True,
+                 checkpoint_file=None):
         self.interval = interval
         self.by_epoch = by_epoch
         self.save_optimizer = save_optimizer
         self.save_dir = save_dir
+        self.checkpoint_file = checkpoint_file
         self.save_last = save_last
+        self.rng_state = None
+        self.need_load_rng_state = False
 
     def before_run(self, trainer):
         if not self.save_dir:
@@ -54,6 +63,34 @@ class CheckpointHook(Hook):
         if is_master():
             self.logger.info(f'Checkpoints will be saved to {self.save_dir}')
 
+        if self.checkpoint_file is not None and os.path.isfile(
+                self.checkpoint_file):
+            meta = self.load_checkpoint(self.checkpoint_file, trainer)
+            self.rng_state = meta.get('rng_state')
+            self.need_load_rng_state = True
+
+    def before_train_epoch(self, trainer):
+        """Restore checkpointed RNG state once, then snapshot the current one."""
+        if self.need_load_rng_state:
+            if self.rng_state is not None:
+                random.setstate(self.rng_state['random'])
+                np.random.set_state(self.rng_state['numpy'])
+                torch.random.set_rng_state(self.rng_state['cpu'])
+                if torch.cuda.is_available():
+                    torch.cuda.random.set_rng_state_all(self.rng_state['cuda'])
+            else:
+                self.logger.warn(
+                    'Random state cannot be found in checkpoint file, '
+                    'this may cause a random data order or model initialization.')
+            self.need_load_rng_state = False  # one-shot: never replay a later snapshot
+
+        self.rng_state = {
+            'random': random.getstate(),
+            'numpy': np.random.get_state(),
+            'cpu': torch.random.get_rng_state(),
+            'cuda': torch.cuda.get_rng_state_all(),
+        }
+
     def after_train_epoch(self, trainer):
         if not self.by_epoch:
             return
@@ -64,6 +101,39 @@ class CheckpointHook(Hook):
                     f'Saving checkpoint at {trainer.epoch + 1} epoch')
                 self._save_checkpoint(trainer)
 
+    @classmethod
+    def load_checkpoint(cls, filename, trainer):
+        from modelscope.trainers.parallel.utils import is_parallel
+        if is_parallel(trainer.model):
+            model = trainer.model.module
+        else:
+            model = trainer.model
+        meta = load_checkpoint(filename, model, trainer.optimizer,
+                               trainer.lr_scheduler)
+        trainer._epoch = meta.get('epoch', trainer._epoch)
+        trainer._iter = meta.get('iter', trainer._iter)
+        trainer._inner_iter = meta.get('inner_iter', trainer._inner_iter)
+
+        for i, hook in enumerate(trainer.hooks):
+            # hook: Hook
+            key = f'{hook.__class__}-{i}'
+            if key in meta and hasattr(hook, 'load_state_dict'):
+                hook.load_state_dict(meta[key])
+            else:
+                trainer.logger.warn(
+                    f'The state_dict of hook {hook.__class__} at index {i} is not found in the checkpoint file.'
+                )
+
+        version = meta.get('modelscope')
+        if version != __version__:
+            trainer.logger.warn(
+                f'The modelscope version of loaded checkpoint does not match the runtime version. '
+                f'The saved version: {version}, runtime version: {__version__}'
+            )
+        trainer.logger.warn(
+            f'Checkpoint {filename} saving time: {meta.get("time")}')
+        return meta
+
     def _save_checkpoint(self, trainer):
         if self.by_epoch:
             cur_save_name = os.path.join(
@@ -72,7 +142,43 @@ class CheckpointHook(Hook):
             cur_save_name = os.path.join(
                 self.save_dir, f'{LogKeys.ITER}_{trainer.iter + 1}.pth')
 
-        save_checkpoint(trainer.model, cur_save_name, trainer.optimizer)
+        meta = {
+            'epoch': trainer.epoch,
+            'iter': trainer.iter + 1,
+            'inner_iter': trainer.inner_iter + 1,
+            'rng_state': self.rng_state,
+        }
+        for i, hook in enumerate(trainer.hooks):
+            if hasattr(hook, 'state_dict'):
+                meta[f'{hook.__class__}-{i}'] = hook.state_dict()
+
+        save_checkpoint(
+            trainer.model,
+            cur_save_name,
+            trainer.optimizer,
+            trainer.lr_scheduler,
+            meta=meta)
+        if (self.is_last_epoch(trainer)
+                and self.by_epoch) or (self.is_last_iter(trainer)
+                                       and not self.by_epoch):
+            self._save_pretrained(trainer)
+
+    def _save_pretrained(self, trainer):
+        output_dir = os.path.join(self.save_dir, ModelFile.TRAIN_OUTPUT_DIR)
+        from modelscope.trainers.parallel.utils import is_parallel
+
+        if is_parallel(trainer.model):
+            model = trainer.model.module
+        else:
+            model = trainer.model
+
+        if hasattr(model, 'save_pretrained'):
+            model.save_pretrained(
+                output_dir,
+                ModelFile.TORCH_MODEL_BIN_FILE,
+                save_function=save_checkpoint,
+                config=trainer.cfg.to_dict(),
+                with_meta=False)
 
     def after_train_iter(self, trainer):
         if self.by_epoch:
@@ -112,7 +218,7 @@ class BestCkptSaverHook(CheckpointHook):
         save_dir (str): Output directory to save best checkpoint.
     """
 
-    PRIORITY = Priority.NORMAL
+    PRIORITY = Priority.LOW
     rule_map = {'max': lambda x, y: x > y, 'min': lambda x, y: x < y}
 
     def __init__(self,
@@ -120,9 +226,12 @@ class BestCkptSaverHook(CheckpointHook):
                  rule='max',
                  by_epoch=True,
                  save_optimizer=True,
-                 save_dir=None):
+                 save_dir=None,
+                 save_file_name=None,
+                 interval=0):
         assert rule in ['max', 'min'], 'Only support "max" or "min" rule now.'
         super().__init__(
+            interval=interval,
             by_epoch=by_epoch,
             save_optimizer=save_optimizer,
             save_dir=save_dir,
@@ -131,6 +240,7 @@ class BestCkptSaverHook(CheckpointHook):
         self.rule = rule
         self._best_metric = None
         self._best_ckpt_file = None
+        self.save_file_name = save_file_name
 
     def _should_save(self, trainer):
         return self._is_best_metric(trainer.metric_values)
@@ -154,15 +264,44 @@ class BestCkptSaverHook(CheckpointHook):
         return False
 
     def _save_checkpoint(self, trainer):
-        if self.by_epoch:
-            cur_save_name = os.path.join(
-                self.save_dir,
-                f'best_{LogKeys.EPOCH}{trainer.epoch + 1}_{self.metric_key}{self._best_metric}.pth'
-            )
-        else:
-            cur_save_name = os.path.join(
-                self.save_dir,
-                f'best_{LogKeys.ITER}{trainer.iter + 1}_{self.metric_key}{self._best_metric}.pth'
-            )
-        save_checkpoint(trainer.model, cur_save_name, trainer.optimizer)
+        cur_save_name = self.save_file_name
+        if cur_save_name is None:
+            if self.by_epoch:
+                cur_save_name = os.path.join(
+                    self.save_dir,
+                    f'best_{LogKeys.EPOCH}{trainer.epoch + 1}_{self.metric_key}{self._best_metric}.pth'
+                )
+            else:
+                cur_save_name = os.path.join(
+                    self.save_dir,
+                    f'best_{LogKeys.ITER}{trainer.iter + 1}_{self.metric_key}{self._best_metric}.pth'
+                )
+
+        meta = {
+            'epoch': trainer.epoch,
+            'iter': trainer.iter + 1,
+            'inner_iter': trainer.inner_iter + 1,
+            'rng_state': self.rng_state,
+        }
+        for i, hook in enumerate(trainer.hooks):
+            if hasattr(hook, 'state_dict'):  # same guard as CheckpointHook
+                meta[f'{hook.__class__}-{i}'] = hook.state_dict()
+        if os.path.isfile(cur_save_name):
+            os.remove(cur_save_name)
+        save_checkpoint(trainer.model, cur_save_name, trainer.optimizer,
+                        trainer.lr_scheduler, meta)
         self._best_ckpt_file = cur_save_name
+        self._save_pretrained(trainer)
+
+    def state_dict(self):
+        return {
+            'best_metric': self._best_metric,
+        }
+
+    def load_state_dict(self, state_dict):
+        if state_dict is not None and len(state_dict) > 0:
+            self._best_metric = state_dict.get('best_metric')
+        else:
+            self.logger.warn(
+                'The state_dict is not available, the best metric value will be affected.'
+            )
diff --git a/modelscope/trainers/hooks/hook.py b/modelscope/trainers/hooks/hook.py
index 75cc226c..d3805be8 100644
--- a/modelscope/trainers/hooks/hook.py
+++ b/modelscope/trainers/hooks/hook.py
@@ -199,14 +199,14 @@ class Hook:
         Whether to reach the last epoch
         Returns: bool
         """
-        return trainer.epoch + 1 == trainer._max_epochs
+        return trainer.epoch + 1 == trainer.max_epochs
 
     def is_last_iter(self, trainer):
         """
         Whether to reach the last iteration in the entire training process
         Returns: bool
         """
-        return trainer.iter + 1 == trainer._max_iters
+        return trainer.iter + 1 == trainer.max_iters
 
     def get_triggered_stages(self):
         trigger_stages = set()
@@ -215,3 +215,9 @@ class Hook:
                 trigger_stages.add(stage)
 
         return [stage for stage in Hook.stages if stage in trigger_stages]
+
+    def state_dict(self):
+        return {}
+
+    def load_state_dict(self, state_dict):
+        pass
diff --git a/modelscope/trainers/hooks/logger/base.py b/modelscope/trainers/hooks/logger/base.py
index e1da251f..684c4a8c 100644
--- a/modelscope/trainers/hooks/logger/base.py
+++ b/modelscope/trainers/hooks/logger/base.py
@@ -60,6 +60,18 @@ class LoggerHook(Hook):
         else:
             return False
 
+    def fetch_tensor(self, trainer, n=0):
+        """Fetch latest n values or all values, process tensor type, convert to numpy for dump logs."""
+        assert n >= 0
+        for key in trainer.log_buffer.val_history:
+            values = trainer.log_buffer.val_history[key][-n:]
+
+            for i, v in enumerate(values):
+                if isinstance(v, torch.Tensor):
+                    values[i] = v.clone().detach().cpu().numpy()
+
+            trainer.log_buffer.val_history[key][-n:] = values
+
     def get_epoch(self, trainer):
         if trainer.mode in [ModeKeys.TRAIN, ModeKeys.EVAL]:
             epoch = trainer.epoch + 1
@@ -88,11 +100,14 @@ class LoggerHook(Hook):
 
     def after_train_iter(self, trainer):
         if self.by_epoch and self.every_n_inner_iters(trainer, self.interval):
+            self.fetch_tensor(trainer, self.interval)
             trainer.log_buffer.average(self.interval)
         elif not self.by_epoch and self.every_n_iters(trainer, self.interval):
+            self.fetch_tensor(trainer, self.interval)
             trainer.log_buffer.average(self.interval)
         elif self.end_of_epoch(trainer) and not self.ignore_last:
             # not precise but more stable
+            self.fetch_tensor(trainer, self.interval)
             trainer.log_buffer.average(self.interval)
 
         if trainer.log_buffer.ready:
@@ -107,6 +122,7 @@ class LoggerHook(Hook):
                 trainer.log_buffer.clear_output()
 
     def after_val_epoch(self, trainer):
+        self.fetch_tensor(trainer)
         trainer.log_buffer.average()
         self.log(trainer)
         if self.reset_flag:
diff --git a/modelscope/trainers/hooks/optimizer/base.py b/modelscope/trainers/hooks/optimizer/base.py
index dffad6ea..8c61dfdb 100644
--- a/modelscope/trainers/hooks/optimizer/base.py
+++ b/modelscope/trainers/hooks/optimizer/base.py
@@ -4,6 +4,7 @@ import logging
 from torch.nn.utils import clip_grad
 
 from modelscope.metainfo import Hooks
+from modelscope.outputs import OutputKeys
 from modelscope.trainers.hooks.builder import HOOKS
 from modelscope.trainers.hooks.hook import Hook
 from modelscope.trainers.hooks.priority import Priority
@@ -27,7 +28,7 @@ class OptimizerHook(Hook):
     def __init__(self,
                  cumulative_iters=1,
                  grad_clip=None,
-                 loss_keys='loss') -> None:
+                 loss_keys=OutputKeys.LOSS) -> None:
         if isinstance(loss_keys, str):
             loss_keys = [loss_keys]
         assert isinstance(loss_keys, (tuple, list))
diff --git a/modelscope/trainers/lrscheduler/warmup/base.py b/modelscope/trainers/lrscheduler/warmup/base.py
index 81497817..4b066281 100644
--- a/modelscope/trainers/lrscheduler/warmup/base.py
+++ b/modelscope/trainers/lrscheduler/warmup/base.py
@@ -28,10 +28,10 @@ class BaseWarmup(_LRScheduler):
         return self.base_scheduler.get_lr()
 
     def state_dict(self):
-        self.base_scheduler.state_dict()
+        return self.base_scheduler.state_dict()
 
     def load_state_dict(self, state_dict):
-        self.base_scheduler.load_state_dict(state_dict)
+        return self.base_scheduler.load_state_dict(state_dict)
 
     def scale(self):
         """Scale the learning rates.
diff --git a/modelscope/trainers/nlp/__init__.py b/modelscope/trainers/nlp/__init__.py
index 7ab8fd70..001cfefc 100644
--- a/modelscope/trainers/nlp/__init__.py
+++ b/modelscope/trainers/nlp/__init__.py
@@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule
 if TYPE_CHECKING:
     from .sequence_classification_trainer import SequenceClassificationTrainer
     from .csanmt_translation_trainer import CsanmtTranslationTrainer
+    from .passage_ranking_trainer import PassageRankingTrainer
 else:
     _import_structure = {
         'sequence_classification_trainer': ['SequenceClassificationTrainer'],
         'csanmt_translation_trainer': ['CsanmtTranslationTrainer'],
+        'passage_ranking_trainer': ['PassageRankingTrainer']
     }
 
     import sys
diff --git a/modelscope/trainers/nlp/csanmt_translation_trainer.py b/modelscope/trainers/nlp/csanmt_translation_trainer.py
index 067c1d83..c93599c7 100644
--- a/modelscope/trainers/nlp/csanmt_translation_trainer.py
+++ b/modelscope/trainers/nlp/csanmt_translation_trainer.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os.path as osp
 from typing import Dict, Optional
 
@@ -241,8 +243,10 @@ def input_fn(src_file,
     trg_dataset = tf.data.TextLineDataset(trg_file)
     src_trg_dataset = tf.data.Dataset.zip((src_dataset, trg_dataset))
     src_trg_dataset = src_trg_dataset.map(
-        lambda src, trg:
-        (tf.string_split([src]).values, tf.string_split([trg]).values),
+        lambda src, trg: (tf.string_split([src]), tf.string_split([trg])),
+        num_parallel_calls=10).prefetch(1000000)
+    src_trg_dataset = src_trg_dataset.map(
+        lambda src, trg: (src.values, trg.values),
         num_parallel_calls=10).prefetch(1000000)
     src_trg_dataset = src_trg_dataset.map(
         lambda src, trg: (src_vocab.lookup(src), trg_vocab.lookup(trg)),
diff --git a/modelscope/trainers/nlp/passage_ranking_trainer.py b/modelscope/trainers/nlp/passage_ranking_trainer.py
new file mode 100644
index 00000000..711fd0c4
--- /dev/null
+++ b/modelscope/trainers/nlp/passage_ranking_trainer.py
@@ -0,0 +1,199 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import time
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from torch import nn
+from torch.utils.data import DataLoader, Dataset
+
+from modelscope.metainfo import Trainers
+from modelscope.models.base import Model, TorchModel
+from modelscope.msdatasets.ms_dataset import MsDataset
+from modelscope.preprocessors.base import Preprocessor
+from modelscope.trainers.base import BaseTrainer
+from modelscope.trainers.builder import TRAINERS
+from modelscope.trainers.nlp_trainer import NlpEpochBasedTrainer
+from modelscope.utils.constant import DEFAULT_MODEL_REVISION
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@dataclass
+class GroupCollator():
+    """
+    Wrapper that does conversion from List[Tuple[encode_qry, encode_psg]] to List[qry], List[psg]
+    and pass batch separately to the actual collator.
+    Abstract out data detail for the model.
+    """
+
+    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
+        if isinstance(features[0], list):
+            features = sum(features, [])
+        keys = features[0].keys()
+        batch = {k: list() for k in keys}
+        for ele in features:
+            for k, v in ele.items():
+                batch[k].append(v)
+        batch = {k: torch.cat(v, dim=0) for k, v in batch.items()}
+        return batch
+
+
+@TRAINERS.register_module(module_name=Trainers.nlp_passage_ranking_trainer)
+class PassageRankingTrainer(NlpEpochBasedTrainer):
+
+    def __init__(
+            self,
+            model: Optional[Union[TorchModel, nn.Module, str]] = None,
+            cfg_file: Optional[str] = None,
+            cfg_modify_fn: Optional[Callable] = None,
+            arg_parse_fn: Optional[Callable] = None,
+            data_collator: Optional[Callable] = None,
+            train_dataset: Optional[Union[MsDataset, Dataset]] = None,
+            eval_dataset: Optional[Union[MsDataset, Dataset]] = None,
+            preprocessor: Optional[Preprocessor] = None,
+            optimizers: Tuple[torch.optim.Optimizer,
+                              torch.optim.lr_scheduler._LRScheduler] = (None,
+                                                                        None),
+            model_revision: Optional[str] = DEFAULT_MODEL_REVISION,
+            **kwargs):
+
+        if data_collator is None:
+            data_collator = GroupCollator()
+
+        super().__init__(
+            model=model,
+            cfg_file=cfg_file,
+            cfg_modify_fn=cfg_modify_fn,
+            arg_parse_fn=arg_parse_fn,
+            data_collator=data_collator,
+            preprocessor=preprocessor,
+            optimizers=optimizers,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            model_revision=model_revision,
+            **kwargs)
+
+    def compute_mrr(self, result, k=10):
+        mrr = 0
+        for res in result.values():
+            sorted_res = sorted(res, key=lambda x: x[0], reverse=True)
+            ar = 0
+            for index, ele in enumerate(sorted_res[:k]):
+                if str(ele[1]) == '1':
+                    ar = 1.0 / (index + 1)
+                    break
+            mrr += ar
+        return mrr / len(result)
+
+    def compute_ndcg(self, result, k=10):
+        """Return NDCG@k averaged over the (score, label) lists in `result`."""
+        from sklearn.metrics import ndcg_score
+        ndcg = 0
+        for res in result.values():
+            sorted_res = sorted(res, key=lambda x: x[0], reverse=True)
+            labels = np.array([[ele[1] for ele in sorted_res]])
+            scores = np.array([[ele[0] for ele in sorted_res]])
+            ndcg += float(ndcg_score(labels, scores, k=k))
+        return ndcg / len(result)
+
+    def evaluate(self,
+                 checkpoint_path: Optional[str] = None,
+                 *args,
+                 **kwargs) -> Dict[str, float]:
+        """evaluate a dataset
+
+        evaluate a dataset via a specific model from the `checkpoint_path` path, if the `checkpoint_path`
+        does not exist, read from the config file.
+
+        Args:
+            checkpoint_path (Optional[str], optional): the model path. Defaults to None.
+
+        Returns:
+            Dict[str, float]: the results about the evaluation
+            Example:
+            {"accuracy": 0.5091743119266054, "f1": 0.673780487804878}
+        """
+        from modelscope.models.nlp import PassageRanking
+        # get the raw online dataset
+        self.eval_dataloader = self._build_dataloader_with_dataset(
+            self.eval_dataset,
+            **self.cfg.evaluation.get('dataloader', {}),
+            collate_fn=self.eval_data_collator)
+        # generate a standard dataloader
+        # generate a model
+        if checkpoint_path is not None:
+            model = PassageRanking.from_pretrained(checkpoint_path)
+        else:
+            model = self.model
+
+        # copy from easynlp (start)
+        model.eval()
+        total_samples = 0
+
+        logits_list = list()
+        label_list = list()
+        qid_list = list()
+
+        total_spent_time = 0.0
+        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
+        model.to(device)
+        for _step, batch in enumerate(self.eval_dataloader):
+            try:
+                batch = {
+                    key:
+                    val.to(device) if isinstance(val, torch.Tensor) else val
+                    for key, val in batch.items()
+                }
+            except RuntimeError:
+                batch = {key: val for key, val in batch.items()}
+
+            infer_start_time = time.time()
+            with torch.no_grad():
+                label_ids = batch.pop('labels').detach().cpu().numpy()
+                qids = batch.pop('qid').detach().cpu().numpy()
+                outputs = model(batch)
+            infer_end_time = time.time()
+            total_spent_time += infer_end_time - infer_start_time
+            total_samples += self.eval_dataloader.batch_size
+
+            assert 'scores' in outputs
+            logits = outputs['scores']
+
+            label_list.extend(label_ids)
+            logits_list.extend(logits)
+            qid_list.extend(qids)
+
+        logger.info('Inference time = {:.2f}s, [{:.4f} ms / sample] '.format(
+            total_spent_time, total_spent_time * 1000 / total_samples))
+
+        rank_result = {}
+        for qid, score, label in zip(qid_list, logits_list, label_list):
+            if qid not in rank_result:
+                rank_result[qid] = []
+            rank_result[qid].append((score, label))
+
+        for qid in rank_result:
+            rank_result[qid] = sorted(rank_result[qid], key=lambda x: x[0])
+
+        eval_outputs = list()
+        for metric in self.metrics:
+            if metric.startswith('mrr'):
+                k = metric.split('@')[-1]
+                k = int(k)
+                mrr = self.compute_mrr(rank_result, k=k)
+                logger.info('{}: {}'.format(metric, mrr))
+                eval_outputs.append((metric, mrr))
+            elif metric.startswith('ndcg'):
+                k = metric.split('@')[-1]
+                k = int(k)
+                ndcg = self.compute_ndcg(rank_result, k=k)
+                logger.info('{}: {}'.format(metric, ndcg))
+                eval_outputs.append(('ndcg', ndcg))
+            else:
+                raise NotImplementedError('Metric %s not implemented' % metric)
+
+        return dict(eval_outputs)
diff --git a/modelscope/trainers/nlp/sequence_classification_trainer.py b/modelscope/trainers/nlp/sequence_classification_trainer.py
index 64fd59b4..ec46e037 100644
--- a/modelscope/trainers/nlp/sequence_classification_trainer.py
+++ b/modelscope/trainers/nlp/sequence_classification_trainer.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import time
 from typing import Dict, Optional, Tuple, Union
 
diff --git a/modelscope/trainers/nlp/space/dialog_intent_trainer.py b/modelscope/trainers/nlp/space/dialog_intent_trainer.py
new file mode 100644
index 00000000..2e59cd80
--- /dev/null
+++ b/modelscope/trainers/nlp/space/dialog_intent_trainer.py
@@ -0,0 +1,136 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+import time
+from typing import Callable, Dict, Optional, Tuple, Union
+
+import numpy as np
+
+from modelscope.metainfo import Trainers
+from modelscope.models.nlp.space.model.generator import SpaceGenerator
+from modelscope.models.nlp.space.model.model_base import SpaceModelBase
+from modelscope.preprocessors.space.data_loader import \
+    get_sequential_data_loader
+from modelscope.preprocessors.space.fields.intent_field import \
+    IntentBPETextField
+from modelscope.preprocessors.space.preprocess import intent_preprocess
+from modelscope.trainers.base import BaseTrainer
+from modelscope.trainers.builder import TRAINERS
+from modelscope.trainers.nlp.space.trainer.intent_trainer import IntentTrainer
+from modelscope.utils.config import Config
+from modelscope.utils.logger import get_logger
+
+PATH = None
+logger = get_logger(PATH)
+
+
+@TRAINERS.register_module(module_name=Trainers.dialog_intent_trainer)
+class DialogIntentTrainer(BaseTrainer):
+
+    def __init__(self,
+                 cfg_file: Optional[str] = None,
+                 cfg_modify_fn: Optional[Callable] = None,
+                 *args,
+                 **kwargs):
+        super().__init__(os.path.join(kwargs['model_dir'], kwargs['cfg_name']))
+
+        def to_tensor(array):
+            """
+            numpy array -> tensor
+            """
+            import torch
+            array = torch.tensor(array)
+            return array.cuda() if self.cfg.use_gpu else array
+
+        def setup_seed(seed):
+            import random
+            import torch
+            torch.manual_seed(seed)
+            torch.cuda.manual_seed_all(seed)
+            np.random.seed(seed)
+            random.seed(seed)
+            torch.backends.cudnn.deterministic = True
+
+        self.cfg_modify_fn = cfg_modify_fn
+        self.cfg = self.rebuild_config(self.cfg)
+
+        setup_seed(self.cfg.Trainer.seed)
+
+        # preprocess data
+        intent_preprocess(self.cfg.Model.init_checkpoint, self.cfg)
+        # set reader and evaluator
+        bpe = IntentBPETextField(self.cfg.Model.init_checkpoint, self.cfg)
+
+        self.cfg.Model.num_token_embeddings = bpe.vocab_size
+        self.cfg.Model.num_turn_embeddings = bpe.max_ctx_turn + 1
+        dataset_paths = [
+            os.path.join(self.cfg.Dataset.data_dir,
+                         self.cfg.Dataset.trigger_data)
+        ]
+        # set data and data status
+        collate_fn = bpe.collate_fn_multi_turn
+        self.train_label_loader = get_sequential_data_loader(
+            batch_size=self.cfg.Trainer.batch_size_label,
+            reader=bpe,
+            hparams=self.cfg,
+            data_paths=dataset_paths,
+            collate_fn=collate_fn,
+            data_type='train')
+        self.valid_label_loader = get_sequential_data_loader(
+            batch_size=self.cfg.Trainer.batch_size_label,
+            reader=bpe,
+            hparams=self.cfg,
+            data_paths=dataset_paths,
+            collate_fn=collate_fn,
+            data_type='valid')
+        self.test_label_loader = get_sequential_data_loader(
+            batch_size=self.cfg.Trainer.batch_size_label,
+            reader=bpe,
+            hparams=self.cfg,
+            data_paths=dataset_paths,
+            collate_fn=collate_fn,
+            data_type='test')
+
+        # set generator
+        generator = SpaceGenerator.create(self.cfg, reader=bpe)
+        # construct model
+        self.model = SpaceModelBase.create(
+            self.cfg.Model.init_checkpoint,
+            self.cfg,
+            reader=bpe,
+            generator=generator)
+
+        import torch
+
+        # multi-gpu
+        if self.cfg.Trainer.gpu > 1 and torch.cuda.device_count() > 1:
+            self.model = torch.nn.DataParallel(self.model)
+
+        # construct trainer
+        self.trainer = IntentTrainer(
+            self.model, to_tensor, self.cfg, reader=bpe)
+        num_batches = len(self.train_label_loader)
+        self.trainer.set_optimizers(num_training_steps_per_epoch=num_batches)
+        # load model, optimizer and lr_scheduler
+        self.trainer.load()
+
+    def rebuild_config(self, cfg: Config):
+        if self.cfg_modify_fn is not None:
+            return self.cfg_modify_fn(cfg)
+        return cfg
+
+    def train(self, *args, **kwargs):
+        logger.info('Train')
+
+        self.trainer.train(
+            train_label_iter=self.train_label_loader,
+            valid_label_iter=self.valid_label_loader)
+
+    def evaluate(self,
+                 checkpoint_path: Optional[str] = None,
+                 *args,
+                 **kwargs) -> Dict[str, float]:
+        logger.info('Evaluate')
+        self.trainer.infer(
+            data_iter=self.test_label_loader,
+            ex_data_iter=self.train_label_loader)
diff --git a/modelscope/trainers/nlp/space/dialog_modeling_trainer.py b/modelscope/trainers/nlp/space/dialog_modeling_trainer.py
new file mode 100644
index 00000000..726404d4
--- /dev/null
+++ b/modelscope/trainers/nlp/space/dialog_modeling_trainer.py
@@ -0,0 +1,132 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+import time
+from typing import Callable, Dict, Optional, Tuple, Union
+
+import numpy as np
+
+from modelscope.metainfo import Trainers
+from modelscope.models.nlp.space.model.generator import SpaceGenerator
+from modelscope.models.nlp.space.model.model_base import SpaceModelBase
+from modelscope.preprocessors.space.fields.gen_field import \
+    MultiWOZBPETextField
+from modelscope.trainers.base import BaseTrainer
+from modelscope.trainers.builder import TRAINERS
+from modelscope.trainers.nlp.space.eval import MultiWOZEvaluator
+from modelscope.trainers.nlp.space.trainer.gen_trainer import MultiWOZTrainer
+from modelscope.utils.config import Config, ModelFile
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def setup_seed(seed: int):
+    """Seed torch, numpy and random, and set cudnn.deterministic."""
+    import random
+    import torch
+    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all,
+                    np.random.seed, random.seed):
+        seed_fn(seed)
+    torch.backends.cudnn.deterministic = True
+
+
+@TRAINERS.register_module(module_name=Trainers.dialog_modeling_trainer)
+class DialogModelingTrainer(BaseTrainer):
+    """Wires a MultiWOZ reader, generator and model into a MultiWOZTrainer."""
+
+    def __init__(self,
+                 cfg_file: Optional[str] = None,
+                 cfg_modify_fn: Optional[Callable] = None,
+                 *args,
+                 **kwargs):
+        """Build reader, evaluator, generator and model.
+
+        kwargs must hold 'model_dir' and 'cfg_name' (joined into the config
+        path; cfg_file is not read). 'work_dir', if given, is the save dir.
+        """
+        super().__init__(os.path.join(kwargs['model_dir'], kwargs['cfg_name']))
+        self.cfg_modify_fn = cfg_modify_fn
+        self.cfg = self.rebuild_config(self.cfg)
+        setup_seed(self.cfg.Trainer.seed)
+
+        # set reader and evaluator
+        self.bpe = MultiWOZBPETextField(self.cfg, **kwargs)
+        self.cfg.Model.num_token_embeddings = self.bpe.vocab_size
+        self.cfg.Model.num_turn_embeddings = self.bpe.max_ctx_turn + 1
+        self.cfg.Trainer.save_dir = kwargs.get('work_dir',
+                                               './default_save_dir')
+
+        # set data and data status
+        self.train_data = self.bpe.get_batches('train')
+        self.dev_data = self.bpe.get_batches('dev')
+
+        self.evaluator = MultiWOZEvaluator(reader=self.bpe, **kwargs)
+        # set generator
+        self.generator = SpaceGenerator.create(self.cfg, reader=self.bpe)
+        self._load_model(**kwargs)
+
+    def _load_model(self, **kwargs):
+        """(Re)create self.model and self.trainer, then load trainer state.
+
+        Uses kwargs['model'] if present, else builds from kwargs['model_dir'].
+        """
+        import torch
+
+        def to_tensor(array):
+            """numpy array -> tensor, on GPU when enabled and available"""
+            array = torch.tensor(array)
+            return array.cuda(
+            ) if self.cfg.use_gpu and torch.cuda.is_available() else array
+
+        # construct model
+        if 'model' in kwargs:
+            self.model = kwargs['model']
+        else:
+            self.model = SpaceModelBase.create(
+                kwargs['model_dir'],
+                self.cfg,
+                reader=self.bpe,
+                generator=self.generator)
+
+        # multi-gpu
+        if self.cfg.Trainer.gpu > 1 and torch.cuda.device_count() > 1:
+            self.model = torch.nn.DataParallel(self.model)
+
+        # construct trainer
+        self.trainer = MultiWOZTrainer(
+            self.model,
+            to_tensor,
+            self.cfg,
+            reader=self.bpe,
+            evaluator=self.evaluator)
+        self.trainer.set_optimizers()
+        # load model, optimizer and lr_scheduler
+        self.trainer.load()
+
+    def rebuild_config(self, cfg: Config):
+        """Return cfg passed through cfg_modify_fn when one was supplied."""
+        if self.cfg_modify_fn is not None:
+            return self.cfg_modify_fn(cfg)
+        return cfg
+
+    def train(self, *args, **kwargs):
+        """Run trainer.train on the 'train' and 'dev' batches."""
+        logger.info('Train')
+        self.trainer.train(train_data=self.train_data, dev_data=self.dev_data)
+
+    def evaluate(self,
+                 checkpoint_path: Optional[str] = None,
+                 *args,
+                 **kwargs) -> Dict[str, float]:
+        """Reload from checkpoint_path's directory, then run test inference."""
+        logger.info('Evaluate')
+        if checkpoint_path is None:
+            raise ValueError('checkpoint_path is required for evaluate')
+        self.cfg.do_infer = True
+        # os.path.split handles a bare file name, unlike slicing at rfind('/')
+        checkpoint_dir, checkpoint_name = os.path.split(checkpoint_path)
+        assert checkpoint_name == ModelFile.TORCH_MODEL_BIN_FILE
+        kwargs['model_dir'] = checkpoint_dir or '.'
+        self._load_model(**kwargs)
+        self.trainer.infer(data_type='test')
diff --git a/modelscope/trainers/nlp/space/eval.py b/modelscope/trainers/nlp/space/eval.py
new file mode 100644
index 00000000..f315ff07
--- /dev/null
+++ b/modelscope/trainers/nlp/space/eval.py
@@ -0,0 +1,952 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright from https://github.com/thu-spmi/LABES
+# Copyright from https://github.com/TonyNemo/UBAR-MultiWOZ
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from collections import Counter
+
+import json
+import numpy as np
+from nltk.util import ngrams
+from sklearn.metrics import f1_score
+
+from modelscope.utils.nlp.space import ontology, utils
+from modelscope.utils.nlp.space.clean_dataset import clean_slot_values
+
+
+def similar(a, b):  # equal, substring, or same first/last word; blank-safe
+    return a == b or a in b or b in a or bool(a.split() and b.split()) and (
+        a.split()[0] == b.split()[0] or a.split()[-1] == b.split()[-1])
+
+
+def setsub(a, b):
+    """Tell whether every item of a is matched in b or is ignorable.
+
+    An item is matched when similar() holds against some item of b; an
+    unmatched item is ignorable when it contains one of the keywords below.
+    """
+    ignorable = ('temperature', 'week', 'est ', 'quick', 'reminder', 'near')
+
+    def matched(x):
+        # evaluate every pair (no short-circuit), as a plain nested loop would
+        return any([similar(x, y) for y in b])
+
+    # first collect everything in a that has no counterpart in b ...
+    unmatched = [x for x in a if not matched(x)]
+
+    # ... then fail on the first leftover that is not ignorable
+    for x in unmatched:
+        if not any(word in x for word in ignorable):
+            return False
+    return True
+
+
+def setsim(a, b):  # True when each collection "covers" the other via setsub
+    left, right = set(a), set(b)
+    return setsub(left, right) and setsub(right, left)
+
+
+def DA_evaluate(preds, labels):
+    """Return {'f1_micro': micro-averaged F1 of preds against labels}."""
+    y_pred, y_true = np.array(preds), np.array(labels)
+    averages = ['micro']
+
+    def f1_for(avg):
+        return f1_score(y_true=y_true, y_pred=y_pred, average=avg)
+
+    # one entry per averaging scheme, keyed 'f1_<scheme>'
+    return {'f1_{}'.format(avg): f1_for(avg) for avg in averages}
+
+
+class BLEUScorer(object):
+    """Corpus-level BLEU-4 calculator (GentScorer interface)."""
+
+    def __init__(self):
+        pass
+
+    def score(self, parallel_corpus):
+        """Return BLEU-4 * 100 for an iterable of (hyps, refs) string lists.
+
+        Returns 0.0 when the hypotheses contain no tokens at all, where the
+        brevity penalty would otherwise divide by zero.
+        """
+        # containers
+        count = [0, 0, 0, 0]
+        clip_count = [0, 0, 0, 0]
+        r = 0
+        c = 0
+        weights = [0.25, 0.25, 0.25, 0.25]
+
+        # accumulate ngram statistics
+        for hyps, refs in parallel_corpus:
+            hyps = [hyp.split() for hyp in hyps]
+            refs = [ref.split() for ref in refs]
+            for hyp in hyps:
+                for i in range(4):
+                    # accumulate ngram counts
+                    hypcnts = Counter(ngrams(hyp, i + 1))
+                    count[i] += sum(hypcnts.values())
+
+                    # clip each n-gram count by its max count in any reference
+                    max_counts = {}
+                    for ref in refs:
+                        refcnts = Counter(ngrams(ref, i + 1))
+                        for ng in hypcnts:
+                            max_counts[ng] = max(
+                                max_counts.get(ng, 0), refcnts[ng])
+                    clip_count[i] += sum(
+                        min(n, max_counts[ng]) for ng, n in hypcnts.items())
+
+                # accumulate r & c: r uses the reference closest in length
+                bestmatch = [1000, 1000]
+                for ref in refs:
+                    if bestmatch[0] == 0:
+                        break
+                    diff = abs(len(ref) - len(hyp))
+                    if diff < bestmatch[0]:
+                        bestmatch[0] = diff
+                        bestmatch[1] = len(ref)
+                r += bestmatch[1]
+                c += len(hyp)
+
+        if c == 0:  # nothing generated: BLEU is 0, and r / c is undefined
+            return 0.0
+        # computing bleu score
+        p0 = 1e-7
+        bp = 1 if c > r else math.exp(1 - float(r) / float(c))
+        p_ns = [float(clip_count[i]) / float(count[i] + p0) + p0
+                for i in range(4)]
+        s = math.fsum(
+            w * math.log(p_n) for w, p_n in zip(weights, p_ns) if p_n)
+        return bp * math.exp(s) * 100
+
+
+"""
+For the data preparation and evaluation on MultiWOZ2.0/2.1,
+we refer to the code of UBAR (https://github.com/TonyNemo/UBAR-MultiWOZ)
+"""
+
+
+class MultiWOZEvaluator(object):
+
+    def __init__(self, reader, **kwargs):
+        """Cache reader data and slot lists; needs kwargs['data_dir']."""
+        self.reader = reader
+        self.domains = ontology.all_domains
+        self.all_data = reader.data
+        self.test_data = reader.test
+        self.bleu_scorer = BLEUScorer()
+        # every informable slot, flattened to '<domain>-<slot>'
+        self.all_info_slot = [
+            dom + '-' + slot
+            for dom, slots in ontology.informable_slots.items()
+            for slot in slots
+        ]
+        # only evaluate these slots for dialog success
+        self.requestables = ['phone', 'address', 'postcode', 'reference', 'id']
+        self.db_dir = kwargs['data_dir']
+
+    def pack_dial(self, data):
+        """Group turns by turn['dial_id'], keeping their original order."""
+        grouped = {}
+        for item in data:
+            # setdefault creates the per-dialogue list on first sight
+            bucket = grouped.setdefault(item['dial_id'], [])
+            bucket.append(item)
+        return grouped
+
+    def validation_metric(self, data, fout=None):
+        """Return (bleu, success rate, match rate) for the generated turns."""
+        bleu = self.bleu_metric(data)
+        success, match, _counts, _dial_num = self.context_to_response_eval(
+            data, same_eval_as_cambridge=True, fout=fout)
+        return bleu, success, match
+
+    def bleu_metric(self, data, eval_dial_list=None):
+        """Corpus BLEU of row['resp_gen'] vs row['resp'] (0.0 on failure).
+
+        A truthy eval_dial_list keeps only rows with dial_id + '.json' in it.
+        """
+        def selected(row):
+            if not eval_dial_list:
+                return True
+            return row['dial_id'] + '.json' in eval_dial_list
+
+        pairs = [([row['resp_gen']], [row['resp']])
+                 for row in data if selected(row)]
+        if not pairs:
+            return 0.0
+        try:
+            return self.bleu_scorer.score(iter(pairs))
+        except Exception:
+            return 0.0
+
+    def context_to_response_eval(self,
+                                 data,
+                                 eval_dial_list=None,
+                                 same_eval_as_cambridge=False,
+                                 fout=None):
+        """Return (success %, match %, requestable counts, dialogue count)."""
+        dials = self.pack_dial(data)
+        counts = {}
+        for req in self.requestables:
+            counts[req + '_total'] = 0
+            counts[req + '_offer'] = 0
+        dial_num, successes, matches = 0, 0, 0
+
+        for dial_id in dials:
+            if eval_dial_list and dial_id + '.json' not in eval_dial_list:
+                continue
+            dial = dials[dial_id]
+            reqs = {}
+            goal = {}
+            if '.json' not in dial_id and '.json' in list(
+                    self.all_data.keys())[0]:
+                dial_id = dial_id + '.json'  # use the key style of all_data
+            for domain in ontology.all_domains:
+                if self.all_data[dial_id]['goal'].get(domain):
+                    true_goal = self.all_data[dial_id]['goal']
+                    goal = self._parseGoal(goal, true_goal, domain)
+            # requestable slots of the parsed goal, per domain
+            for domain in goal.keys():
+                reqs[domain] = goal[domain]['requestable']
+
+            success, match, stats, counts = \
+                self._evaluateGeneratedDialogue(dial, goal, reqs, counts,
+                                                same_eval_as_cambridge=same_eval_as_cambridge, fout=fout)
+
+            successes += success
+            matches += match
+            dial_num += 1
+        # the 1e-10 keeps the rates defined when no dialogue was evaluated
+        succ_rate = successes / (float(dial_num) + 1e-10) * 100
+        match_rate = matches / (float(dial_num) + 1e-10) * 100
+        return succ_rate, match_rate, counts, dial_num
+
+    def _evaluateGeneratedDialogue(self,
+                                   dialog,
+                                   goal,
+                                   real_requestables,
+                                   counts,
+                                   soft_acc=False,
+                                   same_eval_as_cambridge=False,
+                                   fout=None):
+        """Evaluate one generated dialogue against its user goal.
+            For the Inform (match) rate we check whether a suitable entity
+            was offered; for the Success rate we check the requestable slots.
+            Updates `counts` in place and, if `fout` is set, logs failures.
+            Returns (success, match, stats, counts)."""
+        # for computing corpus success
+        requestables = self.requestables
+
+        # CHECK IF MATCH HAPPENED
+        provided_requestables = {}
+        venue_offered = {}
+        domains_in_goal = []
+        log = []
+        bspans = {}
+
+        for domain in goal.keys():
+            venue_offered[domain] = []
+            provided_requestables[domain] = []
+            domains_in_goal.append(domain)
+
+        for t, turn in enumerate(dialog):
+            if t == 0:  # the first turn is skipped
+                continue
+            if fout is not None:
+                log.append({
+                    'turn_num': turn['turn_num'],
+                    'turn_domain': turn['dspn'],
+                    'user': turn['user'],
+                    'aspn': turn['aspn'],
+                    'aspn_gen': turn['aspn_gen'],
+                    'resp': turn['resp'],
+                    'resp_gen': turn['resp_gen'],
+                    'pointer': turn['pointer'],
+                })
+
+            sent_t = turn['resp_gen']
+
+            for domain in goal.keys():
+                # for computing success
+                if same_eval_as_cambridge:
+                    # [restaurant_name], [hotel_name] instead of [value_name]
+                    if self.reader.use_true_domain_for_ctr_eval:
+                        dom_pred = [d[1:-1] for d in turn['dspn'].split()]
+                    else:
+                        dom_pred = [d[1:-1] for d in turn['dspn_gen'].split()]
+
+                    if domain not in dom_pred:  # fail
+                        continue
+                if '[value_name]' in sent_t or '[value_id]' in sent_t:
+                    if domain in [
+                            'restaurant', 'hotel', 'attraction', 'train'
+                    ]:
+                        # HERE YOU CAN PUT YOUR BELIEF STATE ESTIMATION
+                        if not self.reader.use_true_curr_bspn and not self.reader.use_true_bspn_for_ctr_eval:
+                            bspn = turn['bspn_gen']
+                        else:
+                            bspn = turn['bspn']
+
+                        constraint_dict = self.reader.bspan_to_constraint_dict(
+                            bspn)
+                        if constraint_dict.get(domain):
+                            venues = self.reader.db.queryJsons(
+                                domain,
+                                constraint_dict[domain],
+                                return_name=True)
+                        else:
+                            venues = []
+
+                        if len(venue_offered[domain]) == 0 and venues:
+
+                            venue_offered[domain] = venues
+                            bspans[domain] = constraint_dict[domain]
+                        else:
+                            flag = False
+                            for ven in venues:
+                                if ven not in venue_offered[domain]:
+                                    flag = True
+                                    break
+                            if flag and venues:  # sometimes there are no results so sample won't work
+                                venue_offered[domain] = venues
+                                bspans[domain] = constraint_dict[domain]
+                    else:  # not limited so we can provide one
+                        venue_offered[domain] = '[value_name]'
+
+                # ATTENTION: assumption here - we didn't provide phone or address twice! etc
+                for requestable in requestables:
+                    if requestable == 'reference':
+                        if '[value_reference]' in sent_t:
+                            if domain in ['restaurant', 'hotel', 'train']:
+                                if 'booked' in turn['pointer'] or 'ok' in turn[
+                                        'pointer'] or '[value_reference]' in turn[
+                                            'resp']:
+                                    # if pointer was allowing for that?
+                                    provided_requestables[domain].append(
+                                        'reference')
+                            else:
+                                provided_requestables[domain].append(
+                                    'reference')
+                    else:
+                        if '[value_' + requestable + ']' in sent_t:
+                            provided_requestables[domain].append(requestable)
+
+        # if name was given in the task
+        for domain in goal.keys():
+            # if name was provided for the user, the match is being done automatically
+            if 'name' in goal[domain]['informable']:
+                venue_offered[domain] = '[value_name]'
+
+            # special domains - entity does not need to be provided
+            if domain in ['taxi', 'police', 'hospital']:
+                venue_offered[domain] = '[value_name]'
+
+            if domain == 'train':
+                if not venue_offered[domain] and 'id' not in goal[domain][
+                        'requestable']:
+                    venue_offered[domain] = '[value_name]'
+        """
+        Given all inform and requestable slots
+        we go through each domain from the user goal
+        and check whether right entity was provided and
+        all requestable slots were given to the user.
+        The dialogue is successful if that's the case for all domains.
+        """
+        # HARD EVAL
+        stats = {
+            'restaurant': [0, 0, 0],
+            'hotel': [0, 0, 0],
+            'attraction': [0, 0, 0],
+            'train': [0, 0, 0],
+            'taxi': [0, 0, 0],
+            'hospital': [0, 0, 0],
+            'police': [0, 0, 0]
+        }
+
+        match = 0
+        success = 0
+        # MATCH
+        for domain in goal.keys():
+            match_stat = 0
+            if domain in ['restaurant', 'hotel', 'attraction', 'train']:
+                goal_venues = self.reader.db.queryJsons(
+                    domain, goal[domain]['informable'], return_name=True)
+                if type(venue_offered[domain]
+                        ) is str and '_name' in venue_offered[domain]:
+                    match += 1
+                    match_stat = 1
+                elif len(venue_offered[domain]) > 0 and len(
+                        set(venue_offered[domain]) & set(goal_venues)) > 0:
+                    match += 1
+                    match_stat = 1
+            else:
+                if '_name]' in venue_offered[domain]:
+                    match += 1
+                    match_stat = 1
+
+            stats[domain][0] = match_stat
+            stats[domain][2] = 1
+        # soft: fraction of matched domains; hard: all-or-nothing (0.0 / 1.0)
+        if soft_acc:
+            match = float(match) / len(goal.keys())
+        else:
+            if match == len(goal.keys()):
+                match = 1.0
+            else:
+                match = 0.0
+
+        for domain in domains_in_goal:
+            for request in real_requestables[domain]:
+                counts[request + '_total'] += 1
+                if request in provided_requestables[domain]:
+                    counts[request + '_offer'] += 1
+        # NOTE: with fout set, success is scored even when match != 1.0
+        # SUCCESS
+        if fout is not None:
+            for domain in domains_in_goal:
+                success_stat = 0
+                domain_success = 0
+                if len(real_requestables[domain]) == 0:
+                    success += 1
+                    success_stat = 1
+                    stats[domain][1] = success_stat
+                    continue
+                # if values in sentences are super set of requestables
+                for request in real_requestables[domain]:
+                    if request in provided_requestables[domain]:
+                        domain_success += 1
+
+                if domain_success == len(real_requestables[domain]):
+                    success += 1
+                    success_stat = 1
+
+                stats[domain][1] = success_stat
+
+            # final eval
+            if soft_acc:
+                success = float(success) / len(real_requestables)
+            else:
+                if success >= len(real_requestables):
+                    success = 1
+                else:
+                    success = 0
+        else:
+            if match == 1.0:
+                for domain in domains_in_goal:
+                    success_stat = 0
+                    domain_success = 0
+                    if len(real_requestables[domain]) == 0:
+                        success += 1
+                        success_stat = 1
+                        stats[domain][1] = success_stat
+                        continue
+                    # if values in sentences are super set of requestables
+                    for request in real_requestables[domain]:
+                        if request in provided_requestables[domain]:
+                            domain_success += 1
+
+                    if domain_success == len(real_requestables[domain]):
+                        success += 1
+                        success_stat = 1
+
+                    stats[domain][1] = success_stat
+
+                # final eval
+                if soft_acc:
+                    success = float(success) / len(real_requestables)
+                else:
+                    if success >= len(real_requestables):
+                        success = 1
+                    else:
+                        success = 0
+        # write each unsuccessful dialogue as one JSON line
+        if fout is not None and success == 0:
+            sample = {
+                dialog[0]['dial_id']: {
+                    'log': log,
+                    'real_requestables': real_requestables,
+                    'provided_requestables': provided_requestables
+                }
+            }
+            line = json.dumps(sample)
+            fout.write(line)
+            fout.write('\n')
+
+        return success, match, stats, counts
+
+    def _parseGoal(self, goal, true_goal, domain):
+        """Parse true_goal[domain] into goal[domain] and return goal."""
+        parsed = {'informable': {}, 'requestable': [], 'booking': []}
+        goal[domain] = parsed
+        raw = true_goal[domain]
+        if 'info' not in raw:
+            return goal
+
+        wanted = parsed['requestable']
+        if domain == 'train':
+            # we consider dialogues only where train had to be booked!
+            if 'book' in raw:
+                wanted.append('reference')
+            if 'reqt' in raw and 'id' in raw['reqt']:
+                wanted.append('id')
+        else:
+            # additional requests: keep the ones that are easily delexicalized
+            easy = ['phone', 'address', 'postcode', 'reference', 'id']
+            if 'reqt' in raw:
+                wanted.extend(s for s in raw['reqt'] if s in easy)
+            if 'book' in raw:
+                wanted.append('reference')
+
+        # clean every informable pair; multi-word values are re-tokenized
+        for slot, value in raw['info'].items():
+            slot, value = clean_slot_values(self.db_dir, domain, slot, value)
+            if len(value.split()) > 1:
+                tokens = [token.text for token in self.reader.nlp(value)]
+                value = ' '.join(tokens).strip()
+            parsed['informable'][slot] = value
+
+        if 'book' in raw:
+            parsed['booking'] = raw['book']
+        return goal
+
+
+class GenericEvaluator:
+
+    def __init__(self, reader):  # keep the reader; metrics start out empty
+        self.metric_dict = dict()
+        self.reader = reader
+
+    def pack_dial(self, data):
+        """Group turns by turn['dial_id'], keeping their original order."""
+        grouped = {}
+        for item in data:
+            # setdefault creates the per-dialogue list on first sight
+            bucket = grouped.setdefault(item['dial_id'], [])
+            bucket.append(item)
+        return grouped
+
+    def run_metrics(self, results):  # placeholder: always raises here
+        raise ValueError('Please specify the evaluator first')
+
+    def bleu_metric(self, data, type='bleu'):
+        """Corpus BLEU of cleaned row['resp_gen'] vs cleaned row['resp'].
+
+        `type` is accepted but not used; self.clean is not defined in this
+        class and has to be provided elsewhere.
+        """
+        pairs = [([self.clean(row['resp_gen'])], [self.clean(row['resp'])])
+                 for row in data]
+        corpus = iter(pairs)
+        return BLEUScorer().score(corpus)
+
+    def _normalize_constraint(self,
+                              constraint,
+                              ignore_dontcare=False,
+                              intersection=True):
+        """
+        Normalize a belief-state constraint dict.
+        :param constraint - {'food': 'asian oriental', 'pricerange': 'cheap'}
+        :param ignore_dontcare: if true, 'dontcare' values are dropped
+        :param intersection: if true, only keeps the values that appear in the
+                             ontology (self.entities_flat), as in previous works
+        :returns: dict with every informable slot, '' where nothing was kept
+                      e.g. - {'food': 'asian oriental', 'pricerange': 'cheap', 'area': ''}
+        """
+        normalized = dict.fromkeys(self.informable_slots, '')
+        for slot, value in constraint.items():
+            is_dontcare = value == 'dontcare'
+            if ignore_dontcare and is_dontcare:
+                continue
+            if intersection and not is_dontcare \
+                    and value not in self.entities_flat:
+                continue
+            normalized[slot] = value
+
+        return normalized
+
+    def _normalize_act(self, aspn, intersection=False):
+        """Split a '|'-separated act span into word sets keyed by act_order.
+
+        With intersection, the 'av' part keeps only '[value...' tokens and
+        every other part keeps only requestable slot names.
+        """
+        normalized = {}
+        for idx, part in enumerate(aspn.split('|')):
+            key = self.reader.act_order[idx]
+            words = part.strip().split()
+            if not intersection:
+                kept = set(words)
+            elif key == 'av':
+                kept = {w for w in words if '[value' in w}
+            else:
+                kept = {w for w in words if w in self.requestable_slots}
+            normalized[key] = kept
+        return normalized
+
+    def tracker_metric(self, data, normalize=True):
+        """Turn-level belief-state precision/recall/F1 and accuracies."""
+        tp, fp, fn, db_correct = 0, 0, 0, 0
+        goal_accr, slot_accr, total = 0, {}, 1e-8  # 1e-8 avoids /0
+        for s in self.informable_slots:
+            slot_accr[s] = 0
+
+        for row in data:
+            if normalize:
+                gen = self._normalize_constraint(row['bspn_gen'])
+                truth = self._normalize_constraint(row['bspn'])
+            else:
+                gen = self._normalize_constraint(
+                    row['bspn_gen'], intersection=False)
+                truth = self._normalize_constraint(
+                    row['bspn'], intersection=False)
+            valid = 'thank' not in row['user'] and 'bye' not in row['user']
+            if valid:
+                for slot, value in gen.items():
+                    if value in truth[slot]:  # containment, not equality
+                        tp += 1
+                    else:
+                        fp += 1
+                for slot, value in truth.items():
+                    if value not in gen[slot]:
+                        fn += 1
+            # exact-match accuracies, counted on valid turns only
+            if truth and valid:
+                total += 1
+                for s in self.informable_slots:
+                    if gen[s] == truth[s]:
+                        slot_accr[s] += 1
+                if gen == truth:
+                    goal_accr += 1
+                if row.get('db_gen') and row.get('db_match'):
+                    if row['db_gen'] == row['db_match']:
+                        db_correct += 1
+        precision, recall = tp / (tp + fp + 1e-8), tp / (tp + fn + 1e-8)
+        f1 = 2 * precision * recall / (precision + recall + 1e-8)
+        goal_accr /= total
+        db_correct /= total
+        for s in slot_accr:
+            slot_accr[s] /= total
+        return precision, recall, f1, goal_accr, slot_accr, db_correct
+
+    def request_metric(self, data):
+        """Dialog-level F1 over the slot types requested in the responses.
+
+        For every '[value_...]' token except '[value_name]' the slot type is
+        w[1:-1].split('_')[1]. Returns (f1, precision, recall).
+        """
+        def slot_types(text):
+            # cleaned, whitespace-split tokens -> set of slot types
+            return {
+                w[1:-1].split('_')[1]
+                for w in self.clean(text).split()
+                if '[value_' in w and w.endswith(']') and w != '[value_name]'
+            }
+
+        tp, fp, fn = 0, 0, 0
+        for dial in self.pack_dial(data).values():
+            gen_req, truth_req = set(), set()
+            for turn in dial:
+                gen_req |= slot_types(turn['resp_gen'])
+                truth_req |= slot_types(turn['resp'])
+            # tp = shared types, fp = generated only, fn = reference only
+            hits = len(gen_req & truth_req)
+            tp += hits
+            fp += len(gen_req) - hits
+            fn += len(truth_req) - hits
+
+        precision, recall = tp / (tp + fp + 1e-8), tp / (tp + fn + 1e-8)
+        f1 = 2 * precision * recall / (precision + recall + 1e-8)
+        return f1, precision, recall
+
+    def act_metric(self, data):
+        """Turn-level act F1; returns {key: [f1, precision, recall]}."""
+        tp, fp, fn = {
+            'all_s': 0,
+            'all_v': 0
+        }, {
+            'all_s': 0,
+            'all_v': 0
+        }, {
+            'all_s': 0,
+            'all_v': 0
+        }
+        for s in self.requestable_slots:
+            tp[s], fp[s], fn[s] = 0, 0, 0
+            tp['[value_%s]' % s], fp['[value_%s]' % s], fn['[value_%s]'
+                                                           % s] = 0, 0, 0
+        # per-key counters start at 0, so test key membership, not truthiness
+        for row in data:
+            gen = self._normalize_act(row['aspn_gen'])
+            truth = self._normalize_act(row['aspn'])
+            valid = 'thank' not in row['user'] and 'bye' not in row['user']
+            if valid:
+                # how well the act decoder captures user's requests
+                for value in gen['av']:
+                    if value in truth['av']:
+                        tp['all_v'] += 1
+                        if value in tp:
+                            tp[value] += 1
+                    else:
+                        fp['all_v'] += 1
+                        if value in fp:
+                            fp[value] += 1
+                for value in truth['av']:
+                    if value not in gen['av']:
+                        fn['all_v'] += 1
+                        if value in fn:
+                            fn[value] += 1
+
+                # how accurately the act decoder predicts system's question
+                if 'as' not in gen:
+                    continue
+                for slot in gen['as']:
+                    if slot in truth['as']:
+                        tp['all_s'] += 1
+                        if slot in tp:
+                            tp[slot] += 1
+                    else:
+                        fp['all_s'] += 1
+                        if slot in fp:
+                            fp[slot] += 1
+                for slot in truth['as']:
+                    if slot not in gen['as']:
+                        fn['all_s'] += 1
+                        if slot in fn:
+                            fn[slot] += 1
+
+        result = {}
+        for k, v in tp.items():
+            precision, recall = tp[k] / (tp[k] + fp[k] + 1e-8), tp[k] / (
+                tp[k] + fn[k] + 1e-8)
+            f1 = 2 * precision * recall / (precision + recall + 1e-8)
+            result[k] = [f1, precision, recall]
+        return result
+
+
+"""
+For the data preparation and evaluation on In-Car Assistant/CamRest,
+we refer to the code of LABES (https://github.com/thu-spmi/LABES)
+"""
+
+
+class CamRestEvaluator(GenericEvaluator):
+    """Evaluator for CamRest results (BLEU, match, request F1, tracker)."""
+
+    def __init__(self, reader):
+        super().__init__(reader)
+        self.entities_flat, self.entitiy_to_slot_dict = self.get_entities(
+            self.reader.ontology_path)
+        self.informable_slots = self.reader.otlg.informable_slots
+        self.requestable_slots = self.reader.otlg.requestable_slots
+
+    def run_metrics(self, results):
+        """Run every metric on ``results`` and collect them in one dict."""
+        bleu = self.bleu_metric(results)
+        p, r, f1, goal_acc, slot_acc, db_acc = self.tracker_metric(results)
+        match = self.match_metric(results)
+        req_f1, _req_p, _req_r = self.request_metric(results)
+        return {
+            'bleu': bleu,
+            'match': match,
+            'req_f1': req_f1,
+            'joint_goal': goal_acc,
+            'slot_accu': slot_acc,
+            'slot-p/r/f1': (p, r, f1),
+            'db_acc': db_acc,
+        }
+
+    def get_entities(self, entity_path):
+        """Load lower-cased ontology JSON; return (values, value->slot)."""
+        with open(entity_path) as f:
+            informable = json.loads(f.read().lower())['informable']
+        entities_flat, entitiy_to_slot_dict = [], {}
+        for slot, values in informable.items():
+            entities_flat.extend(values)
+            for v in values:
+                entitiy_to_slot_dict[v] = slot
+        return entities_flat, entitiy_to_slot_dict
+
+    def constraint_same(self, truth_cons, gen_cons):
+        """Both empty -> True, one empty -> False, else ``setsim`` result."""
+        if not truth_cons or not gen_cons:
+            return not truth_cons and not gen_cons
+        return setsim(gen_cons, truth_cons)
+
+    def match_metric(self, data):
+        """Share of counted dialogs whose generated constraint equals truth."""
+        dials = self.pack_dial(data)
+        match, total = 0, 1e-8
+        for dial_id in dials:
+            dial = dials[dial_id]
+            truth_cons, gen_cons = {'1': '', '2': '', '3': ''}, None
+            for turn in dial:
+                # find the last turn in which the system provides an entity
+                if '[value' in turn['resp_gen']:
+                    gen_cons = self._normalize_constraint(
+                        turn['bspn_gen'], ignore_dontcare=True)
+                if '[value' in turn['resp']:
+                    truth_cons = self._normalize_constraint(
+                        turn['bspn'], ignore_dontcare=True)
+            if not gen_cons:
+                # if no entity is provided, use the state of the last turn
+                gen_cons = self._normalize_constraint(
+                    dial[-1]['bspn_gen'], ignore_dontcare=True)
+            if list(truth_cons.values()) != ['', '', '']:
+                if gen_cons == truth_cons:
+                    match += 1
+                total += 1
+        return match / total
+
+    def clean(self, resp):
+        """Re-wrap ``resp`` in sos/eos tokens and substitute entity values."""
+        # we use the same clean process as in Sequicity, SEDST, FSDM
+        # to ensure comparable results
+        resp = resp.replace(f'{self.reader.sos_r_token} ', '')
+        resp = resp.replace(f' {self.reader.eos_r_token}', '')
+        resp = f'{self.reader.sos_r_token} {resp} {self.reader.eos_r_token}'
+        for value, slot in self.entitiy_to_slot_dict.items():
+            resp = utils.clean_replace(resp, value, '[value_%s]' % slot)
+        return resp
+
+
+class KvretEvaluator(GenericEvaluator):
+    """Evaluator for Kvret results (BLEU, match, request F1, tracker)."""
+
+    def __init__(self, reader):
+        super().__init__(reader)
+        self.entities_flat, self.entitiy_to_slot_dict = self.get_entities(
+            self.reader.ontology_path)
+        self.informable_slots = self.reader.otlg.informable_slots
+        self.requestable_slots = self.reader.otlg.requestable_slots
+
+    def run_metrics(self, results):
+        """Run every metric on ``results`` and collect them in one dict."""
+        bleu = self.bleu_metric(results)
+        p, r, f1, goal_acc, slot_acc, db_acc = self.tracker_metric(
+            results, normalize=True)
+        match = self.match_metric(results)
+        req_f1, _req_p, _req_r = self.request_metric(results)
+        return {
+            'bleu': bleu,
+            'match': match,
+            'req_f1': req_f1,
+            'joint_goal': goal_acc,
+            'slot_accu': slot_acc,
+            'slot-p/r/f1': (p, r, f1),
+            'db_acc': db_acc,
+        }
+
+    def _normalize_constraint(self,
+                              constraint,
+                              ignore_dontcare=False,
+                              intersection=True):
+        """
+        Normalize a belief span: strip filler words, default every slot to ''
+        :param constraint - {'food': 'asian oriental', 'pricerange': 'cheap'}
+        :param ignore_dontcare: accepted but not used by this implementation
+        :param intersection: if true, only keeps the values that appear in the
+                             ontology; we set intersection=True as in previous works
+        :returns: normalized constraint dict with one key per informable slot
+                      e.g. - {'food': 'asian oriental', 'pricerange': 'cheap', 'area': ''}
+        """
+        junk = [
+            'good', 'great', 'quickest', 'shortest', 'route', 'week',
+            'fastest', 'nearest', 'next', 'closest', 'way', 'mile', 'activity',
+            'restaurant', 'appointment'
+        ]
+        normalized = {s: '' for s in self.informable_slots}
+        for s, v in constraint.items():
+            for j in junk:
+                # drop the junk word, then collapse the whitespace left behind
+                v = ' '.join(v.replace(j, '').split())
+            # with intersection on, values missing from the entity list are skipped
+            if intersection and v not in self.entities_flat:
+                continue
+
+            if s in self.informable_slots:
+                normalized[s] = v
+            else:
+                # TODO only use slot (not domain) in s for matching !!!
+                pass
+
+        return normalized
+
+    def get_entities(self, entity_path):
+        """Return (entity list, entity->slot dict) from ``reader.entity_dict``.
+
+        ``entity_path`` is not used by this implementation.
+        """
+        entitiy_to_slot_dict = self.reader.entity_dict
+        # Keys of the dict are already unique: no per-item membership scan.
+        entities_flat = list(entitiy_to_slot_dict)
+        return entities_flat, entitiy_to_slot_dict
+
+    def constraint_same(self, truth_cons, gen_cons):
+        """Both empty -> True, one empty -> False, else ``setsim`` result."""
+        if not truth_cons or not gen_cons:
+            return not truth_cons and not gen_cons
+        return setsim(gen_cons, truth_cons)
+
+    def match_metric(self, data):
+        """Share of counted dialogs whose generated constraint matches truth.
+
+        Constraints come from the last turn whose response contains '[value';
+        dialogs whose truth constraint is eleven empty strings are not counted.
+        """
+        dials = self.pack_dial(data)
+        match, total = 0, 1e-8
+        for dial_id in dials:
+            dial = dials[dial_id]
+            # default truth: empty values under keys '1' .. '11'
+            truth_cons = {str(i): '' for i in range(1, 12)}
+            gen_cons = None
+            for turn in dial:
+                # find the last turn in which the system provides an entity
+                if '[value' in turn['resp_gen']:
+                    gen_cons = self._normalize_constraint(
+                        turn['bspn_gen'], ignore_dontcare=True)
+                if '[value' in turn['resp']:
+                    truth_cons = self._normalize_constraint(
+                        turn['bspn'], ignore_dontcare=True)
+
+            if not gen_cons:
+                # if no entity is provided, use the state of the last turn
+                gen_cons = self._normalize_constraint(
+                    dial[-1]['bspn_gen'], ignore_dontcare=True)
+
+            if list(truth_cons.values()) != [''] * 11:
+                # compare only the non-empty values of each side
+                gen_cons = [x for x in gen_cons.values() if x]
+                truth_cons = [x for x in truth_cons.values() if x]
+                # NOTE(review): arguments are passed in (gen, truth) order while
+                # the signature is (truth_cons, gen_cons) -- confirm that
+                # setsim is symmetric.
+                if self.constraint_same(gen_cons, truth_cons):
+                    match += 1
+                total += 1
+
+        return match / total
+
+    def clean(self, resp):
+        """Re-wrap ``resp`` in sos/eos tokens and substitute entity values."""
+        # we use the same clean process as in Sequicity, SEDST, FSDM
+        # to ensure comparable results
+        resp = resp.replace(f'{self.reader.sos_r_token} ', '')
+        resp = resp.replace(f' {self.reader.eos_r_token}', '')
+        resp = f'{self.reader.sos_r_token} {resp} {self.reader.eos_r_token}'
+        for value, slot in self.entitiy_to_slot_dict.items():
+            resp = utils.clean_replace(resp, value, '[value_%s]' % slot)
+        return resp
diff --git a/modelscope/trainers/nlp/space/metrics/metrics_tracker.py b/modelscope/trainers/nlp/space/metrics/metrics_tracker.py
index 865600d3..340077a6 100644
--- a/modelscope/trainers/nlp/space/metrics/metrics_tracker.py
+++ b/modelscope/trainers/nlp/space/metrics/metrics_tracker.py
@@ -1,6 +1,4 @@
-"""
-MetricsTracker class
-"""
+# Copyright (c) Alibaba, Inc. and its affiliates.
 
 import math
 from collections import defaultdict
diff --git a/modelscope/trainers/nlp/space/trainer/gen_trainer.py b/modelscope/trainers/nlp/space/trainer/gen_trainer.py
index aa28d798..34cd2f9b 100644
--- a/modelscope/trainers/nlp/space/trainer/gen_trainer.py
+++ b/modelscope/trainers/nlp/space/trainer/gen_trainer.py
@@ -15,27 +15,11 @@ from transformers.optimization import AdamW, get_linear_schedule_with_warmup
 
 from modelscope.trainers.nlp.space.metrics.metrics_tracker import \
     MetricsTracker
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.logger import get_logger
 from modelscope.utils.nlp.space import ontology
 
 
-def get_logger(log_path, name='default'):
-    logger = logging.getLogger(name)
-    logger.propagate = False
-    logger.setLevel(logging.DEBUG)
-
-    formatter = logging.Formatter('%(message)s')
-
-    sh = logging.StreamHandler(sys.stdout)
-    sh.setFormatter(formatter)
-    logger.addHandler(sh)
-
-    fh = logging.FileHandler(log_path, mode='w')
-    fh.setFormatter(formatter)
-    logger.addHandler(fh)
-
-    return logger
-
-
 class Trainer(object):
 
     def __init__(self,
@@ -51,15 +35,16 @@ class Trainer(object):
 
         self.do_train = config.do_train
         self.do_infer = config.do_infer
-        self.is_decreased_valid_metric = config.Trainer.valid_metric_name[
-            0] == '-'
-        self.valid_metric_name = config.Trainer.valid_metric_name[1:]
-        self.num_epochs = config.Trainer.num_epochs
-        # self.save_dir = config.Trainer.save_dir
-        self.log_steps = config.Trainer.log_steps
-        self.valid_steps = config.Trainer.valid_steps
-        self.save_checkpoint = config.Trainer.save_checkpoint
-        self.save_summary = config.Trainer.save_summary
+        if self.do_train:
+            self.is_decreased_valid_metric = config.Trainer.valid_metric_name[
+                0] == '-'
+            self.valid_metric_name = config.Trainer.valid_metric_name[1:]
+            self.num_epochs = config.Trainer.num_epochs
+            self.save_dir = config.Trainer.save_dir
+            self.log_steps = config.Trainer.log_steps
+            self.valid_steps = config.Trainer.valid_steps
+            self.save_checkpoint = config.Trainer.save_checkpoint
+            self.save_summary = config.Trainer.save_summary
         self.lr = config.Model.lr
         self.weight_decay = config.Model.weight_decay
         self.batch_size = config.Trainer.batch_size
@@ -71,22 +56,21 @@ class Trainer(object):
         self.optimizer = optimizer
 
         self.model = model
-        self.func_model = self.model.module if self.gpu > 1 else self.model
+        self.func_model = self.model.module if self.gpu > 1 and config.use_gpu else self.model
         self.reader = reader
         self.evaluator = evaluator
         self.tokenizer = reader.tokenizer
 
-        # if not os.path.exists(self.save_dir):
-        #     os.makedirs(self.save_dir)
-
-        # self.logger = logger or get_logger(os.path.join(self.save_dir, "trainer.log"), "trainer")
-        self.logger = logger or get_logger('trainer.log', 'trainer')
+        self.logger = get_logger()
 
         self.batch_metrics_tracker = MetricsTracker()
         self.token_metrics_tracker = MetricsTracker()
 
-        self.best_valid_metric = float(
-            'inf' if self.is_decreased_valid_metric else '-inf')
+        if self.do_train:
+            if not os.path.exists(self.save_dir):
+                os.makedirs(self.save_dir)
+            self.best_valid_metric = float(
+                'inf' if self.is_decreased_valid_metric else '-inf')
         self.epoch = 0
 
     def decode_generated_bspn_resp(self, generated):
@@ -248,9 +232,12 @@ class Trainer(object):
 
         # Save current best model
         if is_best:
-            best_model_file = os.path.join(self.save_dir, 'best.model')
+            best_model_file = os.path.join(self.save_dir,
+                                           ModelFile.TORCH_MODEL_BIN_FILE)
             torch.save(self.model.state_dict(), best_model_file)
-            best_train_file = os.path.join(self.save_dir, 'best.train')
+            best_train_file = os.path.join(
+                self.save_dir,
+                '{}.train'.format(ModelFile.TORCH_MODEL_BIN_FILE))
             torch.save(train_state, best_train_file)
             self.logger.info(
                 f"Saved best model state to '{best_model_file}' with new best valid metric "
@@ -324,8 +311,7 @@ class Trainer(object):
 
             self.func_model.load_state_dict(model_state_dict)
             self.logger.info(
-                f"Loaded model state from '{self.func_model.init_checkpoint}.model'"
-            )
+                f"Loaded model state from '{self.func_model.init_checkpoint}'")
 
         def _load_train_state():
             train_file = f'{self.func_model.init_checkpoint}.train'
@@ -558,19 +544,17 @@ class MultiWOZTrainer(Trainer):
                         generated_bs = outputs[0].cpu().numpy().tolist()
                         bspn_gen = self.decode_generated_bspn(generated_bs)
                         # check DB result
-                        if self.reader.use_true_db_pointer:  # To control whether current db is ground truth
+                        if self.reader.use_true_db_pointer:
                             db = turn['db']
                         else:
                             db_result = self.reader.bspan_to_DBpointer(
                                 self.tokenizer.decode(bspn_gen),
                                 turn['turn_domain'])
-                            assert len(turn['db']) == 4
-                            book_result = turn['db'][2]
+                            assert len(turn['db']) == 3
                             assert isinstance(db_result, str)
                             db = \
                                 [self.reader.sos_db_id] + \
                                 self.tokenizer.convert_tokens_to_ids([db_result]) + \
-                                [book_result] + \
                                 [self.reader.eos_db_id]
                             prompt_id = self.reader.sos_a_id
 
@@ -636,7 +620,7 @@ class MultiWOZTrainer(Trainer):
         score = 0.5 * (success + match) + bleu
 
         # log results
-        metrics_message = 'match: %2.2f  success: %2.2f  bleu: %2.2f  score: %.2f' %\
+        metrics_message = 'match: %2.2f  success: %2.2f  bleu: %2.2f  score: %.2f' % \
                           (match, success, bleu, score)
         message_prefix = f'[Infer][{self.epoch}]'
         time_cost = f'TIME-{time.time() - begin_time:.3f}'
diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py
index 3692b486..b54aa666 100644
--- a/modelscope/trainers/nlp_trainer.py
+++ b/modelscope/trainers/nlp_trainer.py
@@ -1,6 +1,9 @@
-import os
-from typing import Callable, Dict, Optional, Tuple, Union
+# Copyright (c) Alibaba, Inc. and its affiliates.
 
+import os
+from typing import Callable, Optional, Tuple, Union
+
+import numpy as np
 import torch
 from torch import nn
 from torch.utils.data import Dataset
@@ -11,9 +14,10 @@ from modelscope.metrics.builder import build_metric
 from modelscope.models.base import Model, TorchModel
 from modelscope.msdatasets import MsDataset
 from modelscope.preprocessors import Preprocessor, build_preprocessor
-from modelscope.utils.config import Config, ConfigDict
+from modelscope.utils.config import Config
 from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ModeKeys,
                                        ModelFile, Tasks)
+from modelscope.utils.hub import parse_label_mapping
 from .base import TRAINERS
 from .trainer import EpochBasedTrainer
 
@@ -81,19 +85,32 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
             assert cfg_file is not None, 'Config file should not be None if model is an nn.Module class'
             model_dir = os.path.dirname(cfg_file)
 
+        self.label2id = None
+        self.id2label = None
+        self.num_labels = None
         self.cfg_modify_fn = cfg_modify_fn
         self.cfg = self.rebuild_config(Config.from_file(cfg_file))
-        try:
-            labels = self.cfg.dataset.train.labels
-        except AttributeError:
-            labels = None
 
-        self.label2id = None
-        self.num_labels = None
-        if labels is not None and len(labels) > 0:
-            self.label2id = {label: idx for idx, label in enumerate(labels)}
-            self.id2label = {idx: label for idx, label in enumerate(labels)}
-            self.num_labels = len(labels)
+        label2id = parse_label_mapping(model_dir)
+        if label2id is not None:
+            self.label2id = label2id
+            self.id2label = {id: label for label, id in label2id.items()}
+            self.num_labels = len(label2id)
+        else:
+            try:
+                labels = self.cfg.dataset.train.labels
+                if labels is not None and len(labels) > 0:
+                    self.label2id = {
+                        label: idx
+                        for idx, label in enumerate(labels)
+                    }
+                    self.id2label = {
+                        idx: label
+                        for idx, label in enumerate(labels)
+                    }
+                    self.num_labels = len(labels)
+            except AttributeError:
+                pass
 
         def build_dataset_keys(cfg):
             if cfg is not None:
@@ -130,7 +147,13 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
 
     def rebuild_config(self, cfg: Config):
         if self.cfg_modify_fn is not None:
-            return self.cfg_modify_fn(cfg)
+            cfg = self.cfg_modify_fn(cfg)
+        if not hasattr(cfg.model, 'label2id') and not hasattr(
+                cfg.model, 'id2label'):
+            if self.id2label is not None:
+                cfg.model['id2label'] = self.id2label
+            if self.label2id is not None:
+                cfg.model['label2id'] = self.label2id
         return cfg
 
     def build_model(self) -> Union[nn.Module, TorchModel]:
@@ -203,6 +226,9 @@ class VecoTrainer(NlpEpochBasedTrainer):
 
         """
         from modelscope.msdatasets.task_datasets import VecoDataset
+        if checkpoint_path is not None and os.path.isfile(checkpoint_path):
+            from modelscope.trainers.hooks import CheckpointHook
+            CheckpointHook.load_checkpoint(checkpoint_path, self)
         self.model.eval()
         self._mode = ModeKeys.EVAL
         metric_values = {}
@@ -223,12 +249,10 @@ class VecoTrainer(NlpEpochBasedTrainer):
                 self.eval_dataset, **self.cfg.evaluation.get('dataloader', {}))
             self.data_loader = self.eval_dataloader
 
-            metric_classes = [
-                build_metric(metric, default_args={'trainer': self})
-                for metric in self.metrics
-            ]
-            self.evaluation_loop(self.eval_dataloader, checkpoint_path,
-                                 metric_classes)
+            metric_classes = [build_metric(metric) for metric in self.metrics]
+            for m in metric_classes:
+                m.trainer = self
+            self.evaluation_loop(self.eval_dataloader, metric_classes)
 
             for m_idx, metric_cls in enumerate(metric_classes):
                 if f'eval_dataset[{idx}]' not in metric_values:
@@ -242,4 +266,8 @@ class VecoTrainer(NlpEpochBasedTrainer):
             else:
                 break
 
+        for metric_name in self.metrics:
+            metric_values[metric_name] = np.average(
+                [m[metric_name] for m in metric_values.values()])
+
         return metric_values
diff --git a/modelscope/trainers/optimizer/__init__.py b/modelscope/trainers/optimizer/__init__.py
index 884f3043..9962c2c2 100644
--- a/modelscope/trainers/optimizer/__init__.py
+++ b/modelscope/trainers/optimizer/__init__.py
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from .builder import OPTIMIZERS, build_optimizer
+from .child_tuning_adamw_optimizer import ChildTuningAdamW
 
-__all__ = ['OPTIMIZERS', 'build_optimizer']
+__all__ = ['OPTIMIZERS', 'build_optimizer', 'ChildTuningAdamW']
diff --git a/modelscope/trainers/optimizer/builder.py b/modelscope/trainers/optimizer/builder.py
index 4d772dd9..f43768d6 100644
--- a/modelscope/trainers/optimizer/builder.py
+++ b/modelscope/trainers/optimizer/builder.py
@@ -20,7 +20,10 @@ def build_optimizer(model: torch.nn.Module,
     """
     if hasattr(model, 'module'):
         model = model.module
-    cfg.params = model.parameters()
+
+    if default_args is None:
+        default_args = {}
+    default_args['params'] = model.parameters()
 
     return build_from_cfg(
         cfg, OPTIMIZERS, group_key=default_group, default_args=default_args)
diff --git a/modelscope/trainers/optimizer/child_tuning_adamw_optimizer.py b/modelscope/trainers/optimizer/child_tuning_adamw_optimizer.py
new file mode 100644
index 00000000..d004071f
--- /dev/null
+++ b/modelscope/trainers/optimizer/child_tuning_adamw_optimizer.py
@@ -0,0 +1,188 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import types
+from typing import Callable, Iterable, Tuple
+
+import numpy as np
+import torch
+from torch.distributions.bernoulli import Bernoulli
+from torch.optim import Optimizer
+
+from modelscope.utils.logger import get_logger
+from .builder import OPTIMIZERS, default_group
+
+logger = get_logger(__name__)
+
+__all__ = ['calculate_fisher', 'ChildTuningAdamW']
+
+
+def calculate_fisher(model: torch.nn.Module,
+                     data_loader,
+                     forward_step,
+                     reserve_p,
+                     grad_clip=None):
+    """Build boolean masks from squared gradients averaged over a data loader.
+
+    Only parameters whose name contains 'layer' are considered; an entry is
+    True when its value reaches the ``(1 - reserve_p) * 100`` percentile.
+    Args:
+        model: module to analyse; it is switched to train mode.
+        data_loader: sized iterable of inputs for ``forward_step``.
+        forward_step: callable ``(model, inputs)`` returning the loss.
+        reserve_p: fraction that sets the percentile threshold.
+        grad_clip: optional kwargs for ``torch.nn.utils.clip_grad_norm_``.
+    Returns:
+        dict mapping each considered parameter to its boolean mask tensor.
+    """
+    model.train()
+    selected = [p for n, p in model.named_parameters() if 'layer' in n]
+    gradient_mask = {p: p.new_zeros(p.size()) for p in selected}
+    iters = len(data_loader)
+    for inputs in data_loader:
+        forward_step(model, inputs).backward()
+        for p in selected:
+            if p.grad is None:  # no gradient reached this parameter
+                continue
+            if grad_clip is not None:
+                torch.nn.utils.clip_grad_norm_(p, **grad_clip)
+            gradient_mask[p] += (p.grad**2) / iters
+        model.zero_grad()
+
+    logger.info('Calculate Fisher Information...')
+    r = np.concatenate(
+        [v.view(-1).cpu().numpy() for v in gradient_mask.values()])
+    polar = np.percentile(r, (1 - reserve_p) * 100)
+    for k in gradient_mask:
+        gradient_mask[k] = gradient_mask[k] >= polar
+    logger.info('Polar => {}'.format(polar))
+    # TODO: pytorch: torch.kthvalue
+    return gradient_mask
+
+
+@OPTIMIZERS.register_module(
+    group_key=default_group, module_name='ChildTuningAdamW')
+class ChildTuningAdamW(Optimizer):
+    """AdamW variant that masks gradients before each update.
+
+    ``mode='ChildTuning-D'`` multiplies gradients by the masks passed to
+    ``set_gradient_mask``; any other non-None mode multiplies them by a
+    Bernoulli(``reserve_p``) sample divided by ``reserve_p``.
+    """
+
+    def __init__(self,
+                 params: Iterable[torch.nn.parameter.Parameter],
+                 lr: float = 1e-3,
+                 betas: Tuple[float, float] = (0.9, 0.999),
+                 eps: float = 1e-6,
+                 weight_decay: float = 0.0,
+                 correct_bias: bool = True,
+                 reserve_p=1.0,
+                 mode=None):
+        if lr < 0.0:
+            raise ValueError(
+                'Invalid learning rate: {} - should be >= 0.0'.format(lr))
+        for beta in (betas[0], betas[1]):
+            if not 0.0 <= beta < 1.0:
+                raise ValueError(
+                    'Invalid beta parameter: {} - should be in [0.0, 1.0['
+                    .format(beta))
+        if not 0.0 <= eps:
+            raise ValueError(
+                'Invalid epsilon value: {} - should be >= 0.0'.format(eps))
+        defaults = dict(
+            lr=lr,
+            betas=betas,
+            eps=eps,
+            weight_decay=weight_decay,
+            correct_bias=correct_bias)
+        super().__init__(params, defaults)
+
+        self.gradient_mask = None
+        self.reserve_p = reserve_p
+        self.mode = mode
+
+    def set_gradient_mask(self, gradient_mask):
+        """Store the parameter -> mask mapping used by 'ChildTuning-D'."""
+        self.gradient_mask = gradient_mask
+
+    def step(self, closure: Callable = None):
+        """
+        Performs a single optimization step.
+        Arguments:
+            closure (:obj:`Callable`, `optional`): A closure that reevaluates the model and returns the loss.
+        Raises:
+            RuntimeError: sparse gradient, or 'ChildTuning-D' without a mask.
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+        if self.mode == 'ChildTuning-D' and self.gradient_mask is None:
+            raise RuntimeError(
+                "mode 'ChildTuning-D' needs set_gradient_mask() before step()")
+        for group in self.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data
+                if grad.is_sparse:
+                    raise RuntimeError(
+                        'Adam does not support sparse gradients, please consider SparseAdam instead'
+                    )
+
+                # ChildTuning code: mask the gradient in place
+                if self.mode == 'ChildTuning-D':
+                    if p in self.gradient_mask:
+                        grad *= self.gradient_mask[p]
+                elif self.mode is not None:
+                    # ChildTuning-F
+                    grad_mask = Bernoulli(
+                        grad.new_full(
+                            size=grad.size(), fill_value=self.reserve_p))
+                    grad *= grad_mask.sample() / self.reserve_p
+
+                state = self.state[p]
+                # State initialization
+                if len(state) == 0:
+                    state['step'] = 0
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p.data)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+
+                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
+                beta1, beta2 = group['betas']
+                state['step'] += 1
+                # Update the first and second moment running averages in place
+                exp_avg.mul_(beta1).add_(grad, alpha=1.0 - beta1)
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
+                denom = exp_avg_sq.sqrt().add_(group['eps'])
+
+                step_size = group['lr']
+                if group['correct_bias']:  # set correct_bias=False to skip
+                    bias_correction1 = 1.0 - beta1**state['step']
+                    bias_correction2 = 1.0 - beta2**state['step']
+                    step_size = step_size * math.sqrt(
+                        bias_correction2) / bias_correction1
+
+                p.data.addcdiv_(exp_avg, denom, value=-step_size)
+
+                # Decoupled weight decay: shrink the weights directly rather
+                # than via an L2 loss term, so it bypasses the m/v estimates.
+                weight_decay = group['weight_decay']
+                if weight_decay != 0.0:
+                    p.data.add_(p.data, alpha=-group['lr'] * weight_decay)
+
+        return loss
diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py
index c48ab2cd..a01d9b59 100644
--- a/modelscope/trainers/trainer.py
+++ b/modelscope/trainers/trainer.py
@@ -1,6 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
-import random
 import time
 from collections.abc import Mapping
 from distutils.version import LooseVersion
@@ -8,7 +7,6 @@ from functools import partial
 from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
 
 import json
-import numpy as np
 import torch
 from torch import distributed as dist
 from torch import nn
@@ -26,7 +24,6 @@ from modelscope.msdatasets.task_datasets.torch_base_dataset import \
     TorchTaskDataset
 from modelscope.preprocessors.base import Preprocessor
 from modelscope.preprocessors.builder import build_preprocessor
-from modelscope.preprocessors.common import Compose
 from modelscope.trainers.hooks.builder import HOOKS
 from modelscope.trainers.hooks.priority import Priority, get_priority
 from modelscope.trainers.lrscheduler.builder import build_lr_scheduler
@@ -40,10 +37,11 @@ from modelscope.utils.device import create_device, verify_device
 from modelscope.utils.file_utils import func_receive_dict_inputs
 from modelscope.utils.logger import get_logger
 from modelscope.utils.registry import build_from_cfg
-from modelscope.utils.torch_utils import get_dist_info, init_dist
+from modelscope.utils.torch_utils import (get_dist_info, get_local_rank,
+                                          init_dist, set_random_seed)
 from .base import BaseTrainer
 from .builder import TRAINERS
-from .default_config import DEFAULT_CONFIG
+from .default_config import merge_cfg
 from .hooks.hook import Hook
 from .parallel.builder import build_parallel
 from .parallel.utils import is_parallel
@@ -75,6 +73,7 @@ class EpochBasedTrainer(BaseTrainer):
             this preprocessing action will be executed every time the dataset's __getitem__ is called.
         optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler._LRScheduler]`, *optional*): A tuple
             containing the optimizer and the scheduler to use.
+        seed (int): The optional random seed for torch, cuda, numpy and random.
         max_epochs: (int, optional): Total training epochs.
     """
 
@@ -83,7 +82,8 @@ class EpochBasedTrainer(BaseTrainer):
             model: Optional[Union[TorchModel, nn.Module, str]] = None,
             cfg_file: Optional[str] = None,
             arg_parse_fn: Optional[Callable] = None,
-            data_collator: Optional[Callable] = None,
+            data_collator: Optional[Union[Callable, Dict[str,
+                                                         Callable]]] = None,
             train_dataset: Optional[Union[MsDataset, Dataset]] = None,
             eval_dataset: Optional[Union[MsDataset, Dataset]] = None,
             preprocessor: Optional[Union[Preprocessor,
@@ -92,8 +92,11 @@ class EpochBasedTrainer(BaseTrainer):
                               torch.optim.lr_scheduler._LRScheduler] = (None,
                                                                         None),
             model_revision: Optional[str] = DEFAULT_MODEL_REVISION,
+            seed: int = 42,
             **kwargs):
 
+        self._seed = seed
+        set_random_seed(self._seed)
         if isinstance(model, str):
             if os.path.exists(model):
                 self.model_dir = model if os.path.isdir(
@@ -104,21 +107,24 @@ class EpochBasedTrainer(BaseTrainer):
             if cfg_file is None:
                 cfg_file = os.path.join(self.model_dir,
                                         ModelFile.CONFIGURATION)
-            self.model = self.build_model()
         else:
-            assert cfg_file is not None, 'Config file should not be None if model is an nn.Module class'
-            assert isinstance(
-                model,
-                (TorchModel, nn.Module
-                 )), 'model should be either str, TorchMode or nn.Module.'
+            assert cfg_file is not None, 'Config file should not be None if model is not from pretrained!'
             self.model_dir = os.path.dirname(cfg_file)
-            self.model = model
 
         super().__init__(cfg_file, arg_parse_fn)
+
         # add default config
-        self.cfg.merge_from_dict(self._get_default_config(), force=False)
+        merge_cfg(self.cfg)
         self.cfg = self.rebuild_config(self.cfg)
 
+        if 'cfg_options' in kwargs:
+            self.cfg.merge_from_dict(kwargs['cfg_options'])
+
+        if isinstance(model, (TorchModel, nn.Module)):
+            self.model = model
+        else:
+            self.model = self.build_model()
+
         if 'work_dir' in kwargs:
             self.work_dir = kwargs['work_dir']
         else:
@@ -149,20 +155,50 @@ class EpochBasedTrainer(BaseTrainer):
         if self.eval_preprocessor is not None:
             self.eval_preprocessor.mode = ModeKeys.EVAL
 
+        if kwargs.get('launcher', None) is not None:
+            init_dist(kwargs['launcher'])
+
+        _, world_size = get_dist_info()
+        self._dist = world_size > 1
+
         device_name = kwargs.get('device', 'gpu')
-        verify_device(device_name)
+        if self._dist:
+            local_rank = get_local_rank()
+            device_name = f'cuda:{local_rank}'
+
         self.device = create_device(device_name)
 
         self.train_dataset = self.to_task_dataset(
             train_dataset,
             mode=ModeKeys.TRAIN,
+            task_data_config=self.cfg.dataset.get('train', None) if hasattr(
+                self.cfg, 'dataset') else None,
             preprocessor=self.train_preprocessor)
         self.eval_dataset = self.to_task_dataset(
             eval_dataset,
             mode=ModeKeys.EVAL,
+            task_data_config=self.cfg.dataset.get('val', None) if hasattr(
+                self.cfg, 'dataset') else None,
             preprocessor=self.eval_preprocessor)
 
-        self.data_collator = data_collator if data_collator is not None else default_collate
+        # start both as None so a one-key mapping leaves the other defined
+        self.train_data_collator, self.eval_data_collator = None, None
+        if isinstance(data_collator, Mapping):
+            if not (ConfigKeys.train in data_collator
+                    or ConfigKeys.val in data_collator):
+                raise ValueError(
+                    f'data_collator must split with `{ConfigKeys.train}` and `{ConfigKeys.val}` keys!'
+                )
+            if ConfigKeys.train in data_collator:
+                assert isinstance(data_collator[ConfigKeys.train], Callable)
+                self.train_data_collator = data_collator[ConfigKeys.train]
+            if ConfigKeys.val in data_collator:
+                assert isinstance(data_collator[ConfigKeys.val], Callable)
+                self.eval_data_collator = data_collator[ConfigKeys.val]
+        else:
+            collate_fn = default_collate if data_collator is None else data_collator
+            self.train_data_collator = self.eval_data_collator = collate_fn
+
         self.metrics = self.get_metrics()
         self._metric_values = None
         self.optimizers = optimizers
@@ -192,14 +228,6 @@ class EpochBasedTrainer(BaseTrainer):
 
         self.use_fp16 = kwargs.get('use_fp16', False)
 
-        # TODO @wenmeng.zwm add seed init fn
-        self._seed = 0
-
-        if kwargs.get('launcher', None) is not None:
-            init_dist(kwargs['launcher'])
-
-        self._dist = get_dist_info()[1] > 1
-
         # model placement
         if self.device.type == 'cuda':
             self.model.to(self.device)
@@ -276,6 +304,7 @@ class EpochBasedTrainer(BaseTrainer):
     def to_task_dataset(self,
                         datasets: Union[Dataset, List[Dataset]],
                         mode: str,
+                        task_data_config: Config = None,
                         preprocessor: Optional[Preprocessor] = None):
         """Build the task specific dataset processor for this trainer.
 
@@ -288,19 +317,29 @@ class EpochBasedTrainer(BaseTrainer):
             if isinstance(datasets, TorchTaskDataset):
                 return datasets
             elif isinstance(datasets, MsDataset):
-                cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \
-                    else ConfigDict(type=None, mode=mode)
+                if task_data_config is None:
+                    # adapt to some special models
+                    task_data_config = ConfigDict(
+                        type=self.cfg.model.type) if hasattr(
+                            self.cfg, ConfigFields.model) else ConfigDict(
+                                type=None)
+                task_data_config.update(dict(mode=mode))
                 return datasets.to_torch_dataset(
-                    task_data_config=cfg,
+                    task_data_config=task_data_config,
                     task_name=self.cfg.task,
                     preprocessors=preprocessor)
             elif isinstance(datasets, List) and isinstance(
                     datasets[0], MsDataset):
-                cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \
-                    else ConfigDict(type=None, mode=mode)
+                if task_data_config is None:
+                    # adapt to some special models
+                    task_data_config = ConfigDict(
+                        type=self.cfg.model.type) if hasattr(
+                            self.cfg, ConfigFields.model) else ConfigDict(
+                                type=None)
+                task_data_config.update(dict(mode=mode))
                 datasets = [
                     d.to_torch_dataset(
-                        task_data_config=cfg,
+                        task_data_config=task_data_config,
                         task_name=self.cfg.task,
                         preprocessors=preprocessor) for d in datasets
                 ]
@@ -308,12 +347,12 @@ class EpochBasedTrainer(BaseTrainer):
                     type=self.cfg.task, mode=mode, datasets=datasets)
                 return build_task_dataset(cfg, self.cfg.task)
             else:
-                cfg = ConfigDict(
-                    type=self.cfg.model.type,
-                    mode=mode,
-                    datasets=datasets,
-                    preprocessor=preprocessor)
-                return build_task_dataset(cfg, self.cfg.task)
+                # keep non-str values (datasets, preprocessor) out of the cfg
+                task_data_build_config = ConfigDict(
+                    mode=mode, datasets=datasets, preprocessor=preprocessor)
+                task_data_build_config.update(task_data_config)
+                return build_task_dataset(task_data_build_config,
+                                          self.cfg.task)
         except Exception:
             if isinstance(datasets, (List, Tuple)) or preprocessor is not None:
                 return TorchTaskDataset(
@@ -364,7 +403,7 @@ class EpochBasedTrainer(BaseTrainer):
 
         return train_preprocessor, eval_preprocessor
 
-    def get_metrics(self) -> List[str]:
+    def get_metrics(self) -> List[Union[str, Dict]]:
         """Get the metric class types.
 
         The first choice will be the metrics configured in the config file, if not found, the default metrics will be
@@ -384,12 +423,20 @@ class EpochBasedTrainer(BaseTrainer):
                 f'Metrics are needed in evaluation, please try to either '
                 f'add metrics in configuration.json or add the default metric for {self.cfg.task}.'
             )
-        if isinstance(metrics, str):
+        if isinstance(metrics, (str, Mapping)):
             metrics = [metrics]
         return metrics
 
-    def train(self, *args, **kwargs):
-        self.model.train()
+    def set_checkpoint_file_to_hook(self, checkpoint_path):
+        """Set `checkpoint_file` on every CheckpointHook in `self.hooks`."""
+        if checkpoint_path is None or not os.path.isfile(checkpoint_path):
+            return
+        from modelscope.trainers.hooks import CheckpointHook
+        # collect the matching hooks first, then assign the path to each
+        for hook in [h for h in self.hooks if isinstance(h, CheckpointHook)]:
+            hook.checkpoint_file = checkpoint_path
+
+    def train(self, checkpoint_path=None, *args, **kwargs):
         self._mode = ModeKeys.TRAIN
 
         if self.train_dataset is None:
@@ -399,18 +446,23 @@ class EpochBasedTrainer(BaseTrainer):
                 self.train_dataset,
                 dist=self._dist,
                 seed=self._seed,
+                collate_fn=self.train_data_collator,
                 **self.cfg.train.get('dataloader', {}))
         self.data_loader = self.train_dataloader
 
         self.register_optimizers_hook()
         self.register_hook_from_cfg(self.cfg.train.hooks)
+        self.set_checkpoint_file_to_hook(checkpoint_path)
+        self.model.train()
 
         self.train_loop(self.train_dataloader)
 
     def evaluate(self, checkpoint_path=None):
+        if checkpoint_path is not None and os.path.isfile(checkpoint_path):
+            from modelscope.trainers.hooks import CheckpointHook
+            CheckpointHook.load_checkpoint(checkpoint_path, self)
         self.model.eval()
         self._mode = ModeKeys.EVAL
-
         if self.eval_dataset is None:
             self.eval_dataloader = self.get_eval_data_loader()
         else:
@@ -418,13 +470,15 @@ class EpochBasedTrainer(BaseTrainer):
                 self.eval_dataset,
                 dist=self._dist,
                 seed=self._seed,
+                collate_fn=self.eval_data_collator,
                 **self.cfg.evaluation.get('dataloader', {}))
         self.data_loader = self.eval_dataloader
         metric_classes = [build_metric(metric) for metric in self.metrics]
         for m in metric_classes:
             m.trainer = self
+
         metric_values = self.evaluation_loop(self.eval_dataloader,
-                                             checkpoint_path, metric_classes)
+                                             metric_classes)
 
         self._metric_values = metric_values
         return metric_values
@@ -440,7 +494,7 @@ class EpochBasedTrainer(BaseTrainer):
         override this method in a subclass.
 
         """
-        model = Model.from_pretrained(self.model_dir)
+        model = Model.from_pretrained(self.model_dir, cfg_dict=self.cfg)
         if not isinstance(model, nn.Module) and hasattr(model, 'model'):
             return model.model
         elif isinstance(model, nn.Module):
@@ -481,8 +535,14 @@ class EpochBasedTrainer(BaseTrainer):
         model.train()
         self._mode = ModeKeys.TRAIN
         # call model forward but not __call__ to skip postprocess
-        if isinstance(inputs,
-                      Mapping) and not func_receive_dict_inputs(model.forward):
+
+        if is_parallel(model):
+            receive_dict_inputs = func_receive_dict_inputs(
+                model.module.forward)
+        else:
+            receive_dict_inputs = func_receive_dict_inputs(model.forward)
+
+        if isinstance(inputs, Mapping) and not receive_dict_inputs:
             train_outputs = model.forward(**inputs)
         else:
             train_outputs = model.forward(inputs)
@@ -503,7 +563,7 @@ class EpochBasedTrainer(BaseTrainer):
                 value = train_outputs.get(key, None)
                 if value is not None:
                     if dist.is_available() and dist.is_initialized():
-                        value = value.data.clone()
+                        value = value.data.clone().to('cuda')
                         dist.all_reduce(value.div_(dist.get_world_size()))
                     log_vars.update({key: value.item()})
             self.log_buffer.update(log_vars)
@@ -552,6 +612,7 @@ class EpochBasedTrainer(BaseTrainer):
             self.train_dataset,
             dist=self._dist,
             seed=self._seed,
+            collate_fn=self.train_data_collator,
             **self.cfg.train.get('dataloader', {}))
         return data_loader
 
@@ -569,9 +630,9 @@ class EpochBasedTrainer(BaseTrainer):
                 mode=ModeKeys.EVAL,
                 preprocessor=self.eval_preprocessor)
 
-        batch_size = self.cfg.evaluation.batch_size
-        workers = self.cfg.evaluation.workers
-        shuffle = self.cfg.evaluation.get('shuffle', False)
+        batch_size = self.cfg.evaluation.dataloader.batch_size_per_gpu
+        workers = self.cfg.evaluation.dataloader.workers_per_gpu
+        shuffle = self.cfg.evaluation.dataloader.get('shuffle', False)
         data_loader = self._build_dataloader_with_dataset(
             self.eval_dataset,
             batch_size_per_gpu=batch_size,
@@ -580,25 +641,26 @@ class EpochBasedTrainer(BaseTrainer):
             dist=self._dist,
             seed=self._seed,
             persistent_workers=True,
+            collate_fn=self.eval_data_collator,
         )
         return data_loader
 
     def build_dataset(self, data_cfg, mode, preprocessor=None):
         """ Build torch dataset object using data config
         """
-        dataset = MsDataset.load(
-            dataset_name=data_cfg.name,
-            split=data_cfg.split,
-            subset_name=data_cfg.subset_name if hasattr(
-                data_cfg, 'subset_name') else None,
-            hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope,
-            **data_cfg,
-        )
-        cfg = ConfigDict(type=self.cfg.model.type, mode=mode)
-        torch_dataset = dataset.to_torch_dataset(
-            task_data_config=cfg,
-            task_name=self.cfg.task,
-            preprocessors=self.preprocessor)
+        # TODO: support MsDataset load for cv
+        if hasattr(data_cfg, 'name'):
+            dataset = MsDataset.load(
+                dataset_name=data_cfg.name,
+                **data_cfg,
+            )
+            cfg = ConfigDict(type=self.cfg.model.type, mode=mode)
+            torch_dataset = dataset.to_torch_dataset(
+                task_data_config=cfg,
+                task_name=self.cfg.task,
+                preprocessors=preprocessor)
+        else:
+            torch_dataset = build_task_dataset(data_cfg, self.cfg.task)
         dataset = self.to_task_dataset(torch_dataset, mode)
         return dataset
 
@@ -746,7 +808,6 @@ class EpochBasedTrainer(BaseTrainer):
             sampler=sampler,
             num_workers=num_workers,
             batch_sampler=batch_sampler,
-            collate_fn=self.data_collator,
             pin_memory=kwargs.pop('pin_memory', False),
             worker_init_fn=init_fn,
             **kwargs)
@@ -757,32 +818,38 @@ class EpochBasedTrainer(BaseTrainer):
         """ Training loop used by `EpochBasedTrainer.train()`
         """
         self.invoke_hook(TrainerStages.before_run)
-        self._epoch = 0
         kwargs = {}
         self.model.train()
         for _ in range(self._epoch, self._max_epochs):
             self.invoke_hook(TrainerStages.before_train_epoch)
             time.sleep(2)  # Prevent possible deadlock during epoch transition
             for i, data_batch in enumerate(data_loader):
+                if i < self.inner_iter:
+                    # inner_iter may be read out from the checkpoint file, so skip the trained iters in the epoch.
+                    continue
                 data_batch = to_device(data_batch, self.device)
                 self.data_batch = data_batch
                 self._inner_iter = i
                 self.invoke_hook(TrainerStages.before_train_iter)
                 self.train_step(self.model, data_batch, **kwargs)
                 self.invoke_hook(TrainerStages.after_train_iter)
+                # Value changed after the hooks are invoked, do not move them above the invoke_hook code.
                 del self.data_batch
                 self._iter += 1
+                self._mode = ModeKeys.TRAIN
 
                 if i + 1 >= self.iters_per_epoch:
                     break
 
             self.invoke_hook(TrainerStages.after_train_epoch)
+            # Value changed after the hooks are invoked, do not move them above the invoke_hook code.
+            self._inner_iter = 0
             self._epoch += 1
 
         time.sleep(1)  # wait for some hooks like loggers to finish
         self.invoke_hook(TrainerStages.after_run)
 
-    def evaluation_loop(self, data_loader, checkpoint_path, metric_classes):
+    def evaluation_loop(self, data_loader, metric_classes):
         """ Evaluation loop used by `EpochBasedTrainer.evaluate()`.
 
         """
@@ -795,7 +862,7 @@ class EpochBasedTrainer(BaseTrainer):
                 tmpdir=None,
                 gpu_collect=False,
                 metric_classes=metric_classes,
-                data_loader_iters_per_gpu=self.iters_per_epoch)
+                data_loader_iters_per_gpu=self._eval_iters_per_epoch)
         else:
             from modelscope.trainers.utils.inference import single_gpu_test
             metric_values = single_gpu_test(
@@ -803,7 +870,7 @@ class EpochBasedTrainer(BaseTrainer):
                 data_loader,
                 device=self.device,
                 metric_classes=metric_classes,
-                data_loader_iters=self.iters_per_epoch)
+                data_loader_iters=self._eval_iters_per_epoch)
 
         self._inner_iter = self.iters_per_epoch - 1  # start from index 0
 
@@ -820,12 +887,14 @@ class EpochBasedTrainer(BaseTrainer):
         Args:
             hook (:obj:`Hook`): The hook to be registered.
         """
-        assert isinstance(hook, Hook)
         # insert the hook to a sorted list
         inserted = False
         for i in range(len(self._hooks) - 1, -1, -1):
-            if get_priority(hook.PRIORITY) > get_priority(
-                    self._hooks[i].PRIORITY):
+            p = hook.PRIORITY if hasattr(hook, 'PRIORITY') else Priority.NORMAL
+            p_i = self._hooks[i].PRIORITY if hasattr(
+                self._hooks[i], 'PRIORITY') else Priority.NORMAL
+
+            if get_priority(p) > get_priority(p_i):
                 self._hooks.insert(i + 1, hook)
                 inserted = True
                 break
@@ -882,14 +951,9 @@ class EpochBasedTrainer(BaseTrainer):
                 stage_hook_infos.append(info)
         return '\n'.join(stage_hook_infos)
 
-    def _get_default_config(self):
-        return DEFAULT_CONFIG
-
 
 def worker_init_fn(worker_id, num_workers, rank, seed):
     # The seed of each worker equals to
     # num_worker * rank + worker_id + user_seed
     worker_seed = num_workers * rank + worker_id + seed
-    np.random.seed(worker_seed)
-    random.seed(worker_seed)
-    torch.manual_seed(worker_seed)
+    set_random_seed(worker_seed)
diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py
index d368c340..7f5d4ec3 100644
--- a/modelscope/trainers/utils/inference.py
+++ b/modelscope/trainers/utils/inference.py
@@ -11,6 +11,7 @@ import torch
 from torch import distributed as dist
 from tqdm import tqdm
 
+from modelscope.trainers.parallel.utils import is_parallel
 from modelscope.utils.data_utils import to_device
 from modelscope.utils.file_utils import func_receive_dict_inputs
 from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master,
@@ -134,7 +135,10 @@ def multi_gpu_test(model,
         data_len = data_loader_iters_per_gpu * world_size
         desc = 'Total test iterations with multi gpus'
 
-    time.sleep(2)  # This line can prevent deadlock problem in some cases.
+    if is_parallel(model):
+        receive_dict_inputs = func_receive_dict_inputs(model.module.forward)
+    else:
+        receive_dict_inputs = func_receive_dict_inputs(model.forward)
 
     count = 0
     with tqdm(total=data_len, desc=desc) as pbar:
@@ -142,8 +146,7 @@ def multi_gpu_test(model,
             data = to_device(data, device)
             data_list.append(data)
             with torch.no_grad():
-                if isinstance(data, Mapping) and not func_receive_dict_inputs(
-                        model.forward):
+                if isinstance(data, Mapping) and not receive_dict_inputs:
                     result = model.forward(**data)
                 else:
                     result = model.forward(data)
diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py
index 2d2f61d8..f59100cb 100644
--- a/modelscope/utils/ast_utils.py
+++ b/modelscope/utils/ast_utils.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import ast
 import contextlib
 import hashlib
@@ -15,9 +17,9 @@ import json
 
 from modelscope import __version__
 from modelscope.fileio.file import LocalStorage
-from modelscope.metainfo import (Heads, Hooks, LR_Schedulers, Metrics, Models,
-                                 Optimizers, Pipelines, Preprocessors,
-                                 TaskModels, Trainers)
+from modelscope.metainfo import (Datasets, Heads, Hooks, LR_Schedulers,
+                                 Metrics, Models, Optimizers, Pipelines,
+                                 Preprocessors, TaskModels, Trainers)
 from modelscope.utils.constant import Fields, Tasks
 from modelscope.utils.file_utils import get_default_cache_dir
 from modelscope.utils.logger import get_logger
@@ -32,11 +34,11 @@ MODELSCOPE_PATH = p.resolve().parents[1]
 REGISTER_MODULE = 'register_module'
 IGNORED_PACKAGES = ['modelscope', '.']
 SCAN_SUB_FOLDERS = [
-    'models', 'metrics', 'pipelines', 'preprocessors',
-    'msdatasets/task_datasets', 'trainers'
+    'models', 'metrics', 'pipelines', 'preprocessors', 'trainers', 'msdatasets'
 ]
 INDEXER_FILE = 'ast_indexer'
 DECORATOR_KEY = 'decorators'
+EXPRESS_KEY = 'express'
 FROM_IMPORT_KEY = 'from_imports'
 IMPORT_KEY = 'imports'
 FILE_NAME_KEY = 'filepath'
@@ -46,6 +48,9 @@ INDEX_KEY = 'index'
 REQUIREMENT_KEY = 'requirements'
 MODULE_KEY = 'module'
 CLASS_NAME = 'class_name'
+GROUP_KEY = 'group_key'
+MODULE_NAME = 'module_name'
+MODULE_CLS = 'module_cls'
 
 
 class AstScaning(object):
@@ -54,6 +59,7 @@ class AstScaning(object):
         self.result_import = dict()
         self.result_from_import = dict()
         self.result_decorator = []
+        self.result_express = []  # Call nodes found under Expr
 
     def _is_sub_node(self, node: object) -> bool:
         return isinstance(node,
@@ -109,6 +115,7 @@ class AstScaning(object):
         self.result_import = dict()
         self.result_from_import = dict()
         self.result_decorator = []
+        self.result_express = []
 
     def scan_ast(self, node: Union[ast.AST, None, str]):
         self._setup_global()
@@ -244,13 +251,19 @@ class AstScaning(object):
                             setattr(item, CLASS_NAME, node.name)
                         self.result_decorator.extend(attr)
 
+                    if attr != [] and type(
+                            attr
+                    ).__name__ == 'Call' and parent_node_name == 'Expr':
+                        self.result_express.append(attr)
+
                     out += f'{indentstr()}{field}={representation},\n'
 
             out += indentstr() + ')'
             return {
                 IMPORT_KEY: self.result_import,
                 FROM_IMPORT_KEY: self.result_from_import,
-                DECORATOR_KEY: self.result_decorator
+                DECORATOR_KEY: self.result_decorator,
+                EXPRESS_KEY: self.result_express
             }, out
 
     def _parse_decorator(self, node: ast.AST) -> tuple:
@@ -268,7 +281,10 @@ class AstScaning(object):
         def _get_args_name(nodes: list) -> list:
             result = []
             for node in nodes:
-                result.append(_get_attribute_item(node))
+                if type(node).__name__ == 'Str':
+                    result.append((node.s, None))
+                else:
+                    result.append(_get_attribute_item(node))
             return result
 
         def _get_keyword_name(nodes: ast.AST) -> list:
@@ -277,9 +293,14 @@ class AstScaning(object):
                 if type(node).__name__ == 'keyword':
                     attribute_node = getattr(node, 'value')
                     if type(attribute_node).__name__ == 'Str':
-                        result.append((attribute_node.s, None))
+                        result.append((getattr(node,
+                                               'arg'), attribute_node.s, None))
+                    elif type(attribute_node).__name__ == 'Constant':
+                        result.append(
+                            (getattr(node, 'arg'), attribute_node.value, None))
                     else:
-                        result.append(_get_attribute_item(attribute_node))
+                        result.append((getattr(node, 'arg'), )
+                                      + _get_attribute_item(attribute_node))
             return result
 
         functions = _get_attribute_item(node.func)
@@ -316,10 +337,26 @@ class AstScaning(object):
             args_list.append(default_group)
         if len(keyword_list) == 0 and len(args_list) == 1:
             args_list.append(class_name)
-        if len(keyword_list) == 1 and len(args_list) == 0:
+
+        if len(keyword_list) > 0 and len(args_list) == 0:
+            remove_group_item = None
+            for item in keyword_list:
+                key, name, attr = item
+                if key == GROUP_KEY:
+                    args_list.append((name, attr))
+                    remove_group_item = item
+            if remove_group_item is not None:
+                keyword_list.remove(remove_group_item)
+
+        if len(args_list) == 0:
             args_list.append(default_group)
 
-        args_list.extend(keyword_list)
+        for item in keyword_list:
+            key, name, attr = item
+            if key == MODULE_CLS:
+                class_name = name
+            else:
+                args_list.append((name, attr))
 
         for item in args_list:
             # the case empty input
@@ -348,22 +385,29 @@ class AstScaning(object):
         for node in nodes:
             if type(node).__name__ != 'Call':
                 continue
+            class_name = getattr(node, CLASS_NAME, None)
+            func = getattr(node, 'func')
+
+            if getattr(func, 'attr', None) != REGISTER_MODULE:
+                continue
+
             parse_output = self._parse_decorator(node)
-            index = self._registry_indexer(parse_output,
-                                           getattr(node, CLASS_NAME))
+            index = self._registry_indexer(parse_output, class_name)
             if None is not index:
                 results.append(index)
         return results
 
     def generate_ast(self, file):
         self._refresh()
-        with open(file, 'r') as code:
+        with open(file, 'r', encoding='utf8') as code:
             data = code.readlines()
         data = ''.join(data)
 
         node = gast.parse(data)
         output, _ = self.scan_import(node, indent='  ', show_offsets=False)
         output[DECORATOR_KEY] = self.parse_decorators(output[DECORATOR_KEY])
+        output[EXPRESS_KEY] = self.parse_decorators(output[EXPRESS_KEY])
+        output[DECORATOR_KEY].extend(output[EXPRESS_KEY])
         return output
 
 
@@ -482,6 +526,13 @@ class FilesAstScaning(object):
             module_import[value_dict[MODULE_KEY]] = value_dict[IMPORT_KEY]
         return module_import
 
+    def _ignore_useless_keys(self, inverted_index):
+        """Remove the two (<registry>, 'default', 'name') keys, if present."""
+        for registry in ('OPTIMIZERS', 'LR_SCHEDULER'):
+            # pop with a default is a no-op when the key is absent
+            inverted_index.pop((registry, 'default', 'name'), None)
+        return inverted_index
+
     def get_files_scan_results(self,
                                target_dir=MODELSCOPE_PATH,
                                target_folders=SCAN_SUB_FOLDERS):
@@ -515,6 +566,8 @@ class FilesAstScaning(object):
                 MODULE_KEY: module_name
             }
         inverted_index_with_results = self._inverted_index(result)
+        inverted_index_with_results = self._ignore_useless_keys(
+            inverted_index_with_results)
         module_import = self._module_import(result)
         index = {
             INDEX_KEY: inverted_index_with_results,
diff --git a/modelscope/utils/audio/audio_utils.py b/modelscope/utils/audio/audio_utils.py
new file mode 100644
index 00000000..4c2c45cc
--- /dev/null
+++ b/modelscope/utils/audio/audio_utils.py
@@ -0,0 +1,80 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import struct
+from typing import Union
+from urllib.parse import urlparse
+
+from modelscope.fileio.file import HTTPStorage
+
+SEGMENT_LENGTH_TRAIN = 16000
+
+
+def to_segment(batch, segment_length=SEGMENT_LENGTH_TRAIN):
+    """
+    Dataset mapping function to split one audio into segments.
+    It only works in batch mode.
+    """
+    noisy_arrays = []
+    clean_arrays = []
+    for x, y in zip(batch['noisy'], batch['clean']):
+        length = min(len(x['array']), len(y['array']))
+        noisy = x['array']
+        clean = y['array']
+        for offset in range(segment_length, length + 1, segment_length):
+            noisy_arrays.append(noisy[offset - segment_length:offset])
+            clean_arrays.append(clean[offset - segment_length:offset])
+    return {'noisy': noisy_arrays, 'clean': clean_arrays}
+
+
+def audio_norm(x):
+    rms = (x**2).mean()**0.5
+    scalar = 10**(-25 / 20) / rms
+    x = x * scalar
+    pow_x = x**2
+    avg_pow_x = pow_x.mean()
+    rmsx = pow_x[pow_x > avg_pow_x].mean()**0.5
+    scalarx = 10**(-25 / 20) / rmsx
+    x = x * scalarx
+    return x
+
+
+def extract_pcm_from_wav(wav: bytes) -> bytes:
+    data = wav
+    if len(data) > 44:
+        frame_len = 44
+        file_len = len(data)
+        try:
+            header_fields = {}
+            header_fields['ChunkID'] = str(data[0:4], 'UTF-8')
+            header_fields['Format'] = str(data[8:12], 'UTF-8')
+            header_fields['Subchunk1ID'] = str(data[12:16], 'UTF-8')
+            if header_fields['ChunkID'] == 'RIFF' and header_fields[
+                    'Format'] == 'WAVE' and header_fields[
+                        'Subchunk1ID'] == 'fmt ':
+                header_fields['SubChunk1Size'] = struct.unpack(
+                    '<I', data[16:20])[0]
+
+                if header_fields['SubChunk1Size'] == 16:
+                    frame_len = 44
+                elif header_fields['SubChunk1Size'] == 18:
+                    frame_len = 46
+                else:
+                    return data
+
+                data = wav[frame_len:file_len]
+        except Exception:
+            # no treatment
+            pass
+
+    return data
+
+
+def load_bytes_from_url(url: str) -> Union[bytes, str]:
+    """Fetch ``url`` and strip its WAV header; return non-URLs unchanged.
+
+    Only ``http``/``https`` are fetched. Testing merely for a non-empty
+    scheme would also match Windows drive letters (``C:\\a.wav`` parses
+    with scheme ``c``) and send a local path to ``HTTPStorage``.
+    """
+    if urlparse(url).scheme in ('http', 'https'):
+        return extract_pcm_from_wav(HTTPStorage().read(url))
+    return url
diff --git a/modelscope/utils/audio/tts_exceptions.py b/modelscope/utils/audio/tts_exceptions.py
index 8c73b603..43ec994b 100644
--- a/modelscope/utils/audio/tts_exceptions.py
+++ b/modelscope/utils/audio/tts_exceptions.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 """
 Define TTS exceptions
 """
@@ -10,7 +11,7 @@ class TtsException(Exception):
     pass
 
 
-class TtsModelConfigurationExcetion(TtsException):
+class TtsModelConfigurationException(TtsException):
     """
     TTS model configuration exceptions.
     """
diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py
index 76fb2a19..a9d7f396 100644
--- a/modelscope/utils/checkpoint.py
+++ b/modelscope/utils/checkpoint.py
@@ -1,15 +1,26 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import io
+import os
 import time
 from collections import OrderedDict
-from typing import Optional
+from shutil import copytree, ignore_patterns, rmtree
+from typing import Callable, List, Optional, Union
 
+import json
 import torch
 from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
 
 from modelscope import __version__
-from modelscope.fileio import File
+from modelscope.fileio import File, LocalStorage
+from modelscope.utils.config import JSONIteratorEncoder
+from modelscope.utils.constant import ConfigFields, ModelFile
+from modelscope.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+storage = LocalStorage()
 
 
 def weights_to_cpu(state_dict):
@@ -32,23 +43,27 @@ def weights_to_cpu(state_dict):
 def save_checkpoint(model: torch.nn.Module,
                     filename: str,
                     optimizer: Optional[Optimizer] = None,
-                    meta: Optional[dict] = None) -> None:
+                    lr_scheduler: Optional[_LRScheduler] = None,
+                    meta: Optional[dict] = None,
+                    with_meta: bool = True) -> None:
     """Save checkpoint to file.
 
     The checkpoint will have 3 fields: ``meta``, ``state_dict`` and
-    ``optimizer``. By default ``meta`` will contain version and time info.
+    ``optimizer``. By default, ``meta`` will contain version and time info.
 
     Args:
         model (Module): Module whose params are to be saved.
         filename (str): Checkpoint filename.
         optimizer (:obj:`Optimizer`, optional): Optimizer to be saved.
+        lr_scheduler(:obj:`_LRScheduler`, optional): LRScheduler to be saved.
         meta (dict, optional): Metadata to be saved in checkpoint.
+        with_meta (bool, optional): If False, save only the bare state dict.
     """
     if meta is None:
         meta = {}
     elif not isinstance(meta, dict):
         raise TypeError(f'meta must be a dict or None, but got {type(meta)}')
-    meta.update(modescope=__version__, time=time.asctime())
+    meta.update(modelscope=__version__, time=time.asctime())
 
     if isinstance(model, torch.nn.parallel.DistributedDataParallel):
         model = model.module
@@ -57,18 +72,141 @@ def save_checkpoint(model: torch.nn.Module,
         # save class name to the meta
         meta.update(CLASSES=model.CLASSES)
 
-    checkpoint = {
-        'meta': meta,
-        'state_dict': weights_to_cpu(model.state_dict())
-    }
-    # save optimizer state dict in the checkpoint
-    if isinstance(optimizer, Optimizer):
-        checkpoint['optimizer'] = optimizer.state_dict()
-    elif isinstance(optimizer, dict):
-        checkpoint['optimizer'] = {}
-        for name, optim in optimizer.items():
-            checkpoint['optimizer'][name] = optim.state_dict()
+    if with_meta:
+        checkpoint = {
+            'meta': meta,
+            'state_dict': weights_to_cpu(model.state_dict())
+        }
+
+        # save optimizer state dict in the checkpoint
+        if isinstance(optimizer, Optimizer):
+            checkpoint['optimizer'] = optimizer.state_dict()
+        elif isinstance(optimizer, dict):
+            checkpoint['optimizer'] = {}
+            for name, optim in optimizer.items():
+                checkpoint['optimizer'][name] = optim.state_dict()
+
+        # save lr_scheduler state dict in the checkpoint
+        if lr_scheduler is not None and hasattr(lr_scheduler, 'state_dict'):
+            checkpoint['lr_scheduler'] = lr_scheduler.state_dict()
+    else:
+        checkpoint = weights_to_cpu(model.state_dict())
 
     with io.BytesIO() as f:
         torch.save(checkpoint, f)
         File.write(f.getvalue(), filename)
+
+
+def load_checkpoint(filename,
+                    model,
+                    optimizer: Optimizer = None,
+                    lr_scheduler: _LRScheduler = None):
+    """Restore model / optimizer / lr_scheduler state from ``filename``.
+
+    Returns the checkpoint's ``meta`` entry, or None when it has none.
+    """
+    if not os.path.exists(filename):
+        raise ValueError(f'Checkpoint file {filename} does not exist!')
+    checkpoint = torch.load(filename, map_location='cpu')
+
+    if optimizer is not None:
+        if 'optimizer' in checkpoint:
+            if isinstance(optimizer, Optimizer):
+                optimizer.load_state_dict(checkpoint['optimizer'])
+            elif isinstance(optimizer, dict):
+                optimizer_dict = checkpoint['optimizer']
+                for key, optimizer_ins in optimizer.items():
+                    if key in optimizer_dict:
+                        optimizer_ins.load_state_dict(optimizer_dict[key])
+                    else:
+                        logger.warning(
+                            f'The state dict of optimizer {key} cannot be found in checkpoint file: {filename}'
+                        )
+        else:
+            logger.warning(
+                f'The state dict of optimizer cannot be found in checkpoint file: {filename}'
+            )
+
+    if lr_scheduler is not None:
+        if 'lr_scheduler' in checkpoint:
+            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+        else:
+            logger.warning(
+                f'The state dict of lr_scheduler cannot be found in checkpoint file: {filename}'
+            )
+    # A checkpoint without a 'state_dict' key is itself the bare state dict.
+    model.load_state_dict(checkpoint.get('state_dict', checkpoint))
+    return checkpoint.get('meta')
+
+
+def save_pretrained(model,
+                    target_folder: Union[str, os.PathLike],
+                    save_checkpoint_name: str = None,
+                    save_function: Callable = None,
+                    config: Optional[dict] = None,
+                    **kwargs):
+    """save the pretrained model, its configuration and other related files to a directory, so that it can be re-loaded
+
+    Args:
+        model (Model): Model whose params are to be saved.
+
+        target_folder (Union[str, os.PathLike]):
+        Directory to which to save. Will be created if it doesn't exist.
+
+        save_checkpoint_name (str):
+        The checkpoint name to be saved in the target_folder
+
+        save_function (Callable, optional):
+        The function to use to save the state dictionary.
+
+        config (Optional[dict], optional):
+        The config for the configuration.json, might not be identical with model.config
+    """
+
+    if save_function is None or not isinstance(save_function, Callable):
+        raise Exception('A valid save function must be passed in')
+
+    if target_folder is None or os.path.isfile(target_folder):
+        raise ValueError(
+            f'Provided path ({target_folder}) should be a directory, not a file'
+        )
+
+    if save_checkpoint_name is None:
+        raise Exception(
+            'At least pass in one checkpoint name for saving method')
+
+    if config is None:
+        raise ValueError('Configuration is not valid')
+
+    # Clean the folder from a previous save
+    if os.path.exists(target_folder):
+        rmtree(target_folder)
+
+    # Single ckpt path, sharded ckpt logic will be added later
+    output_ckpt_path = os.path.join(target_folder, save_checkpoint_name)
+
+    # Save the files to be copied to the save directory, ignore the original ckpts and configuration
+    origin_file_to_be_ignored = [save_checkpoint_name]
+    ignore_file_set = set(origin_file_to_be_ignored)
+    ignore_file_set.add(ModelFile.CONFIGURATION)
+    ignore_file_set.add('.*')
+    if hasattr(model, 'model_dir') and model.model_dir is not None:
+        copytree(
+            model.model_dir,
+            target_folder,
+            ignore=ignore_patterns(*ignore_file_set))
+
+    # Save the ckpt to the save directory
+    try:
+        save_function(model, output_ckpt_path, **kwargs)
+    except Exception as e:
+        # Chain the original exception so its traceback is not lost.
+        raise Exception(f'During saving checkpoints, the error of '
+                        f'"{type(e).__name__}" with msg "{e}" was thrown') from e
+
+    # Dump the config to the configuration.json
+    if ConfigFields.pipeline not in config:
+        config[ConfigFields.pipeline] = {'type': config[ConfigFields.task]}
+    cfg_str = json.dumps(config, cls=JSONIteratorEncoder)
+    config_file = os.path.join(target_folder, ModelFile.CONFIGURATION)
+    storage.write(cfg_str.encode(), config_file)
diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py
index a28ac1ab..0b966bef 100644
--- a/modelscope/utils/config.py
+++ b/modelscope/utils/config.py
@@ -1,4 +1,6 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright (c) OpenMMLab. All rights reserved.
+# Major implementation is borrowed and modified from
+# https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/config.py
 
 import copy
 import os
@@ -9,9 +11,11 @@ import sys
 import tempfile
 import types
 from pathlib import Path
+from types import FunctionType
 from typing import Dict, Union
 
 import addict
+import json
 from yapf.yapflib.yapf_api import FormatCode
 
 from modelscope.utils.constant import ConfigFields, ModelFile
@@ -627,3 +631,22 @@ def check_config(cfg: Union[str, ConfigDict]):
         check_attr(ConfigFields.model)
         check_attr(ConfigFields.preprocessor)
         check_attr(ConfigFields.evaluation)
+
+
+class JSONIteratorEncoder(json.JSONEncoder):
+    """JSON encoder that tolerates functions and arbitrary iterables.
+
+    Functions become ``None``, iterables become lists; anything else falls
+    through to the base implementation (which raises ``TypeError``).
+    """
+
+    def default(self, obj):
+        if isinstance(obj, FunctionType):
+            return None
+        try:
+            iterable = iter(obj)
+        except TypeError:
+            pass
+        else:
+            return list(iterable)
+        return json.JSONEncoder.default(self, obj)
diff --git a/modelscope/msdatasets/config.py b/modelscope/utils/config_ds.py
similarity index 95%
rename from modelscope/msdatasets/config.py
rename to modelscope/utils/config_ds.py
index bafe3f99..fce823c4 100644
--- a/modelscope/msdatasets/config.py
+++ b/modelscope/utils/config_ds.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 from pathlib import Path
 
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index d914767b..7968fcd1 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -20,10 +20,14 @@ class CVTasks(object):
     animal_recognition = 'animal-recognition'
     face_detection = 'face-detection'
     face_recognition = 'face-recognition'
+    facial_expression_recognition = 'facial-expression-recognition'
+    face_2d_keypoints = 'face-2d-keypoints'
     human_detection = 'human-detection'
     human_object_interaction = 'human-object-interaction'
     face_image_generation = 'face-image-generation'
     body_2d_keypoints = 'body-2d-keypoints'
+    body_3d_keypoints = 'body-3d-keypoints'
+    hand_2d_keypoints = 'hand-2d-keypoints'
     general_recognition = 'general-recognition'
 
     image_classification = 'image-classification'
@@ -34,7 +38,14 @@ class CVTasks(object):
     image_object_detection = 'image-object-detection'
 
     image_segmentation = 'image-segmentation'
+    semantic_segmentation = 'semantic-segmentation'
     portrait_matting = 'portrait-matting'
+    text_driven_segmentation = 'text-driven-segmentation'
+    shop_segmentation = 'shop-segmentation'
+    hand_static = 'hand-static'
+    face_human_hand_detection = 'face-human-hand-detection'
+    face_emotion = 'face-emotion'
+    product_segmentation = 'product-segmentation'
 
     # image editing
     skin_retouching = 'skin-retouching'
@@ -57,10 +68,15 @@ class CVTasks(object):
     # video recognition
     live_category = 'live-category'
     action_recognition = 'action-recognition'
+    action_detection = 'action-detection'
     video_category = 'video-category'
     video_embedding = 'video-embedding'
     virtual_try_on = 'virtual-try-on'
     crowd_counting = 'crowd-counting'
+    movie_scene_segmentation = 'movie-scene-segmentation'
+
+    # video editing
+    video_inpainting = 'video-inpainting'
 
     # reid and tracking
     video_single_object_tracking = 'video-single-object-tracking'
@@ -78,12 +94,15 @@ class NLPTasks(object):
     sentiment_analysis = 'sentiment-analysis'
     sentence_similarity = 'sentence-similarity'
     text_classification = 'text-classification'
+    sentence_embedding = 'sentence-embedding'
+    passage_ranking = 'passage-ranking'
     relation_extraction = 'relation-extraction'
     zero_shot = 'zero-shot'
     translation = 'translation'
     token_classification = 'token-classification'
     conversational = 'conversational'
     text_generation = 'text-generation'
+    text2text_generation = 'text2text-generation'
     task_oriented_conversation = 'task-oriented-conversation'
     dialog_intent_prediction = 'dialog-intent-prediction'
     dialog_state_tracking = 'dialog-state-tracking'
@@ -95,7 +114,11 @@ class NLPTasks(object):
     zero_shot_classification = 'zero-shot-classification'
     backbone = 'backbone'
     text_error_correction = 'text-error-correction'
+    faq_question_answering = 'faq-question-answering'
     conversational_text_to_sql = 'conversational-text-to-sql'
+    information_extraction = 'information-extraction'
+    document_segmentation = 'document-segmentation'
+    feature_extraction = 'feature-extraction'
 
 
 class AudioTasks(object):
@@ -115,9 +138,27 @@ class MultiModalTasks(object):
     text_to_image_synthesis = 'text-to-image-synthesis'
     multi_modal_embedding = 'multi-modal-embedding'
     generative_multi_modal_embedding = 'generative-multi-modal-embedding'
+    multi_modal_similarity = 'multi-modal-similarity'
     visual_question_answering = 'visual-question-answering'
     visual_entailment = 'visual-entailment'
     video_multi_modal_embedding = 'video-multi-modal-embedding'
+    image_text_retrieval = 'image-text-retrieval'
+
+
+class TasksIODescriptions(object):  # IO schema names; every value is a str
+    image_to_image = 'image_to_image'
+    images_to_image = 'images_to_image'
+    image_to_text = 'image_to_text'
+    seed_to_image = 'seed_to_image'
+    text_to_speech = 'text_to_speech'
+    text_to_text = 'text_to_text'
+    speech_to_text = 'speech_to_text'
+    speech_to_speech = 'speech_to_speech'
+    speeches_to_speech = 'speeches_to_speech'
+    visual_grounding = 'visual_grounding'
+    visual_question_answering = 'visual_question_answering'
+    visual_entailment = 'visual_entailment'
+    generative_multi_modal_embedding = 'generative_multi_modal_embedding'
 
 
 class Tasks(CVTasks, NLPTasks, AudioTasks, MultiModalTasks):
@@ -211,6 +252,8 @@ class ModelFile(object):
     VOCAB_FILE = 'vocab.txt'
     ONNX_MODEL_FILE = 'model.onnx'
     LABEL_MAPPING = 'label_mapping.json'
+    TRAIN_OUTPUT_DIR = 'output'
+    TS_MODEL_FILE = 'model.ts'
 
 
 class ConfigFields(object):
@@ -253,6 +296,7 @@ class Frameworks(object):
 
 DEFAULT_MODEL_REVISION = 'master'
 DEFAULT_DATASET_REVISION = 'master'
+DEFAULT_DATASET_NAMESPACE = 'modelscope'
 
 
 class ModeKeys:
diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py
index da8de672..98ba533e 100644
--- a/modelscope/utils/cv/image_utils.py
+++ b/modelscope/utils/cv/image_utils.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import cv2
 import numpy as np
 
@@ -66,8 +68,15 @@ def draw_joints(image, np_kps, score, threshold=0.2):
 
 
 def draw_box(image, box):
-    cv2.rectangle(image, (int(box[0][0]), int(box[0][1])),
-                  (int(box[1][0]), int(box[1][1])), (0, 0, 255), 2)
+    cv2.rectangle(image, (int(box[0]), int(box[1])),
+                  (int(box[2]), int(box[3])), (0, 0, 255), 2)
+
+
+def realtime_object_detection_bbox_vis(image, bboxes):
+    for bbox in bboxes:
+        cv2.rectangle(image, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
+                      (255, 0, 0), 2)
+    return image
 
 
 def draw_keypoints(output, original_image):
@@ -82,6 +91,47 @@ def draw_keypoints(output, original_image):
     return image
 
 
+def draw_face_detection_no_lm_result(img_path, detection_result):
+    bboxes = np.array(detection_result[OutputKeys.BOXES])
+    scores = np.array(detection_result[OutputKeys.SCORES])
+    img = cv2.imread(img_path)
+    assert img is not None, f"Can't read img: {img_path}"
+    for i in range(len(scores)):
+        bbox = bboxes[i].astype(np.int32)
+        x1, y1, x2, y2 = bbox
+        score = scores[i]
+        cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2)
+        cv2.putText(
+            img,
+            f'{score:.2f}', (x1, y2),
+            1,
+            1.0, (0, 255, 0),
+            thickness=1,
+            lineType=8)
+    print(f'Found {len(scores)} faces')
+    return img
+
+
+def draw_facial_expression_result(img_path, facial_expression_result):
+    label_idx = facial_expression_result[OutputKeys.LABELS]
+    map_list = [
+        'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral'
+    ]
+    label = map_list[label_idx]
+
+    img = cv2.imread(img_path)
+    assert img is not None, f"Can't read img: {img_path}"
+    cv2.putText(
+        img,
+        'facial expression: {}'.format(label), (10, 10),
+        1,
+        1.0, (0, 255, 0),
+        thickness=1,
+        lineType=8)
+    print('facial expression: {}'.format(label))
+    return img
+
+
 def draw_face_detection_result(img_path, detection_result):
     bboxes = np.array(detection_result[OutputKeys.BOXES])
     kpss = np.array(detection_result[OutputKeys.KEYPOINTS])
@@ -134,3 +184,56 @@ def show_video_tracking_result(video_in_path, bboxes, video_save_path):
         video_writer.write(frame)
     video_writer.release
     cap.release()
+
+
+def panoptic_seg_masks_to_image(masks):
+    """Paint each mask with a distinct palette color on a black canvas."""
+    draw_img = np.zeros([masks[0].shape[0], masks[0].shape[1], 3])
+    from mmdet.core.visualization.palette import get_palette
+    from mmdet.core.visualization.image import _get_bias_color
+    mask_palette = get_palette('coco', 133)
+    # Seed with the black *tuple*: set([0, 0, 0]) would only hold the int 0.
+    taken_colors = {(0, 0, 0)}
+    for i, mask in enumerate(masks):
+        color_mask = mask_palette[i]
+        while tuple(color_mask) in taken_colors:
+            color_mask = _get_bias_color(color_mask)
+        taken_colors.add(tuple(color_mask))
+        mask = mask.astype(bool)
+        draw_img[mask] = color_mask
+
+    return draw_img
+
+
+def semantic_seg_masks_to_image(masks):
+    from mmdet.core.visualization.palette import get_palette
+    mask_palette = get_palette('coco', 133)
+
+    draw_img = np.zeros([masks[0].shape[0], masks[0].shape[1], 3])
+
+    for i, mask in enumerate(masks):
+        color_mask = mask_palette[i]
+        mask = mask.astype(bool)
+        draw_img[mask] = color_mask
+    return draw_img
+
+
+def show_video_summarization_result(video_in_path, result, video_save_path):
+    frame_indexes = result[OutputKeys.OUTPUT]
+    cap = cv2.VideoCapture(video_in_path)
+    for i in range(len(frame_indexes)):
+        idx = frame_indexes[i]
+        success, frame = cap.read()
+        if success is False:
+            raise Exception(video_in_path,
+                            ' can not be correctly decoded by OpenCV.')
+        if i == 0:
+            size = (frame.shape[1], frame.shape[0])
+            fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
+            video_writer = cv2.VideoWriter(video_save_path, fourcc,
+                                           cap.get(cv2.CAP_PROP_FPS), size,
+                                           True)
+        if idx == 1:
+            video_writer.write(frame)
+    video_writer.release()
+    cap.release()
diff --git a/modelscope/utils/demo_utils.py b/modelscope/utils/demo_utils.py
new file mode 100644
index 00000000..363ae950
--- /dev/null
+++ b/modelscope/utils/demo_utils.py
@@ -0,0 +1,289 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import io
+
+import cv2
+import json
+import numpy as np
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks, TasksIODescriptions
+
+TASKS_INPUT_TEMPLATES = {
+    # vision tasks
+    Tasks.image_portrait_stylization: TasksIODescriptions.image_to_image,
+    Tasks.portrait_matting: TasksIODescriptions.image_to_image,
+    Tasks.skin_retouching: TasksIODescriptions.image_to_image,
+    Tasks.image_captioning: TasksIODescriptions.image_to_text,
+    Tasks.image_denoising: TasksIODescriptions.image_to_image,
+    Tasks.image_portrait_enhancement: TasksIODescriptions.image_to_image,
+    Tasks.image_super_resolution: TasksIODescriptions.image_to_image,
+    Tasks.image_colorization: TasksIODescriptions.image_to_image,
+    Tasks.image_color_enhancement: TasksIODescriptions.image_to_image,
+    Tasks.face_image_generation: TasksIODescriptions.seed_to_image,
+    Tasks.image_style_transfer: TasksIODescriptions.images_to_image,
+    Tasks.image_segmentation: TasksIODescriptions.image_to_text,
+    Tasks.image_object_detection: TasksIODescriptions.image_to_text,
+
+    # not tested
+    Tasks.image_classification: TasksIODescriptions.image_to_text,
+    Tasks.ocr_detection: TasksIODescriptions.image_to_text,
+    Tasks.ocr_recognition: TasksIODescriptions.image_to_text,
+    Tasks.body_2d_keypoints: TasksIODescriptions.image_to_text,
+
+    # nlp tasks
+    Tasks.text_classification: TasksIODescriptions.text_to_text,
+    Tasks.text_generation: TasksIODescriptions.text_to_text,
+    Tasks.word_segmentation: TasksIODescriptions.text_to_text,
+    Tasks.text_error_correction: TasksIODescriptions.text_to_text,
+    Tasks.named_entity_recognition: TasksIODescriptions.text_to_text,
+    Tasks.sentiment_classification: TasksIODescriptions.text_to_text,
+
+    # audio tasks
+    Tasks.text_to_speech: TasksIODescriptions.text_to_speech,
+    Tasks.auto_speech_recognition: TasksIODescriptions.speech_to_text,
+    Tasks.keyword_spotting: TasksIODescriptions.speech_to_text,
+    Tasks.acoustic_noise_suppression: TasksIODescriptions.speech_to_speech,
+    Tasks.acoustic_echo_cancellation: TasksIODescriptions.speeches_to_speech,
+
+    # multi-modal
+    Tasks.visual_grounding: TasksIODescriptions.visual_grounding,
+    Tasks.visual_question_answering:
+    TasksIODescriptions.visual_question_answering,
+    Tasks.visual_entailment: TasksIODescriptions.visual_entailment,
+    Tasks.generative_multi_modal_embedding:
+    TasksIODescriptions.generative_multi_modal_embedding,
+
+    # new tasks
+    Tasks.virtual_try_on: TasksIODescriptions.images_to_image,
+
+    # TODO(lingcai.wl): support more tasks and implement corresponding example
+}
+
+INPUT_EXAMPLES = {
+    # Must align with task schema defined in the Widget section of model card=
+    # cv
+    TasksIODescriptions.image_to_image: {
+        'inputs': [
+            'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_cartoon.png'
+        ],
+        'urlPaths': {
+            'outUrls': [{
+                'outputKey': OutputKeys.OUTPUT_IMG,
+                'fileType': 'png'
+            }]
+        }
+    },
+    TasksIODescriptions.images_to_image: {
+        'inputs': [
+            'https://modelscope.oss-cn-beijing.aliyuncs.com/demo/image-style-transfer/style_transfer_content.jpg',
+            'https://modelscope.oss-cn-beijing.aliyuncs.com/demo/image-style-transfer/style_transfer_style.jpg'
+        ],
+        'urlPaths': {
+            'outUrls': [{
+                'outputKey': OutputKeys.OUTPUT_IMG,
+                'fileType': 'png'
+            }]
+        }
+    },
+    TasksIODescriptions.image_to_text: {
+        'inputs': [
+            'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_cartoon.png'
+        ],
+        'urlPaths': {}
+    },
+    # nlp
+    TasksIODescriptions.text_to_text: {
+        'inputs': ['test'],
+        'urlPaths': {}
+    },
+
+    # audio
+    TasksIODescriptions.speech_to_text: {
+        'inputs': [
+            'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example.wav'
+        ],
+        'urlPaths': {}
+    },
+    TasksIODescriptions.text_to_speech: {
+        'inputs': ['北京今天天气怎么样'],
+        'urlPaths': {
+            'outUrls': [{
+                'outputKey': OutputKeys.OUTPUT_PCM,
+                'fileType': 'pcm'
+            }]
+        }
+    },
+    TasksIODescriptions.speeches_to_speech: {
+        'inputs': [
+            'http://225252-file.oss-cn-hangzhou-zmf.aliyuncs.com/maas_demo/nearend_mic.wav',
+            'http://225252-file.oss-cn-hangzhou-zmf.aliyuncs.com/maas_demo/nearend_speech.wav'
+        ],
+        'urlPaths': {
+            'outUrls': [{
+                'outputKey': OutputKeys.OUTPUT_PCM,
+                'fileType': 'pcm'
+            }]
+        }
+    },
+    TasksIODescriptions.speech_to_speech: {
+        'inputs': [
+            'http://225252-file.oss-cn-hangzhou-zmf.aliyuncs.com/maas_demo/speech_with_noise.wav'
+        ],
+        'urlPaths': {
+            'outUrls': [{
+                'outputKey': OutputKeys.OUTPUT_PCM,
+                'fileType': 'pcm'
+            }]
+        }
+    },
+
+    # multi modal
+    TasksIODescriptions.visual_grounding: {
+        'task':
+        Tasks.visual_grounding,
+        'inputs': [
+            'http://xingchen-data.oss-cn-zhangjiakou.aliyuncs.com/maas/visual-grounding/visual_grounding.png',
+            'a blue turtle-like pokemon with round head'
+        ],
+        'urlPaths': {
+            'inUrls': [{
+                'name': 'image'
+            }, {
+                'name': 'text'
+            }]
+        }
+    },
+    TasksIODescriptions.visual_question_answering: {
+        'task':
+        Tasks.visual_question_answering,
+        'inputs': [
+            'http://225252-file.oss-cn-hangzhou-zmf.aliyuncs.com/maas_demo/visual_question_answering.png',
+            'what is grown on the plant?'
+        ],
+        'urlPaths': {
+            'inUrls': [{
+                'name': 'image'
+            }, {
+                'name': 'text'
+            }],
+            'outUrls': [{
+                'outputKey': 'text'
+            }]
+        }
+    },
+    TasksIODescriptions.visual_entailment: {
+        'task':
+        Tasks.visual_entailment,
+        'inputs': [
+            'http://xingchen-data.oss-cn-zhangjiakou.aliyuncs.com/maas/visual-entailment/visual_entailment.jpg',
+            'there are two birds.', 'test'
+        ],
+        'urlPaths': {
+            'inUrls': [{
+                'name': 'image'
+            }, {
+                'name': 'text'
+            }],
+            'outUrls': [{}]
+        }
+    },
+    TasksIODescriptions.generative_multi_modal_embedding: {
+        'task':
+        Tasks.generative_multi_modal_embedding,
+        'inputs': [
+            'http://clip-multimodal.oss-cn-beijing.aliyuncs.com/lingchen/demo/dogs.jpg',
+            'dogs playing in the grass'
+        ],
+        'urlPaths': {
+            'inUrls': [{
+                'name': 'image'
+            }, {
+                'name': 'text'
+            }],
+            'outUrls': [{}]
+        }
+    },
+}
+
+
+class DemoCompatibilityCheck(object):
+
+    def compatibility_check(self):
+        if self.task not in TASKS_INPUT_TEMPLATES:
+            print('task is not supported in demo service so far')
+            return False
+        if TASKS_INPUT_TEMPLATES[self.task] not in INPUT_EXAMPLES:
+            print('no example input for this task')
+            return False
+
+        print('testing demo: ', self.task, self.model_id)
+        test_pipline = pipeline(self.task, self.model_id)
+        req = INPUT_EXAMPLES[TASKS_INPUT_TEMPLATES[self.task]]
+        inputs = preprocess(req)
+        params = req.get('parameters', {})
+        # modelscope inference
+        if params != {}:
+            output = test_pipline(inputs, **params)
+        else:
+            output = test_pipline(inputs)
+        json.dumps(output, cls=NumpyEncoder)
+        result = postprocess(req, output)
+        print(result)
+        return True
+
+
+class NumpyEncoder(json.JSONEncoder):
+
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+
+        if isinstance(obj, np.floating):
+            return float(obj)
+
+        if isinstance(obj, np.integer):
+            return int(obj)
+
+        return json.JSONEncoder.default(self, obj)
+
+
+def preprocess(req):
+    in_urls = req.get('urlPaths').get('inUrls')
+    if len(req['inputs']) == 1:
+        inputs = req['inputs'][0]
+    else:
+        inputs = tuple(req['inputs'])
+    if in_urls is None or len(in_urls) == 0:
+        return inputs
+
+    inputs_dict = {}
+    for i, in_url in enumerate(in_urls):
+        input_name = in_url.get('name')
+        if input_name is None or input_name == '':
+            return inputs
+        inputs_dict[input_name] = req['inputs'][i]
+    return inputs_dict
+
+
+def postprocess(req, resp):
+    """Encode the first usable output entry and return the result's type."""
+    out_urls = req.get('urlPaths').get('outUrls')
+    if out_urls is None or len(out_urls) == 0:
+        return resp
+    new_resp = json.loads(resp) if isinstance(resp, str) else resp
+    for out_url in out_urls:
+        # Entries may omit either key (e.g. ``{}``), so never index them.
+        output_key = out_url.get('outputKey')
+        file_type = out_url.get('fileType')
+        if output_key is None or file_type is None:
+            continue
+        content = new_resp.get(output_key)
+        if file_type == 'png' or file_type == 'jpg':
+            _, img_encode = cv2.imencode('.' + file_type, content)
+            return type(img_encode.tobytes())
+        out_mem_file = io.BytesIO()
+        out_mem_file.write(content)
+        return type(out_mem_file)
+    return new_resp
diff --git a/modelscope/utils/device.py b/modelscope/utils/device.py
index aa8fda66..33c0910d 100644
--- a/modelscope/utils/device.py
+++ b/modelscope/utils/device.py
@@ -8,12 +8,6 @@ from modelscope.utils.logger import get_logger
 
 logger = get_logger()
 
-if is_tf_available():
-    import tensorflow as tf
-
-if is_torch_available():
-    import torch
-
 
 def verify_device(device_name):
     """ Verify device is valid, device should be either cpu, cuda, gpu, cuda:X or gpu:X.
@@ -25,10 +19,12 @@ def verify_device(device_name):
     Return:
         device info (tuple):  device_type and device_id, if device_id is not set, will use 0 as default.
     """
+    err_msg = 'device should be either cpu, cuda, gpu, gpu:X or cuda:X where X is the ordinal for gpu device.'
+    assert device_name is not None and device_name != '', err_msg
     device_name = device_name.lower()
     eles = device_name.split(':')
-    err_msg = 'device should be either cpu, cuda, gpu, gpu:X or cuda:X where X is the ordinal for gpu device.'
     assert len(eles) <= 2, err_msg
+    assert device_name is not None
     assert eles[0] in ['cpu', 'cuda', 'gpu'], err_msg
     device_type = eles[0]
     device_id = None
@@ -63,6 +59,7 @@ def device_placement(framework, device_name='gpu:0'):
     device_type, device_id = verify_device(device_name)
 
     if framework == Frameworks.tf:
+        import tensorflow as tf
         if device_type == Devices.gpu and not tf.test.is_gpu_available():
             logger.warning(
                 'tensorflow cuda is not available, using cpu instead.')
@@ -76,6 +73,7 @@ def device_placement(framework, device_name='gpu:0'):
                     yield
 
     elif framework == Frameworks.torch:
+        import torch
         if device_type == Devices.gpu:
             if torch.cuda.is_available():
                 torch.cuda.set_device(f'cuda:{device_id}')
@@ -86,12 +84,13 @@ def device_placement(framework, device_name='gpu:0'):
         yield
 
 
-def create_device(device_name) -> torch.DeviceObjType:
+def create_device(device_name):
     """ create torch device
 
     Args:
         device_name (str):  cpu, gpu, gpu:0, cuda:0 etc.
     """
+    import torch
     device_type, device_id = verify_device(device_name)
     use_cuda = False
     if device_type == Devices.gpu:
diff --git a/modelscope/utils/error.py b/modelscope/utils/error.py
index e7d1442f..a6bbc8b3 100644
--- a/modelscope/utils/error.py
+++ b/modelscope/utils/error.py
@@ -96,3 +96,18 @@ DECORD_IMPORT_ERROR = """
 {0} requires the decord library but it was not found in your environment. You can install it with pip:
 `pip install decord>=0.6.0`
 """
+
+# docstyle-ignore
+DEEPSPEED_IMPORT_ERROR = """
+{0} requires the Deepspeed library but it was not found in your environment. Checkout the instructions on the
+installation page: https://www.deepspeed.ai/tutorials/advanced-install/ and follow the ones that match your environment.
+"""
+
+# docstyle-ignore
+FAIRSEQ_IMPORT_ERROR = """
+{0} requires the fairseq library but it was not found in your environment.
+You can install it with pip on linux:
+`pip install fairseq`
+On windows, please checkout the instructions on the
+installation page: https://github.com/facebookresearch/fairseq and follow the ones that match your environment.
+"""
diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py
index 6d685b87..2dbe7045 100644
--- a/modelscope/utils/hub.py
+++ b/modelscope/utils/hub.py
@@ -10,7 +10,8 @@ from modelscope.hub.constants import Licenses, ModelVisibility
 from modelscope.hub.file_download import model_file_download
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.utils.config import Config
-from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
+from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields,
+                                       ModelFile)
 from .logger import get_logger
 
 logger = get_logger(__name__)
@@ -76,19 +77,26 @@ def auto_load(model: Union[str, List[str]]):
 def get_model_type(model_dir):
     """Get the model type from the configuration.
 
-    This method will try to get the 'model.type' or 'model.model_type' field from the configuration.json file.
-    If this file does not exist, the method will try to get the 'model_type' field from the config.json.
+    This method will try to get the model type from 'model.backbone.type',
+    'model.type' or 'model.model_type' field in the configuration.json file. If
+    this file does not exist, the method will try to get the 'model_type' field
+    from the config.json.
 
-    @param model_dir: The local model dir to use.
-    @return: The model type string, returns None if nothing is found.
+    @param model_dir: The local model dir to use.
+    @return: The model type string, returns None if nothing is found.
     """
     try:
         configuration_file = osp.join(model_dir, ModelFile.CONFIGURATION)
         config_file = osp.join(model_dir, 'config.json')
         if osp.isfile(configuration_file):
             cfg = Config.from_file(configuration_file)
-            return cfg.model.model_type if hasattr(cfg.model, 'model_type') and not hasattr(cfg.model, 'type') \
-                else cfg.model.type
+            if hasattr(cfg.model, 'backbone'):
+                return cfg.model.backbone.type
+            elif hasattr(cfg.model,
+                         'model_type') and not hasattr(cfg.model, 'type'):
+                return cfg.model.model_type
+            else:
+                return cfg.model.type
         elif osp.isfile(config_file):
             cfg = Config.from_file(config_file)
             return cfg.model_type if hasattr(cfg, 'model_type') else None
@@ -119,11 +127,27 @@ def parse_label_mapping(model_dir):
     if label2id is None:
         config_path = os.path.join(model_dir, ModelFile.CONFIGURATION)
         config = Config.from_file(config_path)
-        if hasattr(config, 'model') and hasattr(config.model, 'label2id'):
-            label2id = config.model.label2id
-    if label2id is None:
-        config_path = os.path.join(model_dir, 'config.json')
+        if hasattr(config, ConfigFields.model) and hasattr(
+                config[ConfigFields.model], 'label2id'):
+            label2id = config[ConfigFields.model].label2id
+        elif hasattr(config, ConfigFields.model) and hasattr(
+                config[ConfigFields.model], 'id2label'):
+            id2label = config[ConfigFields.model].id2label
+            label2id = {label: id for id, label in id2label.items()}
+        elif hasattr(config, ConfigFields.preprocessor) and hasattr(
+                config[ConfigFields.preprocessor], 'label2id'):
+            label2id = config[ConfigFields.preprocessor].label2id
+        elif hasattr(config, ConfigFields.preprocessor) and hasattr(
+                config[ConfigFields.preprocessor], 'id2label'):
+            id2label = config[ConfigFields.preprocessor].id2label
+            label2id = {label: id for id, label in id2label.items()}
+
+    config_path = os.path.join(model_dir, 'config.json')
+    if label2id is None and os.path.exists(config_path):
         config = Config.from_file(config_path)
         if hasattr(config, 'label2id'):
             label2id = config.label2id
+        elif hasattr(config, 'id2label'):
+            id2label = config.id2label
+            label2id = {label: id for id, label in id2label.items()}
     return label2id
diff --git a/modelscope/utils/import_utils.py b/modelscope/utils/import_utils.py
index c9bea020..2a6fdc80 100644
--- a/modelscope/utils/import_utils.py
+++ b/modelscope/utils/import_utils.py
@@ -290,6 +290,8 @@ REQUIREMENTS_MAAPING = OrderedDict([
     ('easyasr', (is_package_available('easyasr'), AUDIO_IMPORT_ERROR)),
     ('kwsbp', (is_package_available('kwsbp'), AUDIO_IMPORT_ERROR)),
     ('decord', (is_package_available('decord'), DECORD_IMPORT_ERROR)),
+    ('deepspeed', (is_package_available('deepspeed'), DEEPSPEED_IMPORT_ERROR)),
+    ('fairseq', (is_package_available('fairseq'), FAIRSEQ_IMPORT_ERROR)),
 ])
 
 SYSTEM_PACKAGE = set(['os', 'sys', 'typing'])
diff --git a/modelscope/utils/model_tag.py b/modelscope/utils/model_tag.py
new file mode 100644
index 00000000..7065e8f3
--- /dev/null
+++ b/modelscope/utils/model_tag.py
@@ -0,0 +1,184 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import logging
+import os
+
+import json
+import requests
+
+from modelscope.version import __version__
+
+
+# Model tagging: posts model test results to the service at MODEL_TAG_URL.
+class ModelTag(object):
+    _URL = os.environ.get('MODEL_TAG_URL', None)  # None when the variable is unset
+
+    # Endpoint for model test results (URLs are built once, at class definition)
+    BATCH_COMMIT_RESULT_URL = f'{_URL}/batchCommitResult'
+    # Endpoint for marking a test stage as finished
+    BATCH_REFRESH_STAGE_URL = f'{_URL}/batchRefreshStage'
+    # Endpoint for querying a model's stage
+    QUERY_MODEL_STAGE_URL = f'{_URL}/queryModelStage'
+
+    HEADER = {'Content-Type': 'application/json'}
+
+    # Check result codes
+    MODEL_SKIP = 0
+    MODEL_FAIL = 1
+    MODEL_PASS = 2
+
+    class ItemResult(object):
+        """A single named test outcome, serializable via to_json()."""
+        def __init__(self):
+            self.result = 0
+            self.name = ''
+            self.info = ''
+
+        def to_json(self):
+            return {
+                'name': self.name,
+                'result': self.result,
+                'info': self.info
+            }
+
+    def __init__(self):
+        self.job_name = ''
+        self.job_id = ''
+        self.model = ''
+        self.sdk_version = ''
+        self.image_version = ''
+        self.domain = ''
+        self.task = ''
+        self.source = ''
+        self.stage = ''
+        # List of ItemResult.to_json() dicts
+        self.item_result = []
+
+    # Send a POST request; returns the response 'content', or None on any failure
+    def _post_request(self, url, param):
+        try:
+            logging.info(url + ' query: '
+                         + str(json.dumps(param, ensure_ascii=False)))
+            res = requests.post(
+                url=url,
+                headers=self.HEADER,
+                data=json.dumps(param, ensure_ascii=False).encode('utf8'))
+            if res.status_code == 200:
+                logging.info(f'{url} post结果: ' + res.text)
+                res_json = json.loads(res.text)
+                if int(res_json['errorCode']) == 200:
+                    return res_json['content']
+                else:
+                    logging.error(res.text)
+            else:
+                logging.error(res.text)
+        except Exception as e:
+            logging.error(e)  # failures are logged, never raised to the caller
+
+        return None
+
+    # Submit the model test results
+    def batch_commit_result(self):
+        try:
+            param = {
+                'sdkVersion':
+                self.sdk_version,
+                'imageVersion':
+                self.image_version,
+                'source':
+                self.source,
+                'jobName':
+                self.job_name,
+                'jobId':
+                self.job_id,
+                'modelList': [{
+                    'model': self.model,
+                    'domain': self.domain,
+                    'task': self.task,
+                    'itemResult': self.item_result
+                }]
+            }
+            return self._post_request(self.BATCH_COMMIT_RESULT_URL, param)
+
+        except Exception as e:
+            logging.error(e)
+
+        return
+
+    # Mark the test stage as finished
+    def batch_refresh_stage(self):
+        try:
+            param = {
+                'sdkVersion':
+                self.sdk_version,
+                'imageVersion':
+                self.image_version,
+                'source':
+                self.source,
+                'stage':
+                self.stage,
+                'modelList': [{
+                    'model': self.model,
+                    'domain': self.domain,
+                    'task': self.task
+                }]
+            }
+            return self._post_request(self.BATCH_REFRESH_STAGE_URL, param)
+
+        except Exception as e:
+            logging.error(e)
+
+        return
+
+    # Query the latest test result of a model for one stage (returns a single result only)
+    def query_model_stage(self):
+        try:
+            param = {
+                'sdkVersion': self.sdk_version,
+                'model': self.model,
+                'stage': self.stage,
+                'imageVersion': self.image_version
+            }
+            return self._post_request(self.QUERY_MODEL_STAGE_URL, param)
+
+        except Exception as e:
+            logging.error(e)
+
+        return None
+
+    # Submit the model's UT (unit test) result; usage example follows
+    """
+        model_tag = ModelTag()
+        model_tag.model = "XXX"
+        model_tag.sdk_version = "0.3.7"
+        model_tag.domain = "nlp"
+        model_tag.task = "word-segmentation"
+        item = model_tag.ItemResult()
+        item.result = model_tag.MODEL_PASS
+        item.name = "ALL"
+        item.info = ""
+        model_tag.item_result.append(item.to_json())
+    """
+
+    def commit_ut_result(self):
+        if self._URL is not None and self._URL != '':  # no-op without MODEL_TAG_URL
+            self.job_name = 'UT'
+            self.source = 'dev'
+            self.stage = 'integration'
+
+            self.batch_commit_result()
+            self.batch_refresh_stage()
+
+
+def commit_model_ut_result(model_name, ut_result):
+    """Report a single overall ('ALL') UT result for ``model_name``."""
+    tag = ModelTag()
+    tag.model = model_name.replace('damo/', '')  # strip every 'damo/'
+    tag.sdk_version = __version__
+    # domain and task are not set here; they keep the '' defaults.
+    overall = tag.ItemResult()
+    overall.result = ut_result
+    overall.name, overall.info = 'ALL', ''
+    tag.item_result.append(overall.to_json())
+    # commit_ut_result() does nothing unless MODEL_TAG_URL is configured.
+    tag.commit_ut_result()
diff --git a/modelscope/utils/nlp/__init__.py b/modelscope/utils/nlp/__init__.py
index e69de29b..62c0b888 100644
--- a/modelscope/utils/nlp/__init__.py
+++ b/modelscope/utils/nlp/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:  # False at runtime, so only static type checkers take this branch
+    from .utils import import_external_nltk_data
+
+else:
+    _import_structure = {  # submodule name -> names it provides
+        'utils': ['import_external_nltk_data'],
+    }
+
+    import sys
+    # Replace this module's sys.modules entry; presumably defers importing .utils - see LazyImportModule.
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/utils/nlp/distributed.py b/modelscope/utils/nlp/distributed.py
new file mode 100755
index 00000000..2b590a10
--- /dev/null
+++ b/modelscope/utils/nlp/distributed.py
@@ -0,0 +1,130 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import torch
+import torch.distributed as dist
+from megatron import mpu
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+from torch.autograd import Variable
+from torch.nn.modules import Module
+
+from modelscope.utils.torch_utils import init_dist
+
+
+def initialize_distributed(rank, mpu, world_size, model_parallel_size,
+                           master_ip, master_port):
+    """Pin this rank to a GPU, join the NCCL group and set up model parallel.
+
+    ``world_size`` is the total process count; ``master_port`` is int or str.
+    """
+    torch.cuda.set_device(rank % torch.cuda.device_count())
+    torch.distributed.init_process_group(
+        backend='nccl',
+        world_size=world_size,  # was hard-coded to 8, ignoring the argument
+        rank=rank,
+        init_method=f'tcp://{master_ip}:{master_port}')
+    mpu.initialize_model_parallel(model_parallel_size)
+
+
+def normal_init_method(mean, std):
+    """Build an initializer that fills a tensor in place from N(mean, std).
+
+    The returned callable takes the tensor and returns that same tensor.
+    """
+    return lambda tensor: torch.nn.init.normal_(tensor, mean=mean, std=std)
+
+
+def scaled_init_method(mean, std, num_layers):
+    """Build an in-place initializer using N(mean, std / sqrt(2 * num_layers)).
+
+    The scaled deviation is computed once, when this factory is called; the
+    returned callable takes the tensor and returns that same tensor.
+    """
+    sigma = std / math.sqrt(2.0 * num_layers)
+    return lambda tensor: torch.nn.init.normal_(tensor, mean=mean, std=sigma)
+
+
+class DistributedDataParallel(Module):
+    """Wraps ``module`` and offers a manual, bucketed gradient all-reduce."""
+    def __init__(self, module):
+        super(DistributedDataParallel, self).__init__()
+        self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
+
+        self.module = module
+        self.data_parallel_group = mpu.get_data_parallel_group()
+        src_rank = mpu.get_model_parallel_rank()  # used as the broadcast source
+        for p in self.module.parameters():
+            if torch.is_tensor(p):
+                dist.broadcast(p, src_rank, group=self.data_parallel_group)
+        # Reduces only when `needs_reduction` is set; that flag is first created by forward().
+        def allreduce_params(reduce_after=True,
+                             no_scale=False,
+                             fp32_allreduce=False):
+            if (self.needs_reduction):
+                self.needs_reduction = False
+                buckets = {}  # tensor type string -> parameters that have a gradient
+                for name, param in self.module.named_parameters():
+                    if param.requires_grad and param.grad is not None:
+                        tp = (param.data.type())
+                        if tp not in buckets:
+                            buckets[tp] = []
+                        buckets[tp].append(param)
+                if self.warn_on_half:
+                    if torch.cuda.HalfTensor in buckets:
+                        print(
+                            'WARNING: gloo dist backend for half parameters may be extremely slow.',
+                            'It is recommended to use the NCCL backend in this case.'
+                        )
+                        self.warn_on_half = False
+                for tp in buckets:
+                    bucket = buckets[tp]
+                    grads = [param.grad.data for param in bucket]
+                    coalesced = _flatten_dense_tensors(grads)  # one flat buffer per bucket
+                    if fp32_allreduce:
+                        coalesced = coalesced.float()
+                    if not no_scale and not reduce_after:  # divide before reducing
+                        coalesced /= dist.get_world_size(
+                            group=self.data_parallel_group)
+                    dist.all_reduce(coalesced, group=self.data_parallel_group)
+                    torch.cuda.synchronize()
+                    if not no_scale and reduce_after:  # divide after reducing
+                        coalesced /= dist.get_world_size(
+                            group=self.data_parallel_group)
+                    for buf, synced in zip(
+                            grads, _unflatten_dense_tensors(coalesced, grads)):
+                        buf.copy_(synced)  # write the reduced values back into each grad
+
+        self.hook_handles = []
+        self.hooks = []
+        for param in list(self.module.parameters()):
+            # NOTE(review): the hook is defined but never registered, so both lists above stay empty.
+            def allreduce_hook(*unused):
+                Variable._execution_engine.queue_callback(allreduce_params)
+
+        self.allreduce_params = allreduce_params  # callers trigger the reduction through this
+
+    def forward(self, *inputs, **kwargs):
+        self.needs_reduction = True  # arm the next allreduce_params() call
+        return self.module(*inputs, **kwargs)
+
+    def state_dict(self, destination=None, prefix='', keep_vars=False):
+        sd = self.module.state_dict(destination, prefix, keep_vars)  # delegates to the wrapped module
+
+        return sd
+
+    def load_state_dict(self, state_dict, strict=True):
+        self.module.load_state_dict(state_dict, strict=strict)
diff --git a/modelscope/utils/nlp/load_checkpoint.py b/modelscope/utils/nlp/load_checkpoint.py
new file mode 100755
index 00000000..6534e18d
--- /dev/null
+++ b/modelscope/utils/nlp/load_checkpoint.py
@@ -0,0 +1,117 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import torch
+
+
+def load_checkpoint(model,
+                    load_dir,
+                    tag,
+                    load_module_strict=True,
+                    load_optimizer_states=True,
+                    load_lr_scheduler_states=True):
+    r"""Load a training checkpoint into ``model``.
+
+    Arguments:
+        model: Required. Object that receives the states (passed on to ``_load_checkpoint``).
+        load_dir: Required. Directory to load the checkpoint from
+        tag: Required. Checkpoint tag used as a unique identifier for the checkpoint. Ex. Global Step.
+        load_module_strict: Optional. Boolean to strictly enforce that the keys in state_dict of module and
+         checkpoint match.
+        load_optimizer_states: Optional. Boolean to load the training optimizer states from Checkpoint.
+         Ex. ADAM's momentum and variance
+        load_lr_scheduler_states: Optional. Boolean to add the learning rate scheduler states from Checkpoint.
+    Return:
+        load_path: Path of the loaded checkpoint. None if loading the checkpoint failed
+        client_state: State dictionary used for loading required training states in the client code.
+    """
+    load_path, client_states = _load_checkpoint(
+        model,
+        load_dir,
+        tag,
+        load_module_strict=load_module_strict,
+        load_optimizer_states=load_optimizer_states,
+        load_lr_scheduler_states=load_lr_scheduler_states)
+    # The ZeRO checkpoint is loaded separately, once the main file was found.
+    wants_zero = load_optimizer_states and model.zero_optimization()
+    if wants_zero and load_path is not None:
+        model._load_zero_checkpoint(
+            load_dir, tag, load_optimizer_states=load_optimizer_states)
+
+    return load_path, client_states
+
+
+def _get_ckpt_name(mpu, checkpoints_path, tag):
+    """Return <checkpoints_path>/<tag>/mp_rank_XX_model_states.pt."""
+    # XX is the model-parallel rank, or 00 when no mpu is given.
+    rank = mpu.get_model_parallel_rank() if mpu is not None else 0
+    filename = 'mp_rank_{:02d}'.format(rank) + '_model_states.pt'
+    return os.path.join(checkpoints_path, str(tag), filename)
+
+
+def pre_load(mpu, load_dir, tag=''):
+    """Load this rank's checkpoint file and return its 'module' entry."""
+    ckpt_file = _get_ckpt_name(mpu, load_dir, tag)
+    # map_location returns each storage unchanged (no device remapping).
+    return torch.load(ckpt_file, map_location=lambda s, _: s)['module']
+
+
+def _load_checkpoint(model,
+                     load_dir,
+                     tag,
+                     load_module_strict=True,
+                     load_optimizer_states=True,
+                     load_lr_scheduler_states=True):
+    """Restore module/optimizer/scheduler state; returns (load_path, client_state)."""
+    load_path = model._get_ckpt_name(load_dir, tag)
+
+    if not os.path.exists(load_path):
+        return None, None  # no checkpoint file: nothing is loaded
+
+    checkpoint = torch.load(
+        load_path, map_location=lambda storage, loc: storage)
+
+    model.load_module_state_dict(
+        state_dict=checkpoint['module'], strict=load_module_strict)
+    if not model.zero_optimization() and load_optimizer_states:  # non-ZeRO optimizers only
+        if model.fp16_enabled():
+            model.optimizer.load_state_dict(
+                checkpoint['optimizer'],
+                load_optimizer_states=load_optimizer_states)
+        elif load_optimizer_states:  # already guaranteed by the enclosing condition
+            model.optimizer.load_state_dict(checkpoint['optimizer'])
+
+    if load_lr_scheduler_states and model.lr_scheduler is not None:
+        model.lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+    # Copy the bookkeeping values stored in the checkpoint onto the model.
+    model.csr_tensor_module_names = checkpoint['csr_tensor_module_names']
+    model.global_steps = checkpoint['global_steps']
+    model.global_samples = checkpoint.get(
+        'global_samples', model.global_steps * model.train_batch_size())
+    model.skipped_steps = checkpoint['skipped_steps']
+    model.loaded_checkpoint_mp_world_size = checkpoint['mp_world_size']
+    model.loaded_checkpoint_dp_world_size = checkpoint['dp_world_size']
+    deepspeed_states = [
+        'module', 'optimizer', 'lr_scheduler', 'csr_tensor_module_names',
+        'skipped_steps', 'global_steps', 'dp_world_size', 'mp_world_size'
+    ]
+    client_state = {  # every checkpoint entry whose key is not listed above
+        key: value
+        for key, value in checkpoint.items() if key not in deepspeed_states
+    }
+
+    return load_path, client_state
diff --git a/modelscope/utils/nlp/nlp_utils.py b/modelscope/utils/nlp/nlp_utils.py
index 35b374f2..eba12103 100644
--- a/modelscope/utils/nlp/nlp_utils.py
+++ b/modelscope/utils/nlp/nlp_utils.py
@@ -2,7 +2,8 @@ from typing import List
 
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.nlp import (ConversationalTextToSqlPipeline,
-                                      DialogStateTrackingPipeline)
+                                      DialogStateTrackingPipeline,
+                                      TableQuestionAnsweringPipeline)
 
 
 def text2sql_tracking_and_print_results(
@@ -41,3 +42,17 @@ def tracking_and_print_dialog_states(
         print(json.dumps(result))
 
         history_states.extend([result[OutputKeys.OUTPUT], {}])
+
+
+def tableqa_tracking_and_print_results(
+        test_case, pipelines: List[TableQuestionAnsweringPipeline]):
+    """Run each pipeline over test_case['utterance'], threading SQL history."""
+    for pipeline in pipelines:
+        history = None  # each pipeline starts without any history
+        for question in test_case['utterance']:
+            query_input = {'question': question, 'history_sql': history}
+            output_dict = pipeline(query_input)
+            answer = output_dict['output']
+            print('output_dict', answer.string, answer.query)
+            # Feed this turn's history into the next question.
+            history = output_dict['history']
diff --git a/modelscope/utils/nlp/space/clean_dataset.py b/modelscope/utils/nlp/space/clean_dataset.py
new file mode 100644
index 00000000..4578ccc4
--- /dev/null
+++ b/modelscope/utils/nlp/space/clean_dataset.py
@@ -0,0 +1,333 @@
+import os
+import re
+
+from . import ontology
+
+
+def clean_text_split_dot(text):
+    """Put spaces around dots that join words or end a word before a space."""
+    # First 'abc.xyz' -> 'abc . xyz', then 'abc. ' / 'abc.. ' -> 'abc . '.
+    spaced = re.sub(r'([a-zT]+)\.([a-z])', r'\1 . \2', text)
+    return re.sub(r'(\w+)\.\.? ', r'\1 . ', spaced)
+
+
+def clean_text(data_dir, text):  # normalises one utterance; reads <data_dir>/mapping.pair
+    text = text.strip()
+    text = text.lower()  # every pattern below therefore sees lower-cased text
+    text = text.replace(u'’', "'")
+    text = text.replace(u'‘', "'")
+    text = text.replace(';', ',')
+    text = text.replace('"', ' ')
+    text = text.replace('/', ' and ')
+    text = text.replace("don't", "do n't")
+    text = clean_time(text)
+    baddata = {  # regex pattern -> replacement, applied in insertion order via re.sub
+        r'c\.b (\d), (\d) ([a-z])\.([a-z])': r'cb\1\2\3\4',
+        'c.b. 1 7 d.y': 'cb17dy',
+        'c.b.1 7 d.y': 'cb17dy',
+        'c.b 25, 9 a.q': 'cb259aq',
+        'isc.b 25, 9 a.q': 'is cb259aq',
+        'c.b2, 1 u.f': 'cb21uf',
+        'c.b 1,2 q.a': 'cb12qa',
+        '0-122-336-5664': '01223365664',
+        'postcodecb21rs': 'postcode cb21rs',
+        r'i\.d': 'id',
+        ' i d ': 'id',
+        'Telephone:01223358966': 'Telephone: 01223358966',  # NOTE(review): capital T cannot match lower-cased text
+        'depature': 'departure',
+        'depearting': 'departing',
+        '-type': ' type',
+        r'b[\s]?&[\s]?b': 'bed and breakfast',
+        'b and b': 'bed and breakfast',
+        r'guesthouse[s]?': 'guest house',
+        r'swimmingpool[s]?': 'swimming pool',
+        "wo n\'t": 'will not',
+        " \'d ": ' would ',
+        " \'m ": ' am ',
+        " \'re' ": ' are ',  # NOTE(review): pattern requires a quote after "re" - confirm intended
+        " \'ll' ": ' will ',  # NOTE(review): pattern requires a quote after "ll" - confirm intended
+        " \'ve ": ' have ',
+        r'^\'': '',
+        r'\'$': '',
+    }
+    for tmpl, good in baddata.items():
+        text = re.sub(tmpl, good, text)
+
+    text = re.sub(r'([a-zT]+)\.([a-z])', r'\1 . \2',
+                  text)  # 'abc.xyz' -> 'abc . xyz'
+    text = re.sub(r'(\w+)\.\.? ', r'\1 . ', text)  # if 'abc. ' -> 'abc . '
+    # Replace whole space-delimited tokens using the tab-separated pairs in mapping.pair.
+    with open(os.path.join(data_dir, 'mapping.pair'), 'r') as fin:
+        for line in fin.readlines():
+            fromx, tox = line.replace('\n', '').split('\t')  # expects exactly one tab per line
+            text = ' ' + text + ' '  # pad so tokens at either end can match
+            text = text.replace(' ' + fromx + ' ', ' ' + tox + ' ')[1:-1]
+
+    return text
+
+
+def clean_time(utter):
+    """Rewrite clock times in ``utter`` towards zero-padded 24-hour form."""
+    def hour_pm(m):  # hour captured in group 1, shifted by 12
+        return str(int(m.group(1)) + 12)
+    # The order of the substitutions matters: later ones see earlier output.
+    utter = re.sub(r'(\d+) ([ap]\.?m)', r'\1\2', utter)  # '9 am' -> '9am'
+    utter = re.sub(r'((?<!\d)\d:\d+)(am)?', r'0\1', utter)  # -> '09:30'
+    utter = re.sub(r'((?<!\d)\d)am', r'0\1:00', utter)  # '9am' -> '09:00'
+    utter = re.sub(r'((?<!\d)\d)pm', lambda m: hour_pm(m) + ':00', utter)
+    utter = re.sub(r'(\d+)(:\d+)pm', lambda m: hour_pm(m) + m.group(2), utter)
+    return re.sub(r'(\d+)a\.?m', r'\1', utter)  # leftover '10am' -> '10'
+
+
+def clean_slot_values(data_dir, domain, slot, value):
+    value = clean_text(data_dir, value)
+    if not value:
+        value = ''
+    elif value == 'not mentioned':
+        value = ''
+        # value = 'not mentioned' # if in DST setting
+    elif domain == 'attraction':
+        if slot == 'name':
+            if value == 't':
+                value = ''
+            if value == 'trinity':
+                value = 'trinity college'
+        elif slot == 'area':
+            if value in ['town centre', 'cent', 'center', 'ce']:
+                value = 'centre'
+            elif value in [
+                    'ely', 'in town', 'museum', 'norwich', 'same area as hotel'
+            ]:
+                value = ''
+            elif value in ['we']:
+                value = 'west'
+        elif slot == 'type':
+            if value in ['m', 'mus', 'musuem']:
+                value = 'museum'
+            elif value in ['art', 'architectural']:
+                value = 'architecture'
+            elif value in ['churches']:
+                value = 'church'
+            elif value in ['coll']:
+                value = 'college'
+            elif value in ['concert', 'concerthall']:
+                value = 'concert hall'
+            elif value in ['night club']:
+                value = 'nightclub'
+            elif value in [
+                    'mutiple sports', 'mutliple sports', 'sports', 'galleria'
+            ]:
+                value = 'multiple sports'
+            elif value in ['ol', 'science', 'gastropub', 'la raza']:
+                value = ''
+            elif value in ['swimmingpool', 'pool']:
+                value = 'swimming pool'
+            elif value in ['fun']:
+                value = 'entertainment'
+
+    elif domain == 'hotel':
+        if slot == 'area':
+            if value in [
+                    'cen', 'centre of town', 'near city center', 'center'
+            ]:
+                value = 'centre'
+            elif value in ['east area', 'east side']:
+                value = 'east'
+            elif value in ['in the north', 'north part of town']:
+                value = 'north'
+            elif value in ['we']:
+                value = 'west'
+        elif slot == 'day':
+            if value == 'monda':
+                value = 'monday'
+            elif value == 't':
+                value = 'tuesday'
+        elif slot == 'name':
+            if value == 'uni':
+                value = 'university arms hotel'
+            elif value == 'university arms':
+                value = 'university arms hotel'
+            elif value == 'acron':
+                value = 'acorn guest house'
+            elif value == 'ashley':
+                value = 'ashley hotel'
+            elif value == 'arbury lodge guesthouse':
+                value = 'arbury lodge guest house'
+            elif value == 'la':
+                value = 'la margherit'
+            elif value == 'no':
+                value = ''
+        elif slot == 'internet':
+            if value == 'does not':
+                value = 'no'
+            elif value in ['y', 'free', 'free internet']:
+                value = 'yes'
+            elif value in ['4']:
+                value = ''
+        elif slot == 'parking':
+            if value == 'n':
+                value = 'no'
+            elif value in ['free parking']:
+                value = 'yes'
+            elif value in ['y']:
+                value = 'yes'
+        elif slot in ['pricerange', 'price range']:
+            slot = 'pricerange'
+            if value == 'moderately':
+                value = 'moderate'
+            elif value in ['any']:
+                value = "do n't care"
+            elif value in ['any']:
+                value = "do n't care"
+            elif value in ['inexpensive']:
+                value = 'cheap'
+            elif value in ['2', '4']:
+                value = ''
+        elif slot == 'stars':
+            if value == 'two':
+                value = '2'
+            elif value == 'three':
+                value = '3'
+            elif value in [
+                    '4-star', '4 stars', '4 star', 'four star', 'four stars'
+            ]:
+                value = '4'
+        elif slot == 'type':
+            if value == '0 star rarting':
+                value = ''
+            elif value == 'guesthouse':
+                value = 'guest house'
+            elif value not in ['hotel', 'guest house', "do n't care"]:
+                value = ''
+    elif domain == 'restaurant':
+        if slot == 'area':
+            if value in [
+                    'center', 'scentre', 'center of town', 'city center',
+                    'cb30aq', 'town center', 'centre of cambridge',
+                    'city centre'
+            ]:
+                value = 'centre'
+            elif value == 'west part of town':
+                value = 'west'
+            elif value == 'n':
+                value = 'north'
+            elif value in ['the south']:
+                value = 'south'
+            elif value not in [
+                    'centre', 'south', "do n't care", 'west', 'east', 'north'
+            ]:
+                value = ''
+        elif slot == 'day':
+            if value == 'monda':
+                value = 'monday'
+            elif value == 't':
+                value = 'tuesday'
+        elif slot in ['pricerange', 'price range']:
+            slot = 'pricerange'
+            if value in ['moderately', 'mode', 'mo']:
+                value = 'moderate'
+            elif value in ['not']:
+                value = ''
+            elif value in ['inexpensive', 'ch']:
+                value = 'cheap'
+        elif slot == 'food':
+            if value == 'barbecue':
+                value = 'barbeque'
+        elif slot == 'pricerange':
+            if value == 'moderately':
+                value = 'moderate'
+        elif slot == 'time':
+            if value == '9:00':
+                value = '09:00'
+            elif value == '9:45':
+                value = '09:45'
+            elif value == '1330':
+                value = '13:30'
+            elif value == '1430':
+                value = '14:30'
+            elif value == '9:15':
+                value = '09:15'
+            elif value == '9:30':
+                value = '09:30'
+            elif value == '1830':
+                value = '18:30'
+            elif value == '9':
+                value = '09:00'
+            elif value == '2:00':
+                value = '14:00'
+            elif value == '1:00':
+                value = '13:00'
+            elif value == '3:00':
+                value = '15:00'
+    elif domain == 'taxi':
+        if slot in ['arriveBy', 'arrive by']:
+            slot = 'arriveby'
+            if value == '1530':
+                value = '15:30'
+            elif value == '15 minutes':
+                value = ''
+        elif slot in ['leaveAt', 'leave at']:
+            slot = 'leaveat'
+            if value == '1:00':
+                value = '01:00'
+            elif value == '21:4':
+                value = '21:04'
+            elif value == '4:15':
+                value = '04:15'
+            elif value == '5:45':
+                value = '05:45'
+            elif value == '0700':
+                value = '07:00'
+            elif value == '4:45':
+                value = '04:45'
+            elif value == '8:30':
+                value = '08:30'
+            elif value == '9:30':
+                value = '09:30'
+            value = value.replace('.', ':')
+
+    elif domain == 'train':
+        if slot in ['arriveBy', 'arrive by']:
+            slot = 'arriveby'
+            if value == '1':
+                value = '01:00'
+            elif value in ['does not care', 'doesnt care', "doesn't care"]:
+                value = "do n't care"
+            elif value == '8:30':
+                value = '08:30'
+            elif value == 'not 15:45':
+                value = ''
+            value = value.replace('.', ':')
+        elif slot == 'day':
+            if value == 'doesnt care' or value == "doesn't care":
+                value = "do n't care"
+        elif slot in ['leaveAt', 'leave at']:
+            slot = 'leaveat'
+            if value == '2:30':
+                value = '02:30'
+            elif value == '7:54':
+                value = '07:54'
+            elif value == 'after 5:45 pm':
+                value = '17:45'
+            elif value in [
+                    'early evening', 'friday', 'sunday', 'tuesday', 'afternoon'
+            ]:
+                value = ''
+            elif value == '12':
+                value = '12:00'
+            elif value == '1030':
+                value = '10:30'
+            elif value == '1700':
+                value = '17:00'
+            elif value in [
+                    'does not care', 'doesnt care', 'do nt care',
+                    "doesn't care"
+            ]:
+                value = "do n't care"
+
+            value = value.replace('.', ':')
+    if value in ['dont care', "don't care", 'do nt care', "doesn't care"]:
+        value = "do n't care"
+    if ontology.normlize_slot_names.get(slot):
+        slot = ontology.normlize_slot_names[slot]
+    return slot, value
diff --git a/modelscope/utils/nlp/space/utils.py b/modelscope/utils/nlp/space/utils.py
index ef38684a..81d1b1c5 100644
--- a/modelscope/utils/nlp/space/utils.py
+++ b/modelscope/utils/nlp/space/utils.py
@@ -4,8 +4,11 @@ from collections import OrderedDict
 import json
 import numpy as np
 
+from modelscope.utils.logger import get_logger
 from . import ontology
 
+logger = get_logger()
+
 
 def max_lens(X):
     lens = [len(X)]
@@ -117,8 +120,8 @@ class MultiWOZVocab(object):
     def construct(self):
         freq_dict_sorted = sorted(
             self._freq_dict.keys(), key=lambda x: -self._freq_dict[x])
-        print('Vocabulary size including oov: %d' %
-              (len(freq_dict_sorted) + len(self._idx2word)))
+        logger.info('Vocabulary size including oov: %d' %
+                    (len(freq_dict_sorted) + len(self._idx2word)))
         if len(freq_dict_sorted) + len(self._idx2word) < self.vocab_size:
             logging.warning(
                 'actual label set smaller than that configured: {}/{}'.format(
@@ -148,8 +151,9 @@ class MultiWOZVocab(object):
         for w, idx in self._word2idx.items():
             self._idx2word[idx] = w
         self.vocab_size_oov = len(self._idx2word)
-        print('vocab file loaded from "' + vocab_path + '"')
-        print('Vocabulary size including oov: %d' % (self.vocab_size_oov))
+        logger.info('vocab file loaded from "' + vocab_path + '"')
+        logger.info('Vocabulary size including oov: %d' %
+                    (self.vocab_size_oov))
 
     def save_vocab(self, vocab_path):
         _freq_dict = OrderedDict(
diff --git a/modelscope/utils/nlp/utils.py b/modelscope/utils/nlp/utils.py
new file mode 100644
index 00000000..13a21480
--- /dev/null
+++ b/modelscope/utils/nlp/utils.py
@@ -0,0 +1,20 @@
+import os.path as osp
+
+
+def import_external_nltk_data(nltk_data_dir, package_name):
+    """Register an external nltk_data directory and unpack one package.
+
+    Args:
+        nltk_data_dir (str): external nltk_data root, e.g. /home/xx/nltk_data
+        package_name (str): package path under the root, e.g. tokenizers/punkt
+    """
+    import nltk
+    import zipfile
+    nltk.data.path.append(nltk_data_dir)
+
+    extracted_dir = osp.join(nltk_data_dir, package_name)
+    if osp.exists(extracted_dir):
+        return  # already unpacked, nothing to extract
+    archive = osp.join(nltk_data_dir, package_name + '.zip')
+    with zipfile.ZipFile(archive) as bundle:
+        bundle.extractall(osp.dirname(extracted_dir))
diff --git a/modelscope/utils/registry.py b/modelscope/utils/registry.py
index 3cf88114..7a9c79e2 100644
--- a/modelscope/utils/registry.py
+++ b/modelscope/utils/registry.py
@@ -74,7 +74,6 @@ class Registry(object):
             raise KeyError(f'{module_name} is already registered in '
                            f'{self._name}[{group_key}]')
         self._modules[group_key][module_name] = module_cls
-        module_cls.group_key = group_key
 
     def register_module(self,
                         group_key: str = default_group,
@@ -196,6 +195,7 @@ def build_from_cfg(cfg,
         if obj_cls is None:
             raise KeyError(f'{obj_type} is not in the {registry.name}'
                            f' registry group {group_key}')
+        obj_cls.group_key = group_key
     elif inspect.isclass(obj_type) or inspect.isfunction(obj_type):
         obj_cls = obj_type
     else:
diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py
new file mode 100644
index 00000000..47bbadfe
--- /dev/null
+++ b/modelscope/utils/regress_test_utils.py
@@ -0,0 +1,726 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import contextlib
+import hashlib
+import os
+import pickle
+import random
+import shutil
+import tempfile
+from collections.abc import Mapping
+from pathlib import Path
+from types import FunctionType
+from typing import Any, Dict, Union
+
+import json
+import numpy as np
+import torch.optim
+from torch import nn
+
+
+class RegressTool:
+    """This class is used to stop inference/training results from changing by some unaware affections by unittests.
+
+    Firstly, run a baseline test to create a result file, then changes can be observed between
+    the latest version and the baseline file.
+    """
+
+    def __init__(self,
+                 baseline: bool = None,
+                 store_func: FunctionType = None,
+                 load_func: FunctionType = None):
+        """baseline: True records a baseline, False compares, None disables;
+        store_func/load_func(local, remote) override the default file I/O."""
+        self.baseline = baseline
+        self.store_func = store_func
+        self.load_func = load_func
+        print(f'Current working dir is: {Path.cwd()}')
+
+    def store(self, local, remote):
+        """Save `local` as baseline `remote` (custom func or local dir)."""
+        if self.store_func is not None:
+            self.store_func(local, remote)
+            return
+        dest = os.path.abspath(Path.cwd() / 'data' / 'test' / 'regression')
+        os.makedirs(dest, exist_ok=True)
+        shutil.copy(local, os.path.join(dest, remote))
+
+    def load(self, local, remote):
+        """Symlink baseline `remote` to `local`, or defer to `load_func`."""
+        if self.load_func is not None:
+            self.load_func(local, remote)
+            return
+        path = os.path.abspath(Path.cwd() / 'data' / 'test' / 'regression')
+        baseline = os.path.join(path, remote)
+        if not os.path.exists(baseline):
+            raise ValueError(f'base line file {baseline} not exist')
+        # read_bytes() closes the file, unlike the former bare open().read()
+        md5 = hashlib.md5(Path(baseline).read_bytes()).hexdigest()
+        print(f'local file found:{baseline}, md5:{md5}')
+        if os.path.lexists(local):  # lexists also sees a dangling symlink
+            os.remove(local)
+        os.symlink(baseline, local, target_is_directory=False)
+
+    @contextlib.contextmanager
+    def monitor_module_single_forward(self,
+                                      module: nn.Module,
+                                      file_name: str,
+                                      compare_fn=None):
+        """Monitor a pytorch module in a single forward.
+
+        @param module: A torch module
+        @param file_name: The file_name to store or load file
+        @param compare_fn: A custom fn used to compare the results manually.
+
+        >>> def compare_fn(v1, v2, key, type):
+        >>>     return None
+
+        v1 is the baseline value
+        v2 is the value of current version
+        key is the key of submodules
+        type is in one of 'input', 'output'
+        """
+        baseline = os.getenv('REGRESSION_BASELINE')
+        if baseline is None or self.baseline is None:
+            yield
+            return
+
+        baseline = self.baseline
+        io_json = {}
+        absolute_path = f'./{file_name}.bin'
+        if not isinstance(module, nn.Module):
+            assert hasattr(module, 'model')
+            module = module.model
+        hack_forward(module, file_name, io_json)
+        intercept_module(module, io_json)
+        try:
+            yield
+        finally:  # un-patch even when the monitored body raises
+            hack_forward(module, None, None, restore=True)
+            intercept_module(module, None, restore=True)
+        if baseline:
+            with open(absolute_path, 'wb') as f:
+                pickle.dump(io_json, f)
+            self.store(absolute_path, f'{file_name}.bin')
+            os.remove(absolute_path)
+        else:
+            name = os.path.basename(absolute_path)
+            baseline = os.path.join(tempfile.gettempdir(), name)
+            self.load(baseline, name)
+            with open(baseline, 'rb') as f:
+                baseline_json = pickle.load(f)
+
+            class NumpyEncoder(json.JSONEncoder):
+                """Special json encoder for numpy types"""
+
+                def default(self, obj):
+                    if isinstance(obj, np.integer):
+                        return int(obj)
+                    elif isinstance(obj, np.floating):
+                        return float(obj)
+                    elif isinstance(obj, np.ndarray):
+                        return obj.tolist()
+                    return json.JSONEncoder.default(self, obj)
+
+            print(f'baseline: {json.dumps(baseline_json, cls=NumpyEncoder)}')
+            print(f'latest  : {json.dumps(io_json, cls=NumpyEncoder)}')
+            if not compare_io_and_print(baseline_json, io_json, compare_fn):
+                raise ValueError('Result not match!')
+
+    @contextlib.contextmanager
+    def monitor_module_train(self,
+                             trainer: Union[Dict, Any],
+                             file_name,
+                             level='config',
+                             compare_fn=None,
+                             ignore_keys=None,
+                             compare_random=True,
+                             reset_dropout=True,
+                             lazy_stop_callback=None):
+        """Monitor a pytorch module's backward data and cfg data within a step of the optimizer.
+
+        This is usually useful when you try to change some dangerous code
+        which has the risk of affecting the training loop.
+
+        @param trainer: A dict or an object contains the model/optimizer/lr_scheduler
+        @param file_name: The file_name to store or load file
+        @param level: The regression level.
+            'strict' for matching every single tensor.
+                     Please make sure the parameters of head are fixed
+                     and the drop-out rate is zero.
+            'config' for matching the initial config, like cfg file, optimizer param_groups,
+                     lr_scheduler params and the random seed.
+            'metric' for compare the best metrics in the evaluation loop.
+        @param compare_fn: A custom fn used to compare the results manually.
+        @param ignore_keys: The keys to ignore of the named_parameters.
+        @param compare_random: If to compare random settings, default True.
+        @param reset_dropout: Reset all dropout modules to 0.0.
+        @param lazy_stop_callback: A callback passed in, when the monitoring is over, this callback will be called.
+
+        >>> def compare_fn(v1, v2, key, type):
+        >>>     return None
+
+        v1 is the baseline value
+        v2 is the value of current version
+        key is the key of modules/parameters
+        type is in one of 'input', 'output', 'backward', 'optimizer', 'lr_scheduler', 'cfg', 'state'
+        """
+        baseline = os.getenv('REGRESSION_BASELINE')
+        if baseline is None or self.baseline is None:
+            yield
+            return
+
+        baseline = self.baseline
+        io_json = {}
+        bw_json = {}
+        absolute_path = f'./{file_name}.bin'
+        if level == 'strict':
+            print(
+                "[Important] The level of regression is 'strict', please make sure your model's parameters are "
+                'fixed and all drop-out rates have been set to zero.')
+
+        assert hasattr(
+            trainer, 'model') or 'model' in trainer, 'model must be in trainer'
+        module = trainer['model'] if isinstance(trainer,
+                                                dict) else trainer.model
+        if not isinstance(module, nn.Module):
+            assert hasattr(module, 'model')
+            module = module.model
+
+        assert hasattr(
+            trainer, 'optimizer'
+        ) or 'optimizer' in trainer, 'optimizer must be in trainer'
+        assert hasattr(
+            trainer, 'lr_scheduler'
+        ) or 'lr_scheduler' in trainer, 'lr_scheduler must be in trainer'
+        optimizer: torch.optim.Optimizer = trainer['optimizer'] if isinstance(
+            trainer, dict) else trainer.optimizer
+        lr_scheduler: torch.optim.lr_scheduler._LRScheduler = trainer['lr_scheduler'] if isinstance(trainer, dict) \
+            else trainer.lr_scheduler
+        torch_state = numpify_tensor_nested(torch.get_rng_state())
+        np_state = np.random.get_state()
+        random_seed = random.getstate()
+        seed = trainer._seed if hasattr(
+            trainer,
+            '_seed') else trainer.seed if hasattr(trainer, 'seed') else None
+
+        if reset_dropout:
+            with torch.no_grad():
+
+                def reinit_dropout(_module):
+                    for name, submodule in _module.named_children():
+                        if isinstance(submodule, torch.nn.Dropout):
+                            setattr(_module, name, torch.nn.Dropout(0.))
+                        else:
+                            reinit_dropout(submodule)
+
+                reinit_dropout(module)
+
+        if level == 'strict':
+            hack_forward(module, file_name, io_json)
+            intercept_module(module, io_json)
+        hack_backward(
+            module, optimizer, bw_json, lazy_stop_callback=lazy_stop_callback)
+        try:
+            yield
+        finally:  # un-patch even when the monitored body raises
+            hack_backward(module, optimizer, None, restore=True)
+            if level == 'strict':
+                hack_forward(module, None, None, restore=True)
+                intercept_module(module, None, restore=True)
+
+        optimizer_dict = optimizer.state_dict()
+        optimizer_dict.pop('state', None)
+        summary = {
+            'forward': io_json,
+            'backward': bw_json,
+            'optimizer': {
+                'type': optimizer.__class__.__name__,
+                'defaults': optimizer.defaults,
+                'state_dict': optimizer_dict
+            },
+            'lr_scheduler': {
+                'type': lr_scheduler.__class__.__name__,
+                'state_dict': lr_scheduler.state_dict()
+            },
+            'cfg': trainer.cfg.to_dict() if hasattr(trainer, 'cfg') else None,
+            'state': {
+                'torch_state': torch_state,
+                'np_state': np_state,
+                'random_seed': random_seed,
+                'seed': seed,
+            }
+        }
+
+        if baseline:
+            with open(absolute_path, 'wb') as f:
+                pickle.dump(summary, f)
+            self.store(absolute_path, f'{file_name}.bin')
+            os.remove(absolute_path)
+        else:
+            name = os.path.basename(absolute_path)
+            baseline = os.path.join(tempfile.gettempdir(), name)
+            self.load(baseline, name)
+            with open(baseline, 'rb') as f:
+                baseline_json = pickle.load(f)
+
+            if level == 'strict' and not compare_io_and_print(
+                    baseline_json['forward'], io_json, compare_fn):
+                raise RuntimeError('Forward not match!')
+            if not compare_backward_and_print(
+                    baseline_json['backward'],
+                    bw_json,
+                    compare_fn=compare_fn,
+                    ignore_keys=ignore_keys,
+                    level=level):
+                raise RuntimeError('Backward not match!')
+            cfg_opt1 = {
+                'optimizer': baseline_json['optimizer'],
+                'lr_scheduler': baseline_json['lr_scheduler'],
+                'cfg': baseline_json['cfg'],
+                'state': None if not compare_random else baseline_json['state']
+            }
+            cfg_opt2 = {
+                'optimizer': summary['optimizer'],
+                'lr_scheduler': summary['lr_scheduler'],
+                'cfg': summary['cfg'],
+                'state': None if not compare_random else summary['state']
+            }
+            if not compare_cfg_and_optimizers(cfg_opt1, cfg_opt2, compare_fn):
+                raise RuntimeError('Cfg or optimizers not match!')
+
+
+class MsRegressTool(RegressTool):
+
+    class EarlyStopError(Exception):
+        """Raised by the default stop hook to end the training loop early."""
+
+    @contextlib.contextmanager
+    def monitor_ms_train(self,
+                         trainer,
+                         file_name,
+                         level='config',
+                         compare_fn=None,
+                         ignore_keys=None,
+                         compare_random=True,
+                         lazy_stop_callback=None):
+        """Route `trainer.train_loop` through `monitor_module_train`."""
+        if lazy_stop_callback is None:
+            def lazy_stop_callback():
+                from modelscope.trainers.hooks.hook import Hook, Priority
+
+                class EarlyStopHook(Hook):
+                    PRIORITY = Priority.VERY_LOW
+
+                    def after_iter(self, trainer):
+                        raise MsRegressTool.EarlyStopError('Test finished.')
+
+                trainer.register_hook(EarlyStopHook())
+
+        def _train_loop(trainer, *args, **kwargs):
+            with self.monitor_module_train(
+                    trainer,
+                    file_name,
+                    level,
+                    compare_fn=compare_fn,
+                    ignore_keys=ignore_keys,
+                    compare_random=compare_random,
+                    lazy_stop_callback=lazy_stop_callback):
+                with contextlib.suppress(MsRegressTool.EarlyStopError):
+                    return trainer.train_loop_origin(*args, **kwargs)
+
+        trainer.train_loop_origin, trainer.train_loop = \
+            trainer.train_loop, type(trainer.train_loop)(_train_loop, trainer)
+        try:
+            yield
+        finally:  # un-patch, else a second monitoring run would recurse
+            trainer.train_loop = trainer.train_loop_origin
+            del trainer.train_loop_origin
+
+
+def compare_module(module1: nn.Module, module2: nn.Module):
+    """Return True if all zipped parameter pairs are element-wise equal."""
+    # zip() stops at the shorter parameter list, like the plain loop would
+    pairs = zip(module1.parameters(), module2.parameters())
+    return not any(a.data.ne(b.data).sum() > 0 for a, b in pairs)
+
+
+def numpify_tensor_nested(tensors, reduction=None, clip_value=10000):
+    """Numpify `tensors` (even if it's a nested list/tuple/dict of tensors);
+    `reduction` ('sum'/'mean') collapses each clipped tensor to a float64."""
+    if isinstance(tensors, (list, tuple)):
+        return type(tensors)(
+            numpify_tensor_nested(t, reduction, clip_value) for t in tensors)
+    if isinstance(tensors, Mapping):
+        return {
+            k: numpify_tensor_nested(t, reduction, clip_value)
+            for k, t in tensors.items()
+        }
+    if isinstance(tensors, torch.Tensor):
+        t: np.ndarray = tensors.cpu().numpy()
+        if clip_value is not None:
+            t = np.where(t > clip_value, clip_value, t)
+            t = np.where(t < -clip_value, -clip_value, t)
+        if reduction == 'sum':  # np.float was removed in NumPy 1.24
+            return t.sum(dtype=float)
+        elif reduction == 'mean':
+            return t.mean(dtype=float)
+        return t
+    return tensors
+
+
+def detach_tensor_nested(tensors):
+    """Detach `tensors` (even if it's a nested list/tuple/dict of tensors);
+    anything that is not a tensor or container is returned unchanged."""
+    if isinstance(tensors, (list, tuple)):
+        return type(tensors)(detach_tensor_nested(t) for t in tensors)
+    if isinstance(tensors, Mapping):
+        return {k: detach_tensor_nested(t) for k, t in tensors.items()}
+    if isinstance(tensors, torch.Tensor):
+        return tensors.detach()
+    return tensors
+
+
+def hack_forward(module: nn.Module,
+                 name,
+                 io_json,
+                 restore=False,
+                 keep_tensors=False):
+    """Patch (or restore) `module.forward` so that every call is recorded.
+
+    The wrapper runs the original forward and then writes a snapshot of the
+    inputs and of the output to `io_json[name]`.
+
+    Args:
+        module: The module whose `forward` is wrapped.
+        name: The key used for this module inside `io_json`.
+        io_json: The dict that receives the recorded entry.
+        restore: If True, put the original `forward` back instead of
+            wrapping it.
+        keep_tensors: If True, record full numpy arrays; otherwise only
+            the 'sum' and 'mean' reduction of every tensor.
+    """
+
+    def _snapshot(value):
+        """Numpify `value`; reduce to sum/mean unless tensors are kept."""
+        detached = detach_tensor_nested(value)
+        if keep_tensors:
+            return numpify_tensor_nested(detached)
+        return {
+            'sum': numpify_tensor_nested(detached, reduction='sum'),
+            'mean': numpify_tensor_nested(detached, reduction='mean'),
+        }
+
+    def _forward(self, *args, **kwargs):
+        # run the real forward first, the recording only observes it
+        ret = self.forward_origin(*args, **kwargs)
+        recorded_input = {
+            'args': _snapshot(args),
+            'kwargs': _snapshot(kwargs),
+        }
+        io_json[name] = {
+            'input': recorded_input,
+            'output': _snapshot(ret),
+        }
+        return ret
+
+    # the presence of `forward_origin` marks a module as already patched,
+    # so patching twice or restoring an unpatched module is a no-op
+    if restore:
+        if hasattr(module, 'forward_origin'):
+            module.forward = module.forward_origin
+            del module.forward_origin
+    elif not hasattr(module, 'forward_origin'):
+        # bind the wrapper as a method so that it receives `self`
+        module.forward_origin, module.forward = module.forward, type(
+            module.forward)(_forward, module)
+
+
+def hack_backward(module: nn.Module,
+                  optimizer,
+                  io_json,
+                  restore=False,
+                  lazy_stop_callback=None):
+    """Patch (or restore) `optimizer.step` to record parameters and grads.
+
+    For every named parameter of `module` the wrapper stores the 'sum' and
+    'mean' of its data and gradient before the real step, and of its data
+    after the step ('data_after'), under `io_json[<parameter name>]`.
+    `lazy_stop_callback`, if given, is called after each recorded step;
+    `restore=True` puts the original `step` back instead of wrapping it.
+    """
+
+    def _reduce(tensor):
+        """Return the sum/mean summary (a None grad yields None values)."""
+        detached = detach_tensor_nested(tensor)
+        return {
+            'sum': numpify_tensor_nested(detached, reduction='sum'),
+            'mean': numpify_tensor_nested(detached, reduction='mean'),
+        }
+
+    def _step(self, *args, **kwargs):
+        # snapshot parameters and gradients before the update ...
+        for name, param in module.named_parameters():
+            io_json[name] = {
+                'data': _reduce(param.data),
+                'grad': _reduce(param.grad),
+            }
+        ret = self.step_origin(*args, **kwargs)
+        # ... and the parameters once more after it
+        for name, param in module.named_parameters():
+            io_json[name]['data_after'] = _reduce(param.data)
+        if lazy_stop_callback is not None:
+            lazy_stop_callback()
+        return ret
+
+    if restore:
+        if hasattr(optimizer, 'step_origin'):
+            optimizer.step = optimizer.step_origin
+            del optimizer.step_origin
+    elif not hasattr(optimizer, 'step_origin'):
+        # NOTE(review): the method type is taken from `state_dict`, presumably
+        # because `step` may already be wrapped by a plain function - confirm
+        optimizer.step_origin, optimizer.step = optimizer.step, type(
+            optimizer.state_dict)(_step, optimizer)
+
+
+def intercept_module(module: nn.Module, io_json, parent_name=None,
+                     restore=False):
+    """Apply `hack_forward` to every descendant of `module`, recording each
+    one under its dotted path (prefixed by `parent_name` when given)."""
+    for name, child in module.named_children():
+        path = parent_name + '.' + name if parent_name is not None else name
+        hack_forward(child, path, io_json, restore)
+        intercept_module(child, io_json, path, restore)
+
+
+def compare_arguments_nested(print_content,
+                             arg1,
+                             arg2,
+                             rtol=1.e-3,
+                             atol=1.e-8):
+    """Recursively compare two (possibly nested) values.
+
+    Both values must have the same type name. Floats and arrays are compared
+    with `np.isclose` (NaN equals NaN); sequences and mappings recurse.
+
+    Args:
+        print_content: Message prefix printed on a mismatch; None silences
+            the report (used for nested elements).
+        arg1: The first value, e.g. the baseline.
+        arg2: The value compared against `arg1`.
+        rtol: Relative tolerance, also applied to nested elements.
+        atol: Absolute tolerance, also applied to nested elements.
+
+    Returns:
+        True if both values match, otherwise False.
+
+    Raises:
+        ValueError: If `arg1` has an unsupported type.
+    """
+
+    def _mismatch(detail=''):
+        if print_content is not None:
+            print(f'{print_content}{detail}')
+        return False
+
+    type1 = type(arg1)
+    type2 = type(arg2)
+    if type1.__name__ != type2.__name__:
+        return _mismatch(
+            f', type not equal:{type1.__name__} and {type2.__name__}')
+
+    if arg1 is None:
+        return True
+    elif isinstance(arg1, (int, str, bool, np.bool_, np.integer)):
+        if arg1 != arg2:
+            return _mismatch(f', arg1:{arg1}, arg2:{arg2}')
+        return True
+    elif isinstance(arg1, (float, np.floating)):
+        if not np.isclose(arg1, arg2, rtol=rtol, atol=atol, equal_nan=True):
+            return _mismatch(f', arg1:{arg1}, arg2:{arg2}')
+        return True
+    elif isinstance(arg1, (tuple, list)):
+        if len(arg1) != len(arg2):
+            return _mismatch(
+                f', length is not equal:{len(arg1)}, {len(arg2)}')
+        elements_match = [
+            compare_arguments_nested(None, sub_arg1, sub_arg2, rtol, atol)
+            for sub_arg1, sub_arg2 in zip(arg1, arg2)
+        ]
+        return True if all(elements_match) else _mismatch()
+    elif isinstance(arg1, Mapping):
+        keys1 = arg1.keys()
+        keys2 = arg2.keys()
+        if len(keys1) != len(keys2):
+            return _mismatch(
+                f', key length is not equal:{len(keys1)}, {len(keys2)}')
+        if len(set(keys1) - set(keys2)) > 0:
+            return _mismatch(f', key diff:{set(keys1) - set(keys2)}')
+        values_match = [
+            compare_arguments_nested(None, arg1[key], arg2[key], rtol, atol)
+            for key in keys1
+        ]
+        return True if all(values_match) else _mismatch()
+    elif isinstance(arg1, np.ndarray):
+        # np.NaN and np.float no longer exist in current NumPy releases
+        arg1 = np.where(np.equal(arg1, None), np.nan,
+                        arg1).astype(dtype=float)
+        arg2 = np.where(np.equal(arg2, None), np.nan,
+                        arg2).astype(dtype=float)
+        close = np.isclose(arg1, arg2, rtol=rtol, atol=atol, equal_nan=True)
+        return True if close.all() else _mismatch()
+    else:
+        raise ValueError(f'type not supported: {type1}')
+
+
+def compare_io_and_print(baseline_json, io_json, compare_fn=None):
+    """Compare recorded forward inputs/outputs per module and print diffs.
+
+    Keys found in only one dict are printed but not compared. A non-None
+    result of `compare_fn(v1, v2, key, type)` replaces the default check.
+    """
+    if compare_fn is None:
+
+        def compare_fn(*args, **kwargs):
+            return None
+
+    keys1 = set(baseline_json.keys())
+    keys2 = set(io_json.keys())
+    print(f'unmatched keys: {keys1 - keys2}, {keys2 - keys1}')
+    match = True
+    for key in keys1.intersection(keys2):
+        recorded1, recorded2 = baseline_json[key], io_json[key]
+        v1input = numpify_tensor_nested(recorded1['input'])
+        v2input = numpify_tensor_nested(recorded2['input'])
+        res = compare_fn(v1input, v2input, key, 'input')
+        if res is None:
+            match = compare_arguments_nested(
+                f'unmatched module {key} input args', v1input['args'],
+                v2input['args']) and match
+            match = compare_arguments_nested(
+                f'unmatched module {key} input kwargs', v1input['kwargs'],
+                v2input['kwargs']) and match
+        else:
+            print(
+                f'input of {key} compared with user compare_fn with result:{res}\n'
+            )
+            match = match and res
+        v1output = numpify_tensor_nested(recorded1['output'])
+        v2output = numpify_tensor_nested(recorded2['output'])
+        res = compare_fn(v1output, v2output, key, 'output')
+        if res is None:
+            match = compare_arguments_nested(f'unmatched module {key} outputs',
+                                             v1output, v2output) and match
+        else:
+            print(
+                f'output of {key} compared with user compare_fn with result:{res}\n'
+            )
+            match = match and res
+    return match
+
+
+def compare_backward_and_print(baseline_json,
+                               bw_json,
+                               level,
+                               ignore_keys=None,
+                               compare_fn=None):
+    """Compare the backward data recorded in two dumps, printing differences.
+
+    `compare_fn(v1, v2, key, 'backward')` is consulted first for every shared
+    key; a None result falls back to comparing the 'data' entries and, when
+    `level` is 'strict', the 'grad' and 'data_after' entries as well.
+
+    @param ignore_keys: Optional container of shared keys to skip.
+    @param compare_fn: Optional user comparison hook, see above.
+    @return: Falsy as soon as one comparison failed, truthy otherwise.
+    """
+    user_fn = compare_fn if compare_fn is not None else (lambda *_, **__: None)
+    baseline_keys = set(baseline_json.keys())
+    current_keys = set(bw_json.keys())
+    print(f'unmatched backward keys: {baseline_keys - current_keys}, '
+          f'{current_keys - baseline_keys}')
+    fields = ('data', 'grad', 'data_after')
+    match = True
+    for key in baseline_keys.intersection(current_keys):
+        if ignore_keys is not None and key in ignore_keys:
+            continue
+        res = user_fn(baseline_json[key], bw_json[key], key, 'backward')
+        if res is not None:
+            print(f'backward data of {key} compared with '
+                  f'user compare_fn with result:{res}\n')
+            match = match and res
+            continue
+        # All three entries are read up front, whatever the level is.
+        data1, grad1, after1 = [baseline_json[key][name] for name in fields]
+        data2, grad2, after2 = [bw_json[key][name] for name in fields]
+        checks = [('tensor data', data1, data2)]
+        if level == 'strict':
+            checks.append(('grad data', grad1, grad2))
+            checks.append(('data after step', after1, after2))
+        for title, expected, actual in checks:
+            match = compare_arguments_nested(
+                f'unmatched module {key} {title}', expected, actual) and match
+    return match
+
+
+def compare_cfg_and_optimizers(baseline_json, cfg_json, compare_fn=None):
+    """Compare optimizer, lr_scheduler, cfg and random state of two dumps.
+
+    For every section `compare_fn(v1, v2, None, section)` is consulted first,
+    with section being 'optimizer', 'lr_scheduler', 'cfg' or 'state'; a None
+    result falls back to `compare_arguments_nested`. Differences are printed.
+
+    @param baseline_json: Baseline dump with the keys 'optimizer',
+        'lr_scheduler', 'cfg' and 'state'.
+    @param cfg_json: Dump of the run under test, with the same four keys.
+    @param compare_fn: Optional user comparison hook, see above.
+    @return: Falsy as soon as one comparison failed, truthy otherwise.
+    @raise KeyError: If one of the four keys is missing from either dump.
+    """
+    user_fn = compare_fn if compare_fn is not None else (lambda *_, **__: None)
+    keys = ('optimizer', 'lr_scheduler', 'cfg', 'state')
+    optimizer1, lr_scheduler1, cfg1, state1 = [baseline_json[k] for k in keys]
+    # The current run's own state is used; reading it from the baseline again
+    # would compare the baseline random state with itself.
+    optimizer2, lr_scheduler2, cfg2, state2 = [cfg_json[k] for k in keys]
+
+    match = True
+    res = user_fn(optimizer1, optimizer2, None, 'optimizer')
+    if res is not None:
+        print(f'optimizer compared with user compare_fn with result:{res}\n')
+        match = match and res
+    else:
+        if optimizer1['type'] != optimizer2['type']:
+            # NOTE(review): only reported; `match` is left untouched here.
+            print(f"Optimizer type not equal:{optimizer1['type']} "
+                  f"and {optimizer2['type']}")
+        for part in ('defaults', 'state_dict'):
+            match = compare_arguments_nested(
+                f'unmatched optimizer {part}', optimizer1[part],
+                optimizer2[part]) and match
+
+    res = user_fn(lr_scheduler1, lr_scheduler2, None, 'lr_scheduler')
+    if res is not None:
+        print(
+            f'lr_scheduler compared with user compare_fn with result:{res}\n')
+        match = match and res
+    else:
+        if lr_scheduler1['type'] != lr_scheduler2['type']:
+            print(f"lr_scheduler type not equal:{lr_scheduler1['type']} "
+                  f"and {lr_scheduler2['type']}")
+        match = compare_arguments_nested('unmatched lr_scheduler state_dict',
+                                         lr_scheduler1['state_dict'],
+                                         lr_scheduler2['state_dict']) and match
+
+    for kind, title, v1, v2 in (('cfg', 'cfg', cfg1, cfg2),
+                                ('state', 'random state', state1, state2)):
+        res = user_fn(v1, v2, None, kind)
+        if res is not None:
+            print(f'{title} compared with user compare_fn with result:{res}\n')
+            match = match and res
+        else:
+            match = compare_arguments_nested(f'unmatched {title}', v1,
+                                             v2) and match
+
+    return match
diff --git a/modelscope/utils/tensor_utils.py b/modelscope/utils/tensor_utils.py
index 7889d944..b68a639c 100644
--- a/modelscope/utils/tensor_utils.py
+++ b/modelscope/utils/tensor_utils.py
@@ -1,15 +1,24 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Part of the implementation is borrowed from huggingface/transformers.
-from collections.abc import Mapping
-
-import numpy as np
+from collections.abc import Mapping
 
 
 def torch_nested_numpify(tensors):
+    """ Numpify nested torch tensors.
+
+    NOTE: If the type of input tensors is dict-like(Mapping, dict, OrderedDict, etc.), the return type will be dict.
+
+    @param tensors: Nested torch tensors.
+    @return: The numpify tensors.
+    """
+
     import torch
     "Numpify `tensors` (even if it's a nested list/tuple of tensors)."
     if isinstance(tensors, (list, tuple)):
         return type(tensors)(torch_nested_numpify(t) for t in tensors)
+    if isinstance(tensors, Mapping):
+        # return dict
+        return {k: torch_nested_numpify(t) for k, t in tensors.items()}
     if isinstance(tensors, torch.Tensor):
         t = tensors.cpu()
         return t.numpy()
@@ -17,10 +26,20 @@ def torch_nested_numpify(tensors):
 
 
 def torch_nested_detach(tensors):
+    """ Detach nested torch tensors.
+
+    NOTE: list/tuple inputs keep their type, but dict-like inputs (Mapping, dict, OrderedDict, etc.) return a dict.
+
+    @param tensors: A torch tensor, or nested lists/tuples/mappings of tensors.
+    @return: The same structure with every tensor detached; non-tensor leaves are returned as-is.
+    """
+
     import torch
     "Detach `tensors` (even if it's a nested list/tuple of tensors)."
     if isinstance(tensors, (list, tuple)):
         return type(tensors)(torch_nested_detach(t) for t in tensors)
+    if isinstance(tensors, Mapping):  # dict-like input: rebuilt as a plain dict
+        return {k: torch_nested_detach(t) for k, t in tensors.items()}
     if isinstance(tensors, torch.Tensor):
         return tensors.detach()
     return tensors
diff --git a/modelscope/utils/test_utils.py b/modelscope/utils/test_utils.py
index 7adba982..5109db11 100644
--- a/modelscope/utils/test_utils.py
+++ b/modelscope/utils/test_utils.py
@@ -11,12 +11,13 @@ import sys
 import tarfile
 import tempfile
 import unittest
+from collections import OrderedDict
 
 import requests
-from datasets import Dataset
+import torch
 from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE
+from torch.utils.data import Dataset
 
-from modelscope.msdatasets import MsDataset
 from .torch_utils import _find_free_port
 
 TEST_LEVEL = 2
@@ -48,9 +49,25 @@ def set_test_level(level: int):
     TEST_LEVEL = level
 
 
+class DummyTorchDataset(Dataset):
+    """Dataset that builds every sample from the same `feat`/`label`."""
+
+    def __init__(self, feat, label, num) -> None:
+        self.feat, self.label, self.num = feat, label, num
+
+    def __getitem__(self, index):
+        # Raise on out-of-range integers so plain iteration terminates.
+        if isinstance(index, int) and not -self.num <= index < self.num:
+            raise IndexError(f'index {index} out of range [0, {self.num})')
+        sample = {'feat': self.feat, 'labels': self.label}
+        return {name: torch.Tensor(value) for name, value in sample.items()}
+
+    def __len__(self):
+        return self.num
+
+
 def create_dummy_test_dataset(feat, label, num):
-    return MsDataset.from_hf_dataset(
-        Dataset.from_dict(dict(feat=[feat] * num, labels=[label] * num)))
+    return DummyTorchDataset(feat, label, num)
 
 
 def download_and_untar(fpath, furl, dst) -> str:
@@ -71,6 +88,37 @@ def download_and_untar(fpath, furl, dst) -> str:
     return target_dir_path
 
 
+def get_case_model_info():
+    """Map each 'damo/...' model id quoted in tests/ to the test modules using it.
+
+    @return: OrderedDict of model id -> set of dotted test module names.
+    """
+    # getstatusoutput merges stderr into the output, so malformed lines (grep
+    # errors, empty output) must be skipped instead of being parsed.
+    _, result = subprocess.getstatusoutput(
+        'grep -rn "damo/" tests/  | grep -v ".pyc" | grep -v "Binary file" | grep -v run.py '
+    )
+    model_cases = OrderedDict()
+    for line in result.split('\n'):
+        # "tests/msdatasets/test_ms_dataset.py:92:        model_id = 'damo/bert-base-sst2'"
+        parts = line.strip().split(':', 2)
+        if len(parts) != 3:
+            continue
+        test_file, _, content = parts
+        model_pos = content.find('damo/')
+        quote = content[model_pos - 1] if model_pos > 0 else ''
+        if quote not in ('"', "'"):
+            continue
+        # The id ends at the first matching quote after it, not the last one on the line.
+        rquote_idx = content.find(quote, model_pos)
+        if rquote_idx < 0:
+            continue
+        case_name = test_file.replace('tests/', '').replace('.py', '').replace('/', '.')
+        model_cases.setdefault(content[model_pos:rquote_idx], set()).add(case_name)
+
+    return model_cases
+
+
 _DIST_SCRIPT_TEMPLATE = """
 import ast
 import argparse
diff --git a/modelscope/utils/torch_utils.py b/modelscope/utils/torch_utils.py
index 45e33c3e..74d9bb7b 100644
--- a/modelscope/utils/torch_utils.py
+++ b/modelscope/utils/torch_utils.py
@@ -3,16 +3,16 @@
 import functools
 import os
 import pickle
+import random
 import socket
 import subprocess
 import tempfile
 from typing import Callable, List, Optional, Tuple
 
+import numpy as np
 import torch
 import torch.multiprocessing as mp
 from torch import distributed as dist
-from torch._utils import (_flatten_dense_tensors, _take_tensors,
-                          _unflatten_dense_tensors)
 
 
 def _find_free_port() -> str:
@@ -49,7 +49,6 @@ def init_dist(launcher: str, backend: str = 'nccl', **kwargs) -> None:
 def _init_dist_pytorch(backend: str, **kwargs) -> None:
     # rank = int(os.environ['RANK'])
     local_rank = int(os.environ['LOCAL_RANK'])
-
     torch.cuda.set_device(local_rank)
     dist.init_process_group(backend=backend, **kwargs)
 
@@ -116,6 +115,10 @@ def get_dist_info() -> Tuple[int, int]:
     return rank, world_size
 
 
+def get_local_rank():
+    return int(os.getenv('LOCAL_RANK', 0))  # 0 when LOCAL_RANK is not set
+
+
 def is_master():
     rank, _ = get_dist_info()
     return rank == 0
@@ -180,3 +183,20 @@ def broadcast(inputs, src):
     dist.broadcast(inputs_tensor, src)
 
     return pickle.loads(inputs_tensor.cpu().numpy().tobytes())
+
+
+def set_random_seed(seed):
+    """Seed the python, numpy and torch RNGs; `seed` must be >= 0."""
+    if seed is None or not seed >= 0:
+        raise ValueError(
+            f'Random seed should be non-negative, current seed is {seed}')
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
+
+def set_random_seed_mpu(seed):
+    from megatron import mpu  # imported here, so megatron is only needed when this is called
+    set_random_seed(seed)
+    mpu.model_parallel_cuda_manual_seed(seed)  # presumably seeds megatron's model-parallel CUDA RNG — confirm
diff --git a/modelscope/utils/type_assert.py b/modelscope/utils/type_assert.py
index aaeadcb9..f732a81a 100644
--- a/modelscope/utils/type_assert.py
+++ b/modelscope/utils/type_assert.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from functools import wraps
 from inspect import signature
 
diff --git a/modelscope/version.py b/modelscope/version.py
index 40ed83d9..1e4826d6 100644
--- a/modelscope/version.py
+++ b/modelscope/version.py
@@ -1 +1 @@
-__version__ = '0.3.5'
+__version__ = '0.4.7'
diff --git a/requirements.txt b/requirements.txt
index c6e294ba..0832e6ab 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
--r requirements/runtime.txt
+-r requirements/framework.txt
diff --git a/requirements/audio.txt b/requirements/audio.txt
index 5e4bc104..d22ad8f1 100644
--- a/requirements/audio.txt
+++ b/requirements/audio.txt
@@ -1,6 +1,5 @@
 easyasr>=0.0.2
 espnet>=202204
-#tts
 h5py
 inflect
 keras
@@ -15,11 +14,7 @@ nltk
 numpy<=1.18
 # protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged.
 protobuf>3,<3.21.0
-ptflops
 py_sound_connect
-pytorch_wavelets
-PyWavelets>=1.0.0
-scikit-learn
 SoundFile>0.10
 sox
 torchaudio
diff --git a/requirements/cv.txt b/requirements/cv.txt
index 8dcf6791..5a2d7763 100644
--- a/requirements/cv.txt
+++ b/requirements/cv.txt
@@ -14,13 +14,14 @@ mmcls>=0.21.0
 mmdet>=2.25.0
 networkx>=2.5
 onnxruntime>=1.10
-pai-easycv>=0.5
+pai-easycv>=0.6.3.6
 pandas
 psutil
 regex
 scikit-image>=0.19.3
 scikit-learn>=0.20.1
 shapely
+shotdetect_scenedetect_lgss
 tensorflow-estimator>=1.15.1
 tf_slim
 timm>=0.4.9
diff --git a/requirements/runtime.txt b/requirements/framework.txt
similarity index 96%
rename from requirements/runtime.txt
rename to requirements/framework.txt
index c059b4ba..b51faeda 100644
--- a/requirements/runtime.txt
+++ b/requirements/framework.txt
@@ -4,6 +4,7 @@ easydict
 einops
 filelock>=3.3.0
 gast>=0.2.2
+jsonplus
 numpy
 opencv-python
 oss2
diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt
index ef5d4341..02e87baa 100644
--- a/requirements/multi-modal.txt
+++ b/requirements/multi-modal.txt
@@ -1,4 +1,3 @@
-fairseq
 ftfy>=6.0.3
 ofa>=0.0.2
 pycocoevalcap>=1.2
diff --git a/requirements/nlp.txt b/requirements/nlp.txt
index 6bd56aff..15f2f41a 100644
--- a/requirements/nlp.txt
+++ b/requirements/nlp.txt
@@ -1,11 +1,14 @@
 en_core_web_sm>=2.3.5
-fairseq>=0.10.2
+jieba>=0.42.1
+megatron_util
 pai-easynlp
 # rough-score was just recently updated from 0.0.4 to 0.0.7
 # which introduced compatability issues that are being investigated
 rouge_score<=0.0.4
+sacremoses>=0.0.41
 seqeval
 spacy>=2.3.5
+subword_nmt>=0.3.8
 text2sql_lgesql
 tokenizers
 transformers>=4.12.0
diff --git a/requirements/tensorflow1x.txt b/requirements/tensorflow1x.txt
new file mode 100644
index 00000000..b139efe1
--- /dev/null
+++ b/requirements/tensorflow1x.txt
@@ -0,0 +1 @@
+numpy==1.18.5
diff --git a/setup.cfg b/setup.cfg
index c98dbe05..3dc64f86 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -19,7 +19,7 @@ quiet-level = 3
 ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids
 
 [flake8]
-select = B,C,E,F,P,T4,W,B9
 max-line-length = 120
-ignore = F401,F405,F821,W503
+select = B,C,E,F,P,T4,W,B9
+ignore = F401,F405,F821,W503,E251
 exclude = docs/src,*.pyi,.git
diff --git a/tests/export/__init__.py b/tests/export/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/export/test_export_sbert_sequence_classification.py b/tests/export/test_export_sbert_sequence_classification.py
new file mode 100644
index 00000000..535b3f5d
--- /dev/null
+++ b/tests/export/test_export_sbert_sequence_classification.py
@@ -0,0 +1,37 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+
+from modelscope.exporters import Exporter, TorchModelExporter
+from modelscope.models.base import Model
+from modelscope.utils.test_utils import test_level
+
+
+class TestExportSbertSequenceClassification(unittest.TestCase):
+    """Smoke test: export a sentence-similarity model to ONNX and TorchScript."""
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        # mkdtemp creates the directory and keeps it until tearDown removes it.
+        self.tmp_dir = tempfile.mkdtemp()
+        self.model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir)
+        super().tearDown()
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_export_sbert_sequence_classification(self):
+        model = Model.from_pretrained(self.model_id)
+        print(
+            Exporter.from_model(model).export_onnx(
+                shape=(2, 256), outputs=self.tmp_dir))
+        print(
+            TorchModelExporter.from_model(model).export_torch_script(
+                shape=(2, 256), outputs=self.tmp_dir))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/hub/test_hub_examples.py b/tests/hub/test_hub_examples.py
index 3fb6823f..d1f7594e 100644
--- a/tests/hub/test_hub_examples.py
+++ b/tests/hub/test_hub_examples.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 from modelscope.hub.api import HubApi
diff --git a/tests/hub/test_hub_private_repository.py b/tests/hub/test_hub_private_repository.py
index 8683a884..dab2b891 100644
--- a/tests/hub/test_hub_private_repository.py
+++ b/tests/hub/test_hub_private_repository.py
@@ -10,7 +10,8 @@ from modelscope.hub.errors import GitError
 from modelscope.hub.repository import Repository
 from modelscope.utils.constant import ModelFile
 from .test_utils import (TEST_ACCESS_TOKEN1, TEST_ACCESS_TOKEN2,
-                         TEST_MODEL_CHINESE_NAME, TEST_MODEL_ORG)
+                         TEST_MODEL_CHINESE_NAME, TEST_MODEL_ORG,
+                         delete_credential)
 
 DEFAULT_GIT_PATH = 'git'
 
@@ -65,6 +66,18 @@ class HubPrivateRepositoryTest(unittest.TestCase):
         print(repo2.model_dir)
         assert repo1.model_dir == repo2.model_dir
 
+    def test_clone_private_model_without_token(self):
+        """Cloning a private model with no stored credential raises GitError."""
+        delete_credential()
+        local_dir = os.path.join(tempfile.mkdtemp(), self.model_name)
+        try:
+            with self.assertRaises(GitError) as cm:
+                Repository(local_dir, clone_from=self.model_id)
+            print(cm.exception)
+            assert not os.path.exists(os.path.join(local_dir, ModelFile.README))
+        finally:
+            self.api.login(TEST_ACCESS_TOKEN1)  # re-login for delete, even on failure
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/hub/test_utils.py b/tests/hub/test_utils.py
index 38a74fd4..3d312dc0 100644
--- a/tests/hub/test_utils.py
+++ b/tests/hub/test_utils.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os
 import shutil
 from codecs import ignore_errors
diff --git a/tests/msdatasets/test_dataset_upload.py b/tests/msdatasets/test_dataset_upload.py
new file mode 100644
index 00000000..1179414d
--- /dev/null
+++ b/tests/msdatasets/test_dataset_upload.py
@@ -0,0 +1,94 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+import zipfile
+
+from modelscope.msdatasets import MsDataset
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.test_utils import test_level
+
+KEY_EXTRACTED = 'extracted'
+
+
+class DatasetUploadTest(unittest.TestCase):
+    """Dataset upload and meta-repo tests; need TEST_UPLOAD_MS_TOKEN."""
+
+    def setUp(self):
+        self.old_dir = os.getcwd()
+        self.dataset_name = 'small_coco_for_test'
+        self.dataset_file_name = self.dataset_name
+        self.prepared_dataset_name = 'pets_small'
+        self.token = os.getenv('TEST_UPLOAD_MS_TOKEN')
+        error_msg = 'The modelscope token can not be empty, please set env variable: TEST_UPLOAD_MS_TOKEN'
+        self.assertIsNotNone(self.token, msg=error_msg)
+        from modelscope.hub.api import HubApi, ModelScopeConfig
+        self.api = HubApi()
+        self.api.login(self.token)
+
+        # get user info; ids are joined with '/' on every OS, unlike os.path.join
+        self.namespace, _ = ModelScopeConfig.get_user_info()
+        self.dataset_id = f'{self.namespace}/{self.dataset_name}'
+
+        self.temp_dir = tempfile.mkdtemp()
+        self.test_work_dir = os.path.join(self.temp_dir, self.dataset_name)
+        self.test_meta_dir = os.path.join(self.test_work_dir, 'meta')
+        os.makedirs(self.test_work_dir, exist_ok=True)
+
+    def tearDown(self):
+        os.chdir(self.old_dir)
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+        print('The test dir successfully removed!')
+
+    @staticmethod
+    def get_raw_downloaded_file_path(extracted_path):
+        raw_downloaded_file_path = ''
+        raw_data_dir = os.path.abspath(
+            os.path.join(extracted_path, '../../..'))
+        for root, dirs, files in os.walk(raw_data_dir):
+            if KEY_EXTRACTED in dirs:
+                for file in files:
+                    curr_file_path = os.path.join(root, file)
+                    if zipfile.is_zipfile(curr_file_path):
+                        raw_downloaded_file_path = curr_file_path
+        return raw_downloaded_file_path
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_ds_upload(self):
+        # Get the prepared data from hub, using default modelscope namespace
+        ms_ds_train = MsDataset.load(self.prepared_dataset_name, split='train')
+        config_res = ms_ds_train._hf_ds.config_kwargs
+        extracted_path = config_res.get('split_config').get('train')
+        raw_zipfile_path = self.get_raw_downloaded_file_path(extracted_path)
+
+        MsDataset.upload(
+            object_name=self.dataset_file_name + '.zip',
+            local_file_path=raw_zipfile_path,
+            dataset_name=self.dataset_name,
+            namespace=self.namespace)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_ds_clone_meta(self):
+        MsDataset.clone_meta(
+            dataset_work_dir=self.test_meta_dir,
+            dataset_id=self.dataset_id)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_ds_upload_meta(self):
+        # Clone dataset meta repo first.
+        MsDataset.clone_meta(
+            dataset_work_dir=self.test_meta_dir,
+            dataset_id=self.dataset_id)
+
+        with open(os.path.join(self.test_meta_dir, ModelFile.README),
+                  'a') as f:
+            f.write('\nThis is a line for unit test.')
+
+        MsDataset.upload_meta(
+            dataset_work_dir=self.test_meta_dir,
+            commit_message='Update for unit test.')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py
index f9118353..91a3b5c5 100644
--- a/tests/msdatasets/test_ms_dataset.py
+++ b/tests/msdatasets/test_ms_dataset.py
@@ -1,10 +1,12 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 from modelscope.models import Model
 from modelscope.msdatasets import MsDataset
 from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.preprocessors.base import Preprocessor
-from modelscope.utils.constant import DownloadMode
+from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DownloadMode
 from modelscope.utils.test_utils import require_tf, require_torch, test_level
 
 
@@ -31,15 +33,21 @@ class ImgPreprocessor(Preprocessor):
 
 class MsDatasetTest(unittest.TestCase):
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_movie_scene_seg_toydata(self):  # first split_config value must be truthy
+        ms_ds_train = MsDataset.load('movie_scene_seg_toydata', split='train')
+        print(ms_ds_train._hf_ds.config_kwargs)  # printed only, not asserted
+        assert next(iter(ms_ds_train.config_kwargs['split_config'].values()))
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_coco(self):
         ms_ds_train = MsDataset.load(
             'pets_small',
-            namespace='modelscope',
-            split='train',
+            namespace=DEFAULT_DATASET_NAMESPACE,
             download_mode=DownloadMode.FORCE_REDOWNLOAD,
-            classes=('1', '2'))
-        print(ms_ds_train._hf_ds.config_kwargs)
+            split='train')
+        print(ms_ds_train.config_kwargs)
+        assert next(iter(ms_ds_train.config_kwargs['split_config'].values()))
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_ms_csv_basic(self):
@@ -67,7 +75,8 @@ class MsDatasetTest(unittest.TestCase):
         preprocessor = SequenceClassificationPreprocessor(
             nlp_model.model_dir,
             first_sequence='premise',
-            second_sequence=None)
+            second_sequence=None,
+            padding='max_length')
         ms_ds_train = MsDataset.load(
             'xcopa',
             subset_name='translation-et',
diff --git a/tests/pipelines/easycv_pipelines/__init__.py b/tests/pipelines/easycv_pipelines/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py
new file mode 100644
index 00000000..5f6dac4b
--- /dev/null
+++ b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py
@@ -0,0 +1,88 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+from distutils.version import LooseVersion
+
+import cv2
+import easycv
+import numpy as np
+from PIL import Image
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import semantic_seg_masks_to_image
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class EasyCVSegmentationPipelineTest(unittest.TestCase,
+                                     DemoCompatibilityCheck):
+    """Run the segformer b0-b5 segmentation models on a sample image."""
+
+    img_path = 'data/test/images/image_segmentation.jpg'
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_segmentation
+        self.model_id = 'damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k'
+
+    def _internal_test_(self, model_id):
+        """Segment the sample image and write the drawing to result.jpg."""
+        seg_pipeline = pipeline(task=Tasks.image_segmentation, model=model_id)
+        masks = seg_pipeline(self.img_path)[OutputKeys.MASKS]
+        cv2.imwrite('result.jpg', semantic_seg_masks_to_image(masks))
+        print(f'test {model_id} DONE')
+
+    def _internal_test_batch_(self, model_id, num_samples=2, batch_size=2):
+        """Batch variant of the check; no test in this class calls it yet."""
+        # TODO: support in the future
+        expected_shape = list(np.asarray(Image.open(self.img_path)).shape)[:2]
+        seg_pipeline = pipeline(
+            task=Tasks.image_segmentation,
+            model=model_id,
+            batch_size=batch_size)
+        results = seg_pipeline([self.img_path] * num_samples)
+
+        self.assertEqual(seg_pipeline.predict_op.batch_size, batch_size)
+        self.assertEqual(len(results), num_samples)
+
+        for result in results:
+            self.assertListEqual(expected_shape,
+                                 list(result['seg_pred'].shape))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_segformer_b0(self):
+        self._internal_test_(
+            'damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_segformer_b1(self):
+        self._internal_test_(
+            'damo/cv_segformer-b1_image_semantic-segmentation_coco-stuff164k')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_segformer_b2(self):
+        self._internal_test_(
+            'damo/cv_segformer-b2_image_semantic-segmentation_coco-stuff164k')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_segformer_b3(self):
+        self._internal_test_(
+            'damo/cv_segformer-b3_image_semantic-segmentation_coco-stuff164k')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_segformer_b4(self):
+        self._internal_test_(
+            'damo/cv_segformer-b4_image_semantic-segmentation_coco-stuff164k')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_segformer_b5(self):
+        self._internal_test_(
+            'damo/cv_segformer-b5_image_semantic-segmentation_coco-stuff164k')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_action_detection.py b/tests/pipelines/test_action_detection.py
new file mode 100644
index 00000000..ae7e60b1
--- /dev/null
+++ b/tests/pipelines/test_action_detection.py
@@ -0,0 +1,29 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class ActionDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
+    """Smoke test for the action detection pipeline on a sample video."""
+
+    def setUp(self) -> None:
+        self.task = Tasks.action_detection
+        self.model_id = 'damo/cv_ResNetC3D_action-detection_detection2d'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run(self):
+        detector = pipeline(self.task, model=self.model_id)
+        result = detector('data/test/videos/action_detection_test_video.mp4')
+        print('action detection results:', result)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_action_recognition.py b/tests/pipelines/test_action_recognition.py
index e955eb60..292eb238 100644
--- a/tests/pipelines/test_action_recognition.py
+++ b/tests/pipelines/test_action_recognition.py
@@ -1,24 +1,21 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-# !/usr/bin/env python
-import os.path as osp
-import tempfile
 import unittest
 
-from modelscope.fileio import File
 from modelscope.pipelines import pipeline
-from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ActionRecognitionTest(unittest.TestCase):
+class ActionRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.action_recognition
         self.model_id = 'damo/cv_TAdaConv_action-recognition'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub(self):
-        recognition_pipeline = pipeline(
-            Tasks.action_recognition, model=self.model_id)
+        recognition_pipeline = pipeline(self.task, self.model_id)
         result = recognition_pipeline(
             'data/test/videos/action_recognition_test_video.mp4')
 
@@ -26,12 +23,24 @@ class ActionRecognitionTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
-        recognition_pipeline = pipeline(Tasks.action_recognition)
+        recognition_pipeline = pipeline(self.task)
         result = recognition_pipeline(
             'data/test/videos/action_recognition_test_video.mp4')
 
         print(f'recognition output: {result}.')
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_pst(self):
+        pst_recognition_pipeline = pipeline(
+            self.task, model='damo/cv_pathshift_action-recognition')
+        result = pst_recognition_pipeline(
+            'data/test/videos/action_recognition_test_video.mp4')
+        print('pst recognition results:', result)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_animal_recognition.py b/tests/pipelines/test_animal_recognition.py
index 3a31afed..eb9f92e6 100644
--- a/tests/pipelines/test_animal_recognition.py
+++ b/tests/pipelines/test_animal_recognition.py
@@ -1,20 +1,30 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class AnimalRecognitionTest(unittest.TestCase):
+class AnimalRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.animal_recognition
+        self.model_id = 'damo/cv_resnest101_animal_recognition'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run(self):
         animal_recognition = pipeline(
-            Tasks.animal_recognition,
-            model='damo/cv_resnest101_animal_recognition')
+            Tasks.animal_recognition, model=self.model_id)
         result = animal_recognition('data/test/images/dogs.jpg')
         print(result)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_automatic_speech_recognition.py b/tests/pipelines/test_automatic_speech_recognition.py
index 88ebcdbd..303fb6b9 100644
--- a/tests/pipelines/test_automatic_speech_recognition.py
+++ b/tests/pipelines/test_automatic_speech_recognition.py
@@ -10,24 +10,24 @@ import soundfile
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import ColorCodes, Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.logger import get_logger
 from modelscope.utils.test_utils import download_and_untar, test_level
 
 logger = get_logger()
 
 WAV_FILE = 'data/test/audios/asr_example.wav'
+URL_FILE = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example.wav'
 
 LITTLE_TESTSETS_FILE = 'data_aishell.tar.gz'
 LITTLE_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/datasets/data_aishell.tar.gz'
 
-AISHELL1_TESTSETS_FILE = 'aishell1.tar.gz'
-AISHELL1_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/datasets/aishell1.tar.gz'
-
 TFRECORD_TESTSETS_FILE = 'tfrecord.tar.gz'
 TFRECORD_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/datasets/tfrecord.tar.gz'
 
 
-class AutomaticSpeechRecognitionTest(unittest.TestCase):
+class AutomaticSpeechRecognitionTest(unittest.TestCase,
+                                     DemoCompatibilityCheck):
     action_info = {
         'test_run_with_wav_pytorch': {
             'checking_item': OutputKeys.TEXT,
@@ -45,6 +45,10 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
             'checking_item': OutputKeys.TEXT,
             'example': 'wav_example'
         },
+        'test_run_with_url_tf': {
+            'checking_item': OutputKeys.TEXT,
+            'example': 'wav_example'
+        },
         'test_run_with_wav_dataset_pytorch': {
             'checking_item': OutputKeys.TEXT,
             'example': 'dataset_example'
@@ -53,14 +57,6 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
             'checking_item': OutputKeys.TEXT,
             'example': 'dataset_example'
         },
-        'test_run_with_ark_dataset': {
-            'checking_item': OutputKeys.TEXT,
-            'example': 'dataset_example'
-        },
-        'test_run_with_tfrecord_dataset': {
-            'checking_item': OutputKeys.TEXT,
-            'example': 'dataset_example'
-        },
         'dataset_example': {
             'Wrd': 49532,  # the number of words
             'Snt': 5000,  # the number of sentences
@@ -83,6 +79,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
         self.am_tf_model_id = 'damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1'
         # this temporary workspace dir will store waveform files
         self.workspace = os.path.join(os.getcwd(), '.tmp')
+        self.task = Tasks.auto_speech_recognition
         if not os.path.exists(self.workspace):
             os.mkdir(self.workspace)
 
@@ -140,8 +137,8 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_wav_pytorch(self):
-        '''run with single waveform file
-        '''
+        """run with single waveform file
+        """
 
         logger.info('Run ASR test with waveform file (pytorch)...')
 
@@ -153,8 +150,8 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_pcm_pytorch(self):
-        '''run with wav data
-        '''
+        """run with wav data
+        """
 
         logger.info('Run ASR test with wav data (pytorch)...')
 
@@ -166,8 +163,8 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_wav_tf(self):
-        '''run with single waveform file
-        '''
+        """run with single waveform file
+        """
 
         logger.info('Run ASR test with waveform file (tensorflow)...')
 
@@ -179,8 +176,8 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_pcm_tf(self):
-        '''run with wav data
-        '''
+        """run with wav data
+        """
 
         logger.info('Run ASR test with wav data (tensorflow)...')
 
@@ -190,9 +187,20 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
             model_id=self.am_tf_model_id, audio_in=audio, sr=sr)
         self.check_result('test_run_with_pcm_tf', rec_result)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_url_tf(self):
+        """run with single url file
+        """
+
+        logger.info('Run ASR test with url file (tensorflow)...')
+
+        rec_result = self.run_pipeline(
+            model_id=self.am_tf_model_id, audio_in=URL_FILE)
+        self.check_result('test_run_with_url_tf', rec_result)
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_wav_dataset_pytorch(self):
-        '''run with datasets, and audio format is waveform
+        """run with datasets, and audio format is waveform
            datasets directory:
              <dataset_path>
                wav
@@ -207,7 +215,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
                    ...
                transcript
                  data.text  # hypothesis text
-        '''
+        """
 
         logger.info('Run ASR test with waveform dataset (pytorch)...')
         logger.info('Downloading waveform testsets file ...')
@@ -223,7 +231,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_wav_dataset_tf(self):
-        '''run with datasets, and audio format is waveform
+        """run with datasets, and audio format is waveform
            datasets directory:
              <dataset_path>
                wav
@@ -238,7 +246,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
                    ...
                transcript
                  data.text  # hypothesis text
-        '''
+        """
 
         logger.info('Run ASR test with waveform dataset (tensorflow)...')
         logger.info('Downloading waveform testsets file ...')
@@ -252,59 +260,9 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase):
             model_id=self.am_tf_model_id, audio_in=dataset_path)
         self.check_result('test_run_with_wav_dataset_tf', rec_result)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run_with_ark_dataset(self):
-        '''run with datasets, and audio format is kaldi_ark
-           datasets directory:
-             <dataset_path>
-               test   # testsets
-                 data.ark
-                 data.scp
-                 data.text
-               dev    # devsets
-                 data.ark
-                 data.scp
-                 data.text
-               train  # trainsets
-                 data.ark
-                 data.scp
-                 data.text
-        '''
-
-        logger.info('Run ASR test with ark dataset (pytorch)...')
-        logger.info('Downloading ark testsets file ...')
-
-        dataset_path = download_and_untar(
-            os.path.join(self.workspace, AISHELL1_TESTSETS_FILE),
-            AISHELL1_TESTSETS_URL, self.workspace)
-        dataset_path = os.path.join(dataset_path, 'test')
-
-        rec_result = self.run_pipeline(
-            model_id=self.am_pytorch_model_id, audio_in=dataset_path)
-        self.check_result('test_run_with_ark_dataset', rec_result)
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run_with_tfrecord_dataset(self):
-        '''run with datasets, and audio format is tfrecord
-           datasets directory:
-             <dataset_path>
-               test   # testsets
-                 data.records
-                 data.idx
-                 data.text
-        '''
-
-        logger.info('Run ASR test with tfrecord dataset (tensorflow)...')
-        logger.info('Downloading tfrecord testsets file ...')
-
-        dataset_path = download_and_untar(
-            os.path.join(self.workspace, TFRECORD_TESTSETS_FILE),
-            TFRECORD_TESTSETS_URL, self.workspace)
-        dataset_path = os.path.join(dataset_path, 'test')
-
-        rec_result = self.run_pipeline(
-            model_id=self.am_tf_model_id, audio_in=dataset_path)
-        self.check_result('test_run_with_tfrecord_dataset', rec_result)
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
 
 
 if __name__ == '__main__':
diff --git a/tests/pipelines/test_body_2d_keypoints.py b/tests/pipelines/test_body_2d_keypoints.py
index d010adc5..5d90cbf0 100644
--- a/tests/pipelines/test_body_2d_keypoints.py
+++ b/tests/pipelines/test_body_2d_keypoints.py
@@ -2,20 +2,20 @@
 import unittest
 
 import cv2
-import numpy as np
 from PIL import Image
 
-from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.cv.image_utils import draw_keypoints
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class Body2DKeypointsTest(unittest.TestCase):
+class Body2DKeypointsTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.body_2d_keypoints
         self.model_id = 'damo/cv_hrnetv2w32_body-2d-keypoints_image'
         self.test_image = 'data/test/images/keypoints_detect/000000438862.jpg'
 
@@ -26,16 +26,18 @@ class Body2DKeypointsTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_modelhub_with_image_file(self):
-        body_2d_keypoints = pipeline(
-            Tasks.body_2d_keypoints, model=self.model_id)
+        body_2d_keypoints = pipeline(self.task, model=self.model_id)
         self.pipeline_inference(body_2d_keypoints, self.test_image)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub_with_image_input(self):
-        body_2d_keypoints = pipeline(
-            Tasks.body_2d_keypoints, model=self.model_id)
+        body_2d_keypoints = pipeline(self.task, model=self.model_id)
         self.pipeline_inference(body_2d_keypoints, Image.open(self.test_image))
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_body_3d_keypoints.py b/tests/pipelines/test_body_3d_keypoints.py
new file mode 100644
index 00000000..bde04f8e
--- /dev/null
+++ b/tests/pipelines/test_body_3d_keypoints.py
@@ -0,0 +1,66 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import cv2
+import numpy as np
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class Body3DKeypointsTest(unittest.TestCase, DemoCompatibilityCheck):
+    """Pipeline tests for the body 3D keypoints task."""
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_canonical_body-3d-keypoints_video'
+        self.test_video = 'data/test/videos/Walking.54138969.mp4'
+        self.task = Tasks.body_3d_keypoints
+
+    def pipeline_inference(self, pipeline: Pipeline, pipeline_input):
+        """Run ``pipeline`` on ``pipeline_input`` and print the pose shape."""
+        output = pipeline(pipeline_input)
+        poses = np.array(output[OutputKeys.POSES])
+        print(f'result 3d points shape {poses.shape}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub_with_video_file(self):
+        """Run the pipeline with ``self.model_id`` on a video file path."""
+        body_3d_keypoints = pipeline(self.task, model=self.model_id)
+        pipeline_input = {
+            'input_video': self.test_video,
+            'output_video_path': './result.mp4'
+        }
+        self.pipeline_inference(
+            body_3d_keypoints, pipeline_input=pipeline_input)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub_with_video_stream(self):
+        """Run the pipeline built from the task alone on a cv2.VideoCapture."""
+        body_3d_keypoints = pipeline(self.task)
+        cap = cv2.VideoCapture(self.test_video)
+        # Release the capture handle on every exit path, including failures.
+        try:
+            if not cap.isOpened():
+                raise Exception(
+                    'modelscope error: %s cannot be decoded by OpenCV.' %
+                    (self.test_video))
+            pipeline_input = {
+                'input_video': cap,
+                'output_video_path': './result.mp4'
+            }
+            self.pipeline_inference(
+                body_3d_keypoints, pipeline_input=pipeline_input)
+        finally:
+            cap.release()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_cmdssl_video_embedding.py b/tests/pipelines/test_cmdssl_video_embedding.py
index 694ebf40..5807c075 100644
--- a/tests/pipelines/test_cmdssl_video_embedding.py
+++ b/tests/pipelines/test_cmdssl_video_embedding.py
@@ -1,23 +1,31 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 # !/usr/bin/env python
 import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class CMDSSLVideoEmbeddingTest(unittest.TestCase):
+class CMDSSLVideoEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.video_embedding
+        self.model_id = 'damo/cv_r2p1d_video_embedding'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub(self):
-        videossl_pipeline = pipeline(
-            Tasks.video_embedding, model='damo/cv_r2p1d_video_embedding')
+        videossl_pipeline = pipeline(task=self.task, model=self.model_id)
         result = videossl_pipeline(
             'data/test/videos/action_recognition_test_video.mp4')
 
         print(f'video embedding output: {result}.')
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_conversational_text_to_sql.py b/tests/pipelines/test_conversational_text_to_sql.py
index 0504cb7c..80c72337 100644
--- a/tests/pipelines/test_conversational_text_to_sql.py
+++ b/tests/pipelines/test_conversational_text_to_sql.py
@@ -1,6 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import unittest
-from typing import List
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
@@ -9,11 +8,17 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import ConversationalTextToSqlPipeline
 from modelscope.preprocessors import ConversationalTextToSqlPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.nlp.nlp_utils import text2sql_tracking_and_print_results
 from modelscope.utils.test_utils import test_level
 
 
-class ConversationalTextToSql(unittest.TestCase):
+class ConversationalTextToSql(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.conversational_text_to_sql
+        self.model_id = 'damo/nlp_star_conversational-text-to-sql'
+
     model_id = 'damo/nlp_star_conversational-text-to-sql'
     test_case = {
         'database_id':
@@ -39,10 +44,7 @@ class ConversationalTextToSql(unittest.TestCase):
         pipelines = [
             ConversationalTextToSqlPipeline(
                 model=model, preprocessor=preprocessor),
-            pipeline(
-                task=Tasks.conversational_text_to_sql,
-                model=model,
-                preprocessor=preprocessor)
+            pipeline(task=self.task, model=model, preprocessor=preprocessor)
         ]
         text2sql_tracking_and_print_results(self.test_case, pipelines)
 
@@ -55,26 +57,24 @@ class ConversationalTextToSql(unittest.TestCase):
         pipelines = [
             ConversationalTextToSqlPipeline(
                 model=model, preprocessor=preprocessor),
-            pipeline(
-                task=Tasks.conversational_text_to_sql,
-                model=model,
-                preprocessor=preprocessor)
+            pipeline(task=self.task, model=model, preprocessor=preprocessor)
         ]
         text2sql_tracking_and_print_results(self.test_case, pipelines)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
-        pipelines = [
-            pipeline(
-                task=Tasks.conversational_text_to_sql, model=self.model_id)
-        ]
+        pipelines = [pipeline(task=self.task, model=self.model_id)]
         text2sql_tracking_and_print_results(self.test_case, pipelines)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
-        pipelines = [pipeline(task=Tasks.conversational_text_to_sql)]
+        pipelines = [pipeline(task=self.task)]
         text2sql_tracking_and_print_results(self.test_case, pipelines)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_crowd_counting.py b/tests/pipelines/test_crowd_counting.py
index 99f5ffd2..4e15cfca 100644
--- a/tests/pipelines/test_crowd_counting.py
+++ b/tests/pipelines/test_crowd_counting.py
@@ -8,17 +8,19 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.cv.image_utils import numpy_to_cv2img
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.logger import get_logger
 from modelscope.utils.test_utils import test_level
 
 logger = get_logger()
 
 
-class CrowdCountingTest(unittest.TestCase):
+class CrowdCountingTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.input_location = 'data/test/images/crowd_counting.jpg'
         self.model_id = 'damo/cv_hrnet_crowd-counting_dcanet'
+        self.task = Tasks.crowd_counting
 
     def save_result(self, result):
         print('scores:', result[OutputKeys.SCORES])
@@ -28,7 +30,7 @@ class CrowdCountingTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_crowd_counting(self):
-        crowd_counting = pipeline(Tasks.crowd_counting, model=self.model_id)
+        crowd_counting = pipeline(task=self.task, model=self.model_id)
         result = crowd_counting(self.input_location)
         if result:
             self.save_result(result)
@@ -37,7 +39,7 @@ class CrowdCountingTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_crowd_counting_with_image(self):
-        crowd_counting = pipeline(Tasks.crowd_counting, model=self.model_id)
+        crowd_counting = pipeline(task=self.task, model=self.model_id)
         img = Image.open(self.input_location)
         result = crowd_counting(img)
         if result:
@@ -47,13 +49,17 @@ class CrowdCountingTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_crowd_counting_with_default_task(self):
-        crowd_counting = pipeline(Tasks.crowd_counting)
+        crowd_counting = pipeline(self.task)
         result = crowd_counting(self.input_location)
         if result:
             self.save_result(result)
         else:
             raise ValueError('process error')
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_csanmt_translation.py b/tests/pipelines/test_csanmt_translation.py
index c852b1ff..f7ec81cd 100644
--- a/tests/pipelines/test_csanmt_translation.py
+++ b/tests/pipelines/test_csanmt_translation.py
@@ -3,22 +3,38 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class TranslationTest(unittest.TestCase):
-    model_id = 'damo/nlp_csanmt_translation_zh2en'
-    inputs = '声明 补充 说 ， 沃伦 的 同事 都 深感 震惊 ， 并且 希望 他 能够 投@@ 案@@ 自@@ 首 。'
+class TranslationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.translation
+        self.model_id = 'damo/nlp_csanmt_translation_zh2en'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_run_with_model_name(self):
-        pipeline_ins = pipeline(task=Tasks.translation, model=self.model_id)
-        print(pipeline_ins(input=self.inputs))
+    def test_run_with_model_name_for_zh2en(self):
+        inputs = '声明补充说，沃伦的同事都深感震惊，并且希望他能够投案自首。'
+        pipeline_ins = pipeline(self.task, model=self.model_id)
+        print(pipeline_ins(input=inputs))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name_for_en2zh(self):
+        model_id = 'damo/nlp_csanmt_translation_en2zh'
+        inputs = 'Elon Musk, co-founder and chief executive officer of Tesla Motors.'
+        pipeline_ins = pipeline(self.task, model=model_id)
+        print(pipeline_ins(input=inputs))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
-        pipeline_ins = pipeline(task=Tasks.translation)
-        print(pipeline_ins(input=self.inputs))
+        inputs = '声明补充说，沃伦的同事都深感震惊，并且希望他能够投案自首。'
+        pipeline_ins = pipeline(self.task)
+        print(pipeline_ins(input=inputs))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
 
 
 if __name__ == '__main__':
diff --git a/tests/pipelines/test_deberta_tasks.py b/tests/pipelines/test_deberta_tasks.py
new file mode 100644
index 00000000..549d2cb3
--- /dev/null
+++ b/tests/pipelines/test_deberta_tasks.py
@@ -0,0 +1,60 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import torch
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.nlp import DebertaV2ForMaskedLM
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import FillMaskPipeline
+from modelscope.preprocessors import NLPPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class DeBERTaV2TaskTest(unittest.TestCase):
+    model_id_deberta = 'damo/nlp_debertav2_fill-mask_chinese-lite'
+
+    ori_text = '你师父差得动你，你师父可差不动我。'
+    test_input = '你师父差得动你，你师父可[MASK]不动我。'
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        model_dir = snapshot_download(self.model_id_deberta)
+        preprocessor = NLPPreprocessor(
+            model_dir, first_sequence='sentence', second_sequence=None)
+        model = DebertaV2ForMaskedLM.from_pretrained(model_dir)
+        pipeline1 = FillMaskPipeline(model, preprocessor)
+        pipeline2 = pipeline(
+            Tasks.fill_mask, model=model, preprocessor=preprocessor)
+        ori_text = self.ori_text
+        test_input = self.test_input
+        print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: '
+              f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        # deberta
+        print(self.model_id_deberta)
+        model = Model.from_pretrained(self.model_id_deberta)
+        preprocessor = NLPPreprocessor(
+            model.model_dir, first_sequence='sentence', second_sequence=None)
+        pipeline_ins = pipeline(
+            task=Tasks.fill_mask, model=model, preprocessor=preprocessor)
+        print(
+            f'\nori_text: {self.ori_text}\ninput: {self.test_input}\npipeline: '
+            f'{pipeline_ins(self.test_input)}\n')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.fill_mask, model=self.model_id_deberta)
+        ori_text = self.ori_text
+        test_input = self.test_input
+        print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
+              f'{pipeline_ins(test_input)}\n')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_dialog_intent_prediction.py b/tests/pipelines/test_dialog_intent_prediction.py
index afd68442..5894297f 100644
--- a/tests/pipelines/test_dialog_intent_prediction.py
+++ b/tests/pipelines/test_dialog_intent_prediction.py
@@ -8,11 +8,16 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import DialogIntentPredictionPipeline
 from modelscope.preprocessors import DialogIntentPredictionPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class DialogIntentPredictionTest(unittest.TestCase):
-    model_id = 'damo/nlp_space_dialog-intent-prediction'
+class DialogIntentPredictionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.task_oriented_conversation
+        self.model_id = 'damo/nlp_space_dialog-intent-prediction'
+
     test_case = [
         'How do I locate my card?',
         'I still have not received my new card, I ordered over a week ago.'
@@ -61,13 +66,15 @@ class DialogIntentPredictionTest(unittest.TestCase):
     def test_run_with_model_name(self):
         pipelines = [
             pipeline(
-                task=Tasks.task_oriented_conversation,
-                model=self.model_id,
-                model_revision='update')
+                task=self.task, model=self.model_id, model_revision='update')
         ]
         for my_pipeline, item in list(zip(pipelines, self.test_case)):
             print(my_pipeline(item))
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_dialog_modeling.py b/tests/pipelines/test_dialog_modeling.py
index 299af2e9..19d6ed2f 100644
--- a/tests/pipelines/test_dialog_modeling.py
+++ b/tests/pipelines/test_dialog_modeling.py
@@ -10,11 +10,16 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import DialogModelingPipeline
 from modelscope.preprocessors import DialogModelingPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class DialogModelingTest(unittest.TestCase):
-    model_id = 'damo/nlp_space_dialog-modeling'
+class DialogModelingTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.task_oriented_conversation
+        self.model_id = 'damo/nlp_space_dialog-modeling'
+
     test_case = {
         'sng0073': {
             'goal': {
@@ -139,7 +144,7 @@ class DialogModelingTest(unittest.TestCase):
     def test_run_with_model_name(self):
         pipelines = [
             pipeline(
-                task=Tasks.task_oriented_conversation,
+                task=self.task,
                 model=self.model_id,
                 model_revision='task_oriented_conversation')
         ]
@@ -149,11 +154,14 @@ class DialogModelingTest(unittest.TestCase):
     def test_run_with_default_model(self):
         pipelines = [
             pipeline(
-                task=Tasks.task_oriented_conversation,
-                model_revision='task_oriented_conversation')
+                task=self.task, model_revision='task_oriented_conversation')
         ]
         self.generate_and_print_dialog_response(pipelines)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_dialog_state_tracking.py b/tests/pipelines/test_dialog_state_tracking.py
index 843aade9..81bdd9be 100644
--- a/tests/pipelines/test_dialog_state_tracking.py
+++ b/tests/pipelines/test_dialog_state_tracking.py
@@ -8,12 +8,17 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import DialogStateTrackingPipeline
 from modelscope.preprocessors import DialogStateTrackingPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.nlp.nlp_utils import tracking_and_print_dialog_states
 from modelscope.utils.test_utils import test_level
 
 
-class DialogStateTrackingTest(unittest.TestCase):
-    model_id = 'damo/nlp_space_dialog-state-tracking'
+class DialogStateTrackingTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.task_oriented_conversation
+        self.model_id = 'damo/nlp_space_dialog-state-tracking'
+
     test_case = [{
         'User-1':
         'Hi, I\'m looking for a train that is going to cambridge and arriving there by 20:45, '
@@ -103,10 +108,7 @@ class DialogStateTrackingTest(unittest.TestCase):
         pipelines = [
             DialogStateTrackingPipeline(
                 model=model, preprocessor=preprocessor),
-            pipeline(
-                task=Tasks.task_oriented_conversation,
-                model=model,
-                preprocessor=preprocessor)
+            pipeline(task=self.task, model=model, preprocessor=preprocessor)
         ]
 
         tracking_and_print_dialog_states(self.test_case, pipelines)
@@ -115,12 +117,14 @@ class DialogStateTrackingTest(unittest.TestCase):
     def test_run_with_model_name(self):
         pipelines = [
             pipeline(
-                task=Tasks.task_oriented_conversation,
-                model=self.model_id,
-                model_revision='update')
+                task=self.task, model=self.model_id, model_revision='update')
         ]
         tracking_and_print_dialog_states(self.test_case, pipelines)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_document_segmentation.py b/tests/pipelines/test_document_segmentation.py
new file mode 100644
index 00000000..b4406fef
--- /dev/null
+++ b/tests/pipelines/test_document_segmentation.py
@@ -0,0 +1,63 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+from typing import Any, Dict
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+
+logger = get_logger()
+
+
+class DocumentSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.document_segmentation
+        self.model_id = 'damo/nlp_bert_document-segmentation_chinese-base'
+
+    model_id = 'damo/nlp_bert_document-segmentation_chinese-base'
+    eng_model_id = 'damo/nlp_bert_document-segmentation_english-base'
+    sentences = '近年来，随着端到端语音识别的流行，基于Transformer结构的语音识别系统逐渐成为了主流。然而，由于Transformer是一种自回归模型，需要逐个生成目标文字，计算复杂度随着目标文字数量线性增加，限制了其在工业生产中的应用。针对Transoformer模型自回归生成文字的低计算效率缺陷，学术界提出了非自回归模型来并行的输出目标文字。根据生成目标文字时，迭代轮数，非自回归模型分为：多轮迭代式与单轮迭代非自回归模型。其中实用的是基于单轮迭代的非自回归模型。对于单轮非自回归模型，现有工作往往聚焦于如何更加准确的预测目标文字个数，如CTC-enhanced采用CTC预测输出文字个数，尽管如此，考虑到现实应用中，语速、口音、静音以及噪声等因素的影响，如何准确的预测目标文字个数以及抽取目标文字对应的声学隐变量仍然是一个比较大的挑战；另外一方面，我们通过对比自回归模型与单轮非自回归模型在工业大数据上的错误类型（如下图所示，AR与vanilla NAR），发现，相比于自回归模型，非自回归模型，在预测目标文字个数方面差距较小，但是替换错误显著的增加，我们认为这是由于单轮非自回归模型中条件独立假设导致的语义信息丢失。于此同时，目前非自回归模型主要停留在学术验证阶段，还没有工业大数据上的相关实验与结论。'  # noqa *
+    sentences_1 = '移动端语音唤醒模型，检测关键词为“小云小云”。模型主体为4层FSMN结构，使用CTC训练准则，参数量750K，适用于移动端设备运行。模型输入为Fbank特征，输出为基于char建模的中文全集token预测，测试工具根据每一帧的预测数据进行后处理得到输入音频的实时检测结果。模型训练采用“basetrain + finetune”的模式，basetrain过程使用大量内部移动端数据，在此基础上，使用1万条设备端录制安静场景“小云小云”数据进行微调，得到最终面向业务的模型。后续用户可在basetrain模型基础上，使用其他关键词数据进行微调，得到新的语音唤醒模型，但暂时未开放模型finetune功能。'  # noqa *
+    eng_sentences = 'The Saint Alexander Nevsky Church was established in 1936 by Archbishop Vitaly (Maximenko) () on a tract of land donated by Yulia Martinovna Plavskaya.The initial chapel, dedicated to the memory of the great prince St. Alexander Nevsky (1220–1263), was blessed in May, 1936.The church building was subsequently expanded three times.In 1987, ground was cleared for the construction of the new church and on September 12, 1989, on the Feast Day of St. Alexander Nevsky, the cornerstone was laid and the relics of St. Herman of Alaska placed in the foundation.The imposing edifice, completed in 1997, is the work of Nikolaus Karsanov, architect and Protopresbyter Valery Lukianov, engineer.Funds were raised through donations.The Great blessing of the cathedral took place on October 18, 1997 with seven bishops, headed by Metropolitan Vitaly Ustinov, and 36 priests and deacons officiating, some 800 faithful attended the festivity.The old church was rededicated to Our Lady of Tikhvin.Metropolitan Hilarion (Kapral) announced, that cathedral will officially become the episcopal See of the Ruling Bishop of the Eastern American Diocese and the administrative center of the Diocese on September 12, 2014.At present the parish serves the spiritual needs of 300 members.The parochial school instructs over 90 boys and girls in religion, Russian language and history.The school meets every Saturday.The choir is directed by Andrew Burbelo.The sisterhood attends to the needs of the church and a church council acts in the administration of the community.The cathedral is decorated by frescoes in the Byzantine style.The iconography project was fulfilled by Father Andrew Erastov and his students from 1995 until 2001.'  # noqa *
+
+    def run_pipeline(self, model_id: str, documents: str) -> Dict[str, Any]:
+        p = pipeline(task=self.task, model=model_id)
+        result = p(documents=documents)
+        return result
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_document(self):
+        logger.info('Run document segmentation with one document ...')
+
+        result = self.run_pipeline(
+            model_id=self.model_id, documents=self.sentences)
+        print(result[OutputKeys.TEXT])
+
+        result = self.run_pipeline(
+            model_id=self.eng_model_id, documents=self.eng_sentences)
+        print(result[OutputKeys.TEXT])
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_documents(self):
+        logger.info('Run document segmentation with many documents ...')
+
+        result = self.run_pipeline(
+            model_id=self.model_id,
+            documents=[self.sentences, self.sentences_1])
+
+        documents_list = result[OutputKeys.TEXT]
+        for document in documents_list:
+            print(document)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_face_2d_keypoints.py b/tests/pipelines/test_face_2d_keypoints.py
new file mode 100644
index 00000000..667ecddc
--- /dev/null
+++ b/tests/pipelines/test_face_2d_keypoints.py
@@ -0,0 +1,36 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import cv2
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase):
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_face_2d_keypoints(self):
+        img_path = 'data/test/images/keypoints_detect/test_img_face_2d_keypoints.png'
+        model_id = 'damo/cv_mobilenet_face-2d-keypoints_alignment'
+
+        face_2d_keypoints_align = pipeline(
+            task=Tasks.face_2d_keypoints, model=model_id)
+        output = face_2d_keypoints_align(img_path)[0]
+
+        output_keypoints = output[OutputKeys.KEYPOINTS]
+        output_pose = output[OutputKeys.POSES]
+
+        img = cv2.imread(img_path)
+        img = face_2d_keypoints_align.show_result(
+            img, output_keypoints, scale=2, save_path='face_keypoints.jpg')
+
+        self.assertEqual(output_keypoints.shape[0], 106)
+        self.assertEqual(output_keypoints.shape[1], 2)
+        self.assertEqual(output_pose.shape[0], 3)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_face_detection.py b/tests/pipelines/test_face_detection.py
index 03dd75a6..f89e9a94 100644
--- a/tests/pipelines/test_face_detection.py
+++ b/tests/pipelines/test_face_detection.py
@@ -3,19 +3,19 @@ import os.path as osp
 import unittest
 
 import cv2
-import numpy as np
 
 from modelscope.msdatasets import MsDataset
-from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.cv.image_utils import draw_face_detection_result
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class FaceDetectionTest(unittest.TestCase):
+class FaceDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.face_detection
         self.model_id = 'damo/cv_resnet_facedetection_scrfd10gkps'
 
     def show_result(self, img_path, detection_result):
@@ -49,6 +49,10 @@ class FaceDetectionTest(unittest.TestCase):
         result = face_detection(img_path)
         self.show_result(img_path, result)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_face_emotion.py b/tests/pipelines/test_face_emotion.py
new file mode 100644
index 00000000..907e15ee
--- /dev/null
+++ b/tests/pipelines/test_face_emotion.py
@@ -0,0 +1,37 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class FaceEmotionTest(unittest.TestCase):
+    """Smoke tests that run the face-emotion pipeline on one test image."""
+
+    def setUp(self) -> None:
+        self.model = 'damo/cv_face-emotion'
+        # The pipeline input is a dict carrying the image path, not a str.
+        self.img = {'img_path': 'data/test/images/face_emotion.jpg'}
+
+    def pipeline_inference(self, pipeline: Pipeline, input: dict):
+        """Run ``pipeline`` on ``input`` and print whatever it returns."""
+        result = pipeline(input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        # Build the pipeline with the explicit model id from setUp.
+        face_emotion = pipeline(Tasks.face_emotion, model=self.model)
+        self.pipeline_inference(face_emotion, self.img)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_modelhub_default_model(self):
+        # No model argument is passed; only the task name selects the model.
+        face_emotion = pipeline(Tasks.face_emotion)
+        self.pipeline_inference(face_emotion, self.img)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_face_human_hand_detection.py b/tests/pipelines/test_face_human_hand_detection.py
new file mode 100644
index 00000000..7aaa67e7
--- /dev/null
+++ b/tests/pipelines/test_face_human_hand_detection.py
@@ -0,0 +1,42 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+
+logger = get_logger()
+
+
+class FaceHumanHandTest(unittest.TestCase):
+    """Smoke tests that run face/human/hand detection on one test image."""
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_nanodet_face-human-hand-detection'
+        self.input = {
+            'input_path': 'data/test/images/face_human_hand_detection.jpg',
+        }
+
+    def pipeline_inference(self, pipeline: Pipeline, input: dict):
+        """Run ``pipeline`` on ``input`` and log whatever it returns."""
+        result = pipeline(input)
+        logger.info(result)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        # Build the pipeline with the explicit model id from setUp.
+        face_human_hand_detection = pipeline(
+            Tasks.face_human_hand_detection, model=self.model_id)
+        self.pipeline_inference(face_human_hand_detection, self.input)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_modelhub_default_model(self):
+        # No model argument is passed; only the task name selects the model.
+        face_human_hand_detection = pipeline(Tasks.face_human_hand_detection)
+        self.pipeline_inference(face_human_hand_detection, self.input)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_face_image_generation.py b/tests/pipelines/test_face_image_generation.py
index c758ea3a..21d8e835 100644
--- a/tests/pipelines/test_face_image_generation.py
+++ b/tests/pipelines/test_face_image_generation.py
@@ -8,12 +8,14 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class FaceGenerationTest(unittest.TestCase):
+class FaceGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.face_image_generation
         self.model_id = 'damo/cv_gan_face-image-generation'
 
     def pipeline_inference(self, pipeline: Pipeline, seed: int):
@@ -26,7 +28,7 @@ class FaceGenerationTest(unittest.TestCase):
     def test_run_modelhub(self):
         seed = 10
         face_generation = pipeline(
-            Tasks.face_image_generation,
+            self.task,
             model=self.model_id,
         )
         self.pipeline_inference(face_generation, seed)
@@ -34,9 +36,13 @@ class FaceGenerationTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
         seed = 10
-        face_generation = pipeline(Tasks.face_image_generation)
+        face_generation = pipeline(self.task)
         self.pipeline_inference(face_generation, seed)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_face_recognition.py b/tests/pipelines/test_face_recognition.py
index 015205d6..d3451f5d 100644
--- a/tests/pipelines/test_face_recognition.py
+++ b/tests/pipelines/test_face_recognition.py
@@ -6,12 +6,14 @@ import numpy as np
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class FaceRecognitionTest(unittest.TestCase):
+class FaceRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.face_recognition
         self.model_id = 'damo/cv_ir101_facerecognition_cfglint'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@@ -26,6 +28,10 @@ class FaceRecognitionTest(unittest.TestCase):
         sim = np.dot(emb1[0], emb2[0])
         print(f'Cos similarity={sim:.3f}, img1:{img1}  img2:{img2}')
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_facial_expression_recognition.py b/tests/pipelines/test_facial_expression_recognition.py
new file mode 100644
index 00000000..fff83ad6
--- /dev/null
+++ b/tests/pipelines/test_facial_expression_recognition.py
@@ -0,0 +1,36 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+import cv2
+import numpy as np
+
+from modelscope.msdatasets import MsDataset
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import draw_facial_expression_result
+from modelscope.utils.test_utils import test_level
+
+
+class FacialExpressionRecognitionTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_vgg19_facial-expression-recognition_fer'
+
+    def show_result(self, img_path, facial_expression_result):
+        img = draw_facial_expression_result(img_path, facial_expression_result)
+        cv2.imwrite('result.png', img)
+        print(f'output written to {osp.abspath("result.png")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        fer = pipeline(
+            Tasks.facial_expression_recognition, model=self.model_id)
+        img_path = 'data/test/images/facial_expression_recognition.jpg'
+        result = fer(img_path)
+        self.show_result(img_path, result)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_faq_question_answering.py b/tests/pipelines/test_faq_question_answering.py
new file mode 100644
index 00000000..7eea0ddf
--- /dev/null
+++ b/tests/pipelines/test_faq_question_answering.py
@@ -0,0 +1,94 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import numpy as np
+
+from modelscope.hub.api import HubApi
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.nlp import SbertForFaqQuestionAnswering
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import FaqQuestionAnsweringPipeline
+from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class FaqQuestionAnsweringTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.faq_question_answering
+        self.model_id = 'damo/nlp_structbert_faq-question-answering_chinese-base'
+
+    param = {
+        'query_set': ['如何使用优惠券', '在哪里领券', '在哪里领券'],
+        'support_set': [{
+            'text': '卖品代金券怎么用',
+            'label': '6527856'
+        }, {
+            'text': '怎么使用优惠券',
+            'label': '6527856'
+        }, {
+            'text': '这个可以一起领吗',
+            'label': '1000012000'
+        }, {
+            'text': '付款时送的优惠券哪里领',
+            'label': '1000012000'
+        }, {
+            'text': '购物等级怎么长',
+            'label': '13421097'
+        }, {
+            'text': '购物等级二心',
+            'label': '13421097'
+        }]
+    }
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_direct_file_download(self):
+        cache_path = snapshot_download(self.model_id)
+        preprocessor = FaqQuestionAnsweringPreprocessor(cache_path)
+        model = SbertForFaqQuestionAnswering(cache_path)
+        model.load_checkpoint(cache_path)
+        pipeline_ins = FaqQuestionAnsweringPipeline(
+            model, preprocessor=preprocessor)
+        result = pipeline_ins(self.param)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        preprocessor = FaqQuestionAnsweringPreprocessor(model.model_dir)
+        pipeline_ins = pipeline(
+            task=Tasks.faq_question_answering,
+            model=model,
+            preprocessor=preprocessor)
+        result = pipeline_ins(self.param)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.faq_question_answering, model=self.model_id)
+        result = pipeline_ins(self.param)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.faq_question_answering)
+        print(pipeline_ins(self.param, max_seq_length=20))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_sentence_embedding(self):
+        pipeline_ins = pipeline(task=Tasks.faq_question_answering)
+        sentence_vec = pipeline_ins.get_sentence_embedding(
+            ['今天星期六', '明天星期几明天星期几'])
+        print(np.shape(sentence_vec))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_feature_extraction.py b/tests/pipelines/test_feature_extraction.py
new file mode 100644
index 00000000..39291e76
--- /dev/null
+++ b/tests/pipelines/test_feature_extraction.py
@@ -0,0 +1,67 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import numpy as np
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.nlp import FeatureExtractionModel
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import FeatureExtractionPipeline
+from modelscope.preprocessors import NLPPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class FeatureExtractionTaskModelTest(unittest.TestCase,
+                                     DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.feature_extraction
+        self.model_id = 'damo/pert_feature-extraction_base-test'
+
+    sentence1 = '测试embedding'
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_direct_file_download(self):
+        cache_path = snapshot_download(self.model_id)
+        tokenizer = NLPPreprocessor(cache_path, padding=False)
+        model = FeatureExtractionModel.from_pretrained(self.model_id)
+        pipeline1 = FeatureExtractionPipeline(model, preprocessor=tokenizer)
+        pipeline2 = pipeline(
+            Tasks.feature_extraction, model=model, preprocessor=tokenizer)
+        result = pipeline1(input=self.sentence1)
+        # Print the embedding shape from each construction path in turn.
+        print(f'sentence1: {self.sentence1}\n'
+              f'pipeline1:{np.shape(result[OutputKeys.TEXT_EMBEDDING])}')
+        result = pipeline2(input=self.sentence1)
+        print(f'sentence1: {self.sentence1}\n'
+              f'pipeline2: {np.shape(result[OutputKeys.TEXT_EMBEDDING])}')
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        tokenizer = NLPPreprocessor(model.model_dir, padding=False)
+        pipeline_ins = pipeline(
+            task=Tasks.feature_extraction, model=model, preprocessor=tokenizer)
+        result = pipeline_ins(input=self.sentence1)
+        print(np.shape(result[OutputKeys.TEXT_EMBEDDING]))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.feature_extraction, model=self.model_id)
+        result = pipeline_ins(input=self.sentence1)
+        print(np.shape(result[OutputKeys.TEXT_EMBEDDING]))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.feature_extraction)
+        result = pipeline_ins(input=self.sentence1)
+        print(np.shape(result[OutputKeys.TEXT_EMBEDDING]))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py
index 2f57b2d8..0e5e242b 100644
--- a/tests/pipelines/test_fill_mask.py
+++ b/tests/pipelines/test_fill_mask.py
@@ -1,18 +1,27 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import unittest
 
+from regex import R
+
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
 from modelscope.models.nlp import (BertForMaskedLM, StructBertForMaskedLM,
                                    VecoForMaskedLM)
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import FillMaskPipeline
-from modelscope.preprocessors import FillMaskPreprocessor
+from modelscope.preprocessors import NLPPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.regress_test_utils import MsRegressTool
 from modelscope.utils.test_utils import test_level
 
 
-class FillMaskTest(unittest.TestCase):
+class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.fill_mask
+        self.model_id = 'damo/nlp_veco_fill-mask-large'
+
     model_id_sbert = {
         'zh': 'damo/nlp_structbert_fill-mask_chinese-large',
         'en': 'damo/nlp_structbert_fill-mask_english-large'
@@ -37,13 +46,14 @@ class FillMaskTest(unittest.TestCase):
         'Everything in [MASK] you call reality is really [MASK] a reflection of your '
         '[MASK]. Your [MASK] universe is just a mirror [MASK] of your story.'
     }
+    regress_tool = MsRegressTool(baseline=False)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_by_direct_model_download(self):
         # sbert
-        for language in ['zh', 'en']:
+        for language in ['zh']:
             model_dir = snapshot_download(self.model_id_sbert[language])
-            preprocessor = FillMaskPreprocessor(
+            preprocessor = NLPPreprocessor(
                 model_dir, first_sequence='sentence', second_sequence=None)
             model = StructBertForMaskedLM.from_pretrained(model_dir)
             pipeline1 = FillMaskPipeline(model, preprocessor)
@@ -58,7 +68,7 @@ class FillMaskTest(unittest.TestCase):
 
         # veco
         model_dir = snapshot_download(self.model_id_veco)
-        preprocessor = FillMaskPreprocessor(
+        preprocessor = NLPPreprocessor(
             model_dir, first_sequence='sentence', second_sequence=None)
         model = VecoForMaskedLM.from_pretrained(model_dir)
         pipeline1 = FillMaskPipeline(model, preprocessor)
@@ -72,12 +82,12 @@ class FillMaskTest(unittest.TestCase):
                 f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n'
             )
 
-        # zh bert
+        # bert
         language = 'zh'
-        model_dir = snapshot_download(self.model_id_bert)
-        preprocessor = FillMaskPreprocessor(
+        model_dir = snapshot_download(self.model_id_bert, revision='beta')
+        preprocessor = NLPPreprocessor(
             model_dir, first_sequence='sentence', second_sequence=None)
-        model = BertForMaskedLM.from_pretrained(model_dir)
+        model = Model.from_pretrained(model_dir)
         pipeline1 = FillMaskPipeline(model, preprocessor)
         pipeline2 = pipeline(
             Tasks.fill_mask, model=model, preprocessor=preprocessor)
@@ -88,43 +98,49 @@ class FillMaskTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
+
         # sbert
-        for language in ['zh', 'en']:
+        for language in ['zh']:
             print(self.model_id_sbert[language])
             model = Model.from_pretrained(self.model_id_sbert[language])
-            preprocessor = FillMaskPreprocessor(
+            preprocessor = NLPPreprocessor(
                 model.model_dir,
                 first_sequence='sentence',
                 second_sequence=None)
             pipeline_ins = pipeline(
                 task=Tasks.fill_mask, model=model, preprocessor=preprocessor)
-            print(
-                f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
-                f'{pipeline_ins(self.test_inputs[language])}\n')
+            with self.regress_tool.monitor_module_single_forward(
+                    pipeline_ins.model, f'fill_mask_sbert_{language}'):
+                print(
+                    f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
+                    f'{pipeline_ins(self.test_inputs[language])}\n')
 
         # veco
         model = Model.from_pretrained(self.model_id_veco)
-        preprocessor = FillMaskPreprocessor(
+        preprocessor = NLPPreprocessor(
             model.model_dir, first_sequence='sentence', second_sequence=None)
         pipeline_ins = pipeline(
             Tasks.fill_mask, model=model, preprocessor=preprocessor)
         for language in ['zh', 'en']:
             ori_text = self.ori_texts[language]
             test_input = self.test_inputs[language].replace('[MASK]', '<mask>')
-            print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
-                  f'{pipeline_ins(test_input)}\n')
+            with self.regress_tool.monitor_module_single_forward(
+                    pipeline_ins.model, f'fill_mask_veco_{language}'):
+                print(
+                    f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
+                    f'{pipeline_ins(test_input)}\n')
 
-        # zh bert
-        model = Model.from_pretrained(self.model_id_bert)
-        preprocessor = FillMaskPreprocessor(
+        # bert
+        language = 'zh'
+        model = Model.from_pretrained(self.model_id_bert, revision='beta')
+        preprocessor = NLPPreprocessor(
             model.model_dir, first_sequence='sentence', second_sequence=None)
         pipeline_ins = pipeline(
             Tasks.fill_mask, model=model, preprocessor=preprocessor)
-        language = 'zh'
-        ori_text = self.ori_texts[language]
-        test_input = self.test_inputs[language]
-        print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
-              f'{pipeline_ins(test_input)}\n')
+        # Unlike sbert/veco above, bert is not wrapped in regress_tool.
+        print(
+            f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
+            f'{pipeline_ins(self.test_inputs[language])}\n')
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
@@ -144,8 +160,12 @@ class FillMaskTest(unittest.TestCase):
             f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
             f'{pipeline_ins(self.test_inputs[language])}\n')
 
-        # bert
-        pipeline_ins = pipeline(task=Tasks.fill_mask, model=self.model_id_bert)
+        # Bert
+        language = 'zh'
+        pipeline_ins = pipeline(
+            task=Tasks.fill_mask,
+            model=self.model_id_bert,
+            model_revision='beta')
         print(
             f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
             f'{pipeline_ins(self.test_inputs[language])}\n')
@@ -159,6 +179,10 @@ class FillMaskTest(unittest.TestCase):
         print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
               f'{pipeline_ins(test_input)}\n')
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_fill_mask_ponet.py b/tests/pipelines/test_fill_mask_ponet.py
new file mode 100644
index 00000000..707cc201
--- /dev/null
+++ b/tests/pipelines/test_fill_mask_ponet.py
@@ -0,0 +1,48 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.metainfo import Pipelines
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class FillMaskPonetTest(unittest.TestCase):
+    """Runs the PoNet fill-mask pipeline on a zh and an en sample (no asserts)."""
+
+    model_id_ponet = {
+        'zh': 'damo/nlp_ponet_fill-mask_chinese-base',
+        'en': 'damo/nlp_ponet_fill-mask_english-base'
+    }
+
+    ori_texts = {
+        'zh':
+        '段誉轻挥折扇，摇了摇头，说道：“你师父是你的师父，你师父可不是我的师父。'
+        '你师父差得动你，你师父可差不动我。',
+        'en':
+        'Everything in what you call reality is really just a reflection of your '
+        'consciousness. Your whole universe is just a mirror reflection of your story.'
+    }
+
+    test_inputs = {
+        'zh':
+        '段誉轻[MASK]折扇，摇了摇[MASK]，[MASK]道：“你师父是你的[MASK][MASK]，你'
+        '师父可不是[MASK]的师父。你师父差得动你，你师父可[MASK]不动我。',
+        'en':
+        'Everything in [MASK] you call reality is really [MASK] a reflection of your '
+        '[MASK]. Your [MASK] universe is just a mirror [MASK] of your story.'
+    }
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_ponet_model(self):
+        for language in ('zh', 'en'):
+            model_id = self.model_id_ponet[language]  # one hub model per language
+            fill_mask = pipeline(task=Tasks.fill_mask, model=model_id)
+            ori_text = self.ori_texts[language]
+            test_input = self.test_inputs[language]
+            print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
+                  f'{fill_mask(test_input)}\n')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_general_image_classification.py b/tests/pipelines/test_general_image_classification.py
index 8a814f4a..d5357f02 100644
--- a/tests/pipelines/test_general_image_classification.py
+++ b/tests/pipelines/test_general_image_classification.py
@@ -1,11 +1,19 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class GeneralImageClassificationTest(unittest.TestCase):
+class GeneralImageClassificationTest(unittest.TestCase,
+                                     DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_classification
+        self.model_id = 'damo/cv_vit-base_image-classification_Dailylife-labels'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_ImageNet(self):
@@ -29,6 +37,10 @@ class GeneralImageClassificationTest(unittest.TestCase):
         result = general_image_classification('data/test/images/bird.JPEG')
         print(result)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_general_recognition.py b/tests/pipelines/test_general_recognition.py
index 0b32e1f5..ba713bbe 100644
--- a/tests/pipelines/test_general_recognition.py
+++ b/tests/pipelines/test_general_recognition.py
@@ -1,11 +1,18 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class GeneralRecognitionTest(unittest.TestCase):
+class GeneralRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.general_recognition
+        self.model_id = 'damo/cv_resnest101_general_recognition'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run(self):
@@ -15,6 +22,10 @@ class GeneralRecognitionTest(unittest.TestCase):
         result = general_recognition('data/test/images/dogs.jpg')
         print(result)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_generative_multi_modal_embedding.py b/tests/pipelines/test_generative_multi_modal_embedding.py
index d8593abb..7061d736 100644
--- a/tests/pipelines/test_generative_multi_modal_embedding.py
+++ b/tests/pipelines/test_generative_multi_modal_embedding.py
@@ -1,15 +1,20 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 
 import unittest
 
 from modelscope.models import Model
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class GEMMMultiModalEmbeddingTest(unittest.TestCase):
-    model_id = 'damo/multi-modal_gemm-vit-large-patch14_generative-multi-modal-embedding'
+class GEMMMultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.generative_multi_modal_embedding
+        self.model_id = 'damo/multi-modal_gemm-vit-large-patch14_generative-multi-modal-embedding'
+
     test_input = {
         'image': 'data/test/images/generative_multimodal.jpg',
         'text':
@@ -63,6 +68,10 @@ class GEMMMultiModalEmbeddingTest(unittest.TestCase):
         output = generative_multi_modal_embedding_pipeline(test_input)
         print(output)
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_hand_2d_keypoints.py b/tests/pipelines/test_hand_2d_keypoints.py
new file mode 100644
index 00000000..86cd2d06
--- /dev/null
+++ b/tests/pipelines/test_hand_2d_keypoints.py
@@ -0,0 +1,45 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class Hand2DKeypointsPipelineTest(unittest.TestCase):
+    """Checks the output layout of the hand-2d-keypoints pipeline."""
+
+    def _check_pipeline(self, hand_keypoint):
+        """Run ``hand_keypoint`` on the test image and validate its output.
+
+        Shared by both tests, which differ only in how the pipeline is built.
+        """
+        outputs = hand_keypoint('data/test/images/hand_keypoints.jpg')
+        # Exactly one result entry is expected.
+        self.assertEqual(len(outputs), 1)
+
+        results = outputs[0]
+        self.assertIn(OutputKeys.KEYPOINTS, results.keys())
+        self.assertIn(OutputKeys.BOXES, results.keys())
+        # Keypoints must have size 21 on axis 1 and size 3 on axis 2;
+        # boxes must have size 4 on axis 1.
+        self.assertEqual(results[OutputKeys.KEYPOINTS].shape[1], 21)
+        self.assertEqual(results[OutputKeys.KEYPOINTS].shape[2], 3)
+        self.assertEqual(results[OutputKeys.BOXES].shape[1], 4)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_hand_2d_keypoints(self):
+        """Build the pipeline from an explicit model id."""
+        model_id = 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody'
+        self._check_pipeline(
+            pipeline(task=Tasks.hand_2d_keypoints, model=model_id))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_hand_2d_keypoints_with_default_model(self):
+        """Build the pipeline from the task alone, without a model id."""
+        self._check_pipeline(pipeline(task=Tasks.hand_2d_keypoints))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_hand_static.py b/tests/pipelines/test_hand_static.py
new file mode 100644
index 00000000..37181899
--- /dev/null
+++ b/tests/pipelines/test_hand_static.py
@@ -0,0 +1,32 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class HandStaticTest(unittest.TestCase):
+    """Runs the hand-static pipeline on one image and prints the result."""
+
+    def setUp(self) -> None:
+        self.model = 'damo/cv_mobileface_hand-static'
+        self.input = {'img_path': 'data/test/images/hand_static.jpg'}
+
+    def pipeline_inference(self, pipeline: Pipeline, input: dict):
+        print(pipeline(input))  # callers pass the dict built in setUp
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        hand_static = pipeline(Tasks.hand_static, model=self.model)
+        self.pipeline_inference(hand_static, self.input)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_modelhub_default_model(self):
+        hand_static = pipeline(Tasks.hand_static)  # no model id given
+        self.pipeline_inference(hand_static, self.input)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_hicossl_video_embedding.py b/tests/pipelines/test_hicossl_video_embedding.py
new file mode 100644
index 00000000..8a7de1fa
--- /dev/null
+++ b/tests/pipelines/test_hicossl_video_embedding.py
@@ -0,0 +1,32 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# !/usr/bin/env python
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class HICOSSLVideoEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck):
+    """Runs the video-embedding pipeline on a sample video and prints it."""
+
+    def setUp(self) -> None:
+        self.task = Tasks.video_embedding
+        self.model_id = 'damo/cv_s3dg_video-embedding'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        # Build the pipeline from the hub model, then embed the sample video.
+        video_path = 'data/test/videos/action_recognition_test_video.mp4'
+        embedder = pipeline(Tasks.video_embedding, model=self.model_id)
+        embedding = embedder(video_path)
+        print(f'video embedding output: {embedding}.')
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_image2image_generation.py b/tests/pipelines/test_image2image_generation.py
index 487fe4d0..116cef76 100644
--- a/tests/pipelines/test_image2image_generation.py
+++ b/tests/pipelines/test_image2image_generation.py
@@ -11,7 +11,7 @@ from modelscope.utils.test_utils import test_level
 
 class Image2ImageGenerationTest(unittest.TestCase):
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_modelhub(self):
         r"""We provide two generation modes, i.e., Similar Image Generation and Interpolation.
             You can pass the following parameters for different mode.
diff --git a/tests/pipelines/test_image2image_translation.py b/tests/pipelines/test_image2image_translation.py
index fd2f8063..a1cdb957 100644
--- a/tests/pipelines/test_image2image_translation.py
+++ b/tests/pipelines/test_image2image_translation.py
@@ -8,7 +8,7 @@ from modelscope.utils.test_utils import test_level
 
 class Image2ImageTranslationTest(unittest.TestCase):
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_modelhub(self):
         r"""We provide three translation modes, i.e., uncropping, colorization and combination.
             You can pass the following parameters for different mode.
diff --git a/tests/pipelines/test_image_color_enhance.py b/tests/pipelines/test_image_color_enhance.py
index c8ea5f9c..9b72999e 100644
--- a/tests/pipelines/test_image_color_enhance.py
+++ b/tests/pipelines/test_image_color_enhance.py
@@ -8,13 +8,15 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageColorEnhanceTest(unittest.TestCase):
+class ImageColorEnhanceTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/cv_csrnet_image-color-enhance-models'
+        self.task = Tasks.image_color_enhancement
 
     def pipeline_inference(self, pipeline: Pipeline, input_location: str):
         result = pipeline(input_location)
@@ -36,6 +38,10 @@ class ImageColorEnhanceTest(unittest.TestCase):
         self.pipeline_inference(img_color_enhance,
                                 'data/test/images/image_color_enhance.png')
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_colorization.py b/tests/pipelines/test_image_colorization.py
index 1a02cffb..547fce89 100644
--- a/tests/pipelines/test_image_colorization.py
+++ b/tests/pipelines/test_image_colorization.py
@@ -8,14 +8,16 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageColorizationTest(unittest.TestCase):
+class ImageColorizationTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/cv_unet_image-colorization'
         self.test_image = 'data/test/images/marilyn_monroe_4.jpg'
+        self.task = Tasks.image_colorization
 
     def pipeline_inference(self, pipeline: Pipeline, test_image: str):
         result = pipeline(test_image)
@@ -35,6 +37,10 @@ class ImageColorizationTest(unittest.TestCase):
         image_colorization = pipeline(Tasks.image_colorization)
         self.pipeline_inference(image_colorization, self.test_image)
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_denoise.py b/tests/pipelines/test_image_denoise.py
index d3e0af24..bf8cfd0f 100644
--- a/tests/pipelines/test_image_denoise.py
+++ b/tests/pipelines/test_image_denoise.py
@@ -10,11 +10,16 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.cv import ImageDenoisePipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageDenoiseTest(unittest.TestCase):
-    model_id = 'damo/cv_nafnet_image-denoise_sidd'
+class ImageDenoiseTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_denoising
+        self.model_id = 'damo/cv_nafnet_image-denoise_sidd'
+
     demo_image_path = 'data/test/images/noisy-demo-1.png'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -56,6 +61,10 @@ class ImageDenoiseTest(unittest.TestCase):
         w, h = denoise_img.size
         print('pipeline: the shape of output_img is {}x{}'.format(h, w))
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_instance_segmentation.py b/tests/pipelines/test_image_instance_segmentation.py
index cd08d669..2ba0724a 100644
--- a/tests/pipelines/test_image_instance_segmentation.py
+++ b/tests/pipelines/test_image_instance_segmentation.py
@@ -12,11 +12,16 @@ from modelscope.pipelines.cv import ImageInstanceSegmentationPipeline
 from modelscope.preprocessors import build_preprocessor
 from modelscope.utils.config import Config
 from modelscope.utils.constant import Fields, ModelFile, Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageInstanceSegmentationTest(unittest.TestCase):
-    model_id = 'damo/cv_swin-b_image-instance-segmentation_coco'
+class ImageInstanceSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_segmentation
+        self.model_id = 'damo/cv_swin-b_image-instance-segmentation_coco'
+
     image = 'data/test/images/image_instance_segmentation.jpg'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@@ -56,6 +61,10 @@ class ImageInstanceSegmentationTest(unittest.TestCase):
         print(f'pipeline1:{pipeline1(input=self.image)[OutputKeys.LABELS]}')
         print(f'pipeline2: {pipeline2(input=self.image)[OutputKeys.LABELS]}')
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_matting.py b/tests/pipelines/test_image_matting.py
index 83b7fee2..a3edb705 100644
--- a/tests/pipelines/test_image_matting.py
+++ b/tests/pipelines/test_image_matting.py
@@ -1,19 +1,18 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
-import tempfile
 import unittest
 
 import cv2
 
-from modelscope.fileio import File
 from modelscope.msdatasets import MsDataset
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageMattingTest(unittest.TestCase):
+class ImageMattingTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/cv_unet_image-matting'
@@ -62,6 +61,10 @@ class ImageMattingTest(unittest.TestCase):
             f'Output written to dir: {osp.dirname(osp.abspath("result_0.png"))}'
         )
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_panoptic_segmentation.py b/tests/pipelines/test_image_panoptic_segmentation.py
new file mode 100644
index 00000000..4f12e6af
--- /dev/null
+++ b/tests/pipelines/test_image_panoptic_segmentation.py
@@ -0,0 +1,49 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+
+import cv2
+import PIL.Image  # a bare `import PIL` does not load the Image submodule
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import panoptic_seg_masks_to_image
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class ImagePanopticSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
+    """Runs panoptic segmentation on a file path and on a PIL image."""
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_segmentation
+        self.model_id = 'damo/cv_swinL_panoptic-segmentation_cocopan'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_image_panoptic_segmentation(self):
+        input_location = 'data/test/images/image_panoptic_segmentation.jpg'
+        pan_segmentor = pipeline(Tasks.image_segmentation, model=self.model_id)
+        result = pan_segmentor(input_location)  # input given as a path
+        draw_img = panoptic_seg_masks_to_image(result[OutputKeys.MASKS])
+        cv2.imwrite('result.jpg', draw_img)  # written to the working directory
+        print('print test_image_panoptic_segmentation return success')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_image_panoptic_segmentation_from_PIL(self):
+        input_location = 'data/test/images/image_panoptic_segmentation.jpg'
+        pan_segmentor = pipeline(Tasks.image_segmentation, model=self.model_id)
+        PIL_array = PIL.Image.open(input_location)
+        result = pan_segmentor(PIL_array)  # input given as an opened PIL image
+
+        draw_img = panoptic_seg_masks_to_image(result[OutputKeys.MASKS])
+        cv2.imwrite('result.jpg', draw_img)
+        print('print test_image_panoptic_segmentation from PIL return success')
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_image_portrait_enhancement.py b/tests/pipelines/test_image_portrait_enhancement.py
index 834fcfdb..1ca97253 100644
--- a/tests/pipelines/test_image_portrait_enhancement.py
+++ b/tests/pipelines/test_image_portrait_enhancement.py
@@ -9,12 +9,14 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImagePortraitEnhancementTest(unittest.TestCase):
+class ImagePortraitEnhancementTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.image_portrait_enhancement
         self.model_id = 'damo/cv_gpen_image-portrait-enhancement'
         self.test_image = 'data/test/images/Solvay_conference_1927.png'
 
@@ -37,6 +39,10 @@ class ImagePortraitEnhancementTest(unittest.TestCase):
         face_enhancement = pipeline(Tasks.image_portrait_enhancement)
         self.pipeline_inference(face_enhancement, self.test_image)
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_reid_person.py b/tests/pipelines/test_image_reid_person.py
index c3e8d487..310cdd66 100644
--- a/tests/pipelines/test_image_reid_person.py
+++ b/tests/pipelines/test_image_reid_person.py
@@ -6,14 +6,16 @@ from PIL import Image
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageReidPersonTest(unittest.TestCase):
+class ImageReidPersonTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.input_location = 'data/test/images/image_reid_person.jpg'
         self.model_id = 'damo/cv_passvitb_image-reid-person_market'
+        self.task = Tasks.image_reid_person
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_image_reid_person(self):
@@ -48,6 +50,10 @@ class ImageReidPersonTest(unittest.TestCase):
         )
         print(f'The img embedding is: {result[OutputKeys.IMG_EMBEDDING]}')
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_semantic_segmentation.py b/tests/pipelines/test_image_semantic_segmentation.py
new file mode 100644
index 00000000..286d317a
--- /dev/null
+++ b/tests/pipelines/test_image_semantic_segmentation.py
@@ -0,0 +1,62 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+
+import cv2
+import PIL.Image  # a bare `import PIL` does not load the Image submodule
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import semantic_seg_masks_to_image
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class ImageSemanticSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
+    """Runs semantic segmentation on a file path and on a PIL image."""
+
+    def setUp(self) -> None:
+        self.task = 'image-segmentation'
+        self.model_id = 'damo/cv_swinL_semantic-segmentation_cocopanmerge'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_image_semantic_segmentation_panmerge(self):
+        input_location = 'data/test/images/image_semantic_segmentation.jpg'
+        segmenter = pipeline(Tasks.image_segmentation, model=self.model_id)
+        result = segmenter(input_location)
+        draw_img = semantic_seg_masks_to_image(result[OutputKeys.MASKS])
+        cv2.imwrite('result.jpg', draw_img)
+        print('test_image_semantic_segmentation_panmerge DONE')
+
+        # Second pass: same image, passed as an opened PIL image.
+        PIL_array = PIL.Image.open(input_location)
+        result = segmenter(PIL_array)
+        draw_img = semantic_seg_masks_to_image(result[OutputKeys.MASKS])
+        cv2.imwrite('result.jpg', draw_img)
+        print('test_image_semantic_segmentation_panmerge_from_PIL DONE')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_image_semantic_segmentation_vitadapter(self):
+        # NOTE(review): this uses the same self.model_id (cocopanmerge) as the
+        # panmerge test; confirm whether a vitadapter model id was intended.
+        input_location = 'data/test/images/image_semantic_segmentation.jpg'
+        segmenter = pipeline(Tasks.image_segmentation, model=self.model_id)
+        result = segmenter(input_location)
+        draw_img = semantic_seg_masks_to_image(result[OutputKeys.MASKS])
+        cv2.imwrite('result.jpg', draw_img)
+        print('test_image_semantic_segmentation_vitadapter DONE')
+
+        PIL_array = PIL.Image.open(input_location)
+        result = segmenter(PIL_array)
+        draw_img = semantic_seg_masks_to_image(result[OutputKeys.MASKS])
+        cv2.imwrite('result.jpg', draw_img)
+        print('test_image_semantic_segmentation_vitadapter_from_PIL DONE')
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_image_style_transfer.py b/tests/pipelines/test_image_style_transfer.py
index 4e5bb69b..a02d5308 100644
--- a/tests/pipelines/test_image_style_transfer.py
+++ b/tests/pipelines/test_image_style_transfer.py
@@ -7,12 +7,14 @@ from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageStyleTransferTest(unittest.TestCase):
+class ImageStyleTransferTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.image_style_transfer
         self.model_id = 'damo/cv_aams_style-transfer_damo'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -48,6 +50,10 @@ class ImageStyleTransferTest(unittest.TestCase):
         cv2.imwrite('result_styletransfer3.png', result[OutputKeys.OUTPUT_IMG])
         print('style_transfer.test_run_modelhub_default_model done')
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_image_super_resolution.py b/tests/pipelines/test_image_super_resolution.py
index 8cf9e46f..d5cbebe8 100644
--- a/tests/pipelines/test_image_super_resolution.py
+++ b/tests/pipelines/test_image_super_resolution.py
@@ -8,14 +8,16 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageSuperResolutionTest(unittest.TestCase):
+class ImageSuperResolutionTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/cv_rrdb_image-super-resolution'
         self.img = 'data/test/images/dogs.jpg'
+        self.task = Tasks.image_super_resolution
 
     def pipeline_inference(self, pipeline: Pipeline, img: str):
         result = pipeline(img)
@@ -35,6 +37,10 @@ class ImageSuperResolutionTest(unittest.TestCase):
         super_resolution = pipeline(Tasks.image_super_resolution)
         self.pipeline_inference(super_resolution, self.img)
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_key_word_spotting.py b/tests/pipelines/test_key_word_spotting.py
index 32a853af..91f9f566 100644
--- a/tests/pipelines/test_key_word_spotting.py
+++ b/tests/pipelines/test_key_word_spotting.py
@@ -10,6 +10,7 @@ import soundfile
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import ColorCodes, Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.logger import get_logger
 from modelscope.utils.test_utils import download_and_untar, test_level
 
@@ -17,6 +18,7 @@ logger = get_logger()
 
 POS_WAV_FILE = 'data/test/audios/kws_xiaoyunxiaoyun.wav'
 BOFANGYINYUE_WAV_FILE = 'data/test/audios/kws_bofangyinyue.wav'
+URL_FILE = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/20200707_xiaoyun.wav'
 
 POS_TESTSETS_FILE = 'pos_testsets.tar.gz'
 POS_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testsets.tar.gz'
@@ -25,7 +27,7 @@ NEG_TESTSETS_FILE = 'neg_testsets.tar.gz'
 NEG_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/neg_testsets.tar.gz'
 
 
-class KeyWordSpottingTest(unittest.TestCase):
+class KeyWordSpottingTest(unittest.TestCase, DemoCompatibilityCheck):
     action_info = {
         'test_run_with_wav': {
             'checking_item': [OutputKeys.KWS_LIST, 0, 'keyword'],
@@ -75,6 +77,22 @@ class KeyWordSpottingTest(unittest.TestCase):
                 }]
             }
         },
+        'test_run_with_url': {
+            'checking_item': [OutputKeys.KWS_LIST, 0, 'keyword'],
+            'checking_value': '小云小云',
+            'example': {
+                'wav_count':
+                1,
+                'kws_type':
+                'pcm',
+                'kws_list': [{
+                    'keyword': '小云小云',
+                    'offset': 0.69,
+                    'length': 1.67,
+                    'confidence': 0.996023
+                }]
+            }
+        },
         'test_run_with_pos_testsets': {
             'checking_item': ['recall'],
             'example': {
@@ -236,6 +254,12 @@ class KeyWordSpottingTest(unittest.TestCase):
         self.check_result('test_run_with_wav_by_customized_keywords',
                           kws_result)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_url(self):
+        kws_result = self.run_pipeline(
+            model_id=self.model_id, audio_in=URL_FILE)
+        self.check_result('test_run_with_url', kws_result)
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_pos_testsets(self):
         wav_file_path = download_and_untar(
@@ -272,6 +296,10 @@ class KeyWordSpottingTest(unittest.TestCase):
             model_id=self.model_id, audio_in=audio_list)
         self.check_result('test_run_with_roc', kws_result)
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_key_word_spotting_farfield.py b/tests/pipelines/test_key_word_spotting_farfield.py
index 4a732950..f8c167de 100644
--- a/tests/pipelines/test_key_word_spotting_farfield.py
+++ b/tests/pipelines/test_key_word_spotting_farfield.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os.path
 import unittest
 
@@ -6,6 +8,10 @@ from modelscope.utils.constant import Tasks
 from modelscope.utils.test_utils import test_level
 
 TEST_SPEECH_FILE = 'data/test/audios/3ch_nihaomiya.wav'
+TEST_SPEECH_FILE_MONO = 'data/test/audios/1ch_nihaomiya.wav'
+TEST_SPEECH_URL = 'https://modelscope.cn/api/v1/models/damo/' \
+                  'speech_dfsmn_kws_char_farfield_16k_nihaomiya/repo' \
+                  '?Revision=master&FilePath=examples/3ch_nihaomiya.wav'
 
 
 class KWSFarfieldTest(unittest.TestCase):
@@ -13,7 +19,7 @@ class KWSFarfieldTest(unittest.TestCase):
     def setUp(self) -> None:
         self.model_id = 'damo/speech_dfsmn_kws_char_farfield_16k_nihaomiya'
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_normal(self):
         kws = pipeline(Tasks.keyword_spotting, model=self.model_id)
         inputs = {'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE)}
@@ -21,6 +27,23 @@ class KWSFarfieldTest(unittest.TestCase):
         self.assertEqual(len(result['kws_list']), 5)
         print(result['kws_list'][-1])
 
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_mono(self):
+        kws = pipeline(Tasks.keyword_spotting, model=self.model_id)
+        inputs = {
+            'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE_MONO)
+        }
+        result = kws(inputs)
+        self.assertEqual(len(result['kws_list']), 5)
+        print(result['kws_list'][-1])
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_url(self):
+        kws = pipeline(Tasks.keyword_spotting, model=self.model_id)
+        result = kws(TEST_SPEECH_URL)
+        self.assertEqual(len(result['kws_list']), 5)
+        print(result['kws_list'][-1])
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_output(self):
         kws = pipeline(Tasks.keyword_spotting, model=self.model_id)
diff --git a/tests/pipelines/test_live_category.py b/tests/pipelines/test_live_category.py
index dead376d..391ed283 100644
--- a/tests/pipelines/test_live_category.py
+++ b/tests/pipelines/test_live_category.py
@@ -3,20 +3,28 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class LiveCategoryTest(unittest.TestCase):
+class LiveCategoryTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.live_category
+        self.model_id = 'damo/cv_resnet50_live-category'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub(self):
-        category_pipeline = pipeline(
-            Tasks.live_category, model='damo/cv_resnet50_live-category')
+        category_pipeline = pipeline(Tasks.live_category, self.model_id)
         result = category_pipeline(
             'data/test/videos/live_category_test_video.mp4')
 
         print(f'live category output: {result}.')
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_mog_face_detection.py b/tests/pipelines/test_mog_face_detection.py
new file mode 100644
index 00000000..5c6d97c2
--- /dev/null
+++ b/tests/pipelines/test_mog_face_detection.py
@@ -0,0 +1,34 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+import cv2
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import draw_face_detection_no_lm_result
+from modelscope.utils.test_utils import test_level
+
+
+class MogFaceDetectionTest(unittest.TestCase):
+    """Smoke test for the MogFace face-detection pipeline."""
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_resnet101_face-detection_cvpr22papermogface'
+
+    def show_result(self, img_path, detection_result):
+        """Write the rendered detections to result.png and print its path."""
+        canvas = draw_face_detection_no_lm_result(img_path, detection_result)
+        cv2.imwrite('result.png', canvas)
+        print(f'output written to {osp.abspath("result.png")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        """Run detection on a local test image and save the drawing."""
+        detector = pipeline(Tasks.face_detection, model=self.model_id)
+        img_path = 'data/test/images/mog_face_detection.jpg'
+        self.show_result(img_path, detector(img_path))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_movie_scene_segmentation.py b/tests/pipelines/test_movie_scene_segmentation.py
new file mode 100644
index 00000000..affd5140
--- /dev/null
+++ b/tests/pipelines/test_movie_scene_segmentation.py
@@ -0,0 +1,47 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class MovieSceneSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
+    """Pipeline tests for the movie scene segmentation task."""
+
+    def setUp(self) -> None:
+        self.task = Tasks.movie_scene_segmentation
+        self.model_id = 'damo/cv_resnet50-bert_video-scene-segmentation_movienet'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_movie_scene_segmentation(self):
+        """Run the pipeline built from ``self.model_id`` on the test video."""
+        video_path = 'data/test/videos/movie_scene_segmentation_test_video.mp4'
+        segmenter = pipeline(
+            Tasks.movie_scene_segmentation, model=self.model_id)
+        result = segmenter(video_path)
+        # A falsy result is treated as a processing failure.
+        if not result:
+            raise ValueError('process error')
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_movie_scene_segmentation_with_default_task(self):
+        """Run the pipeline created from the task alone (no model given)."""
+        video_path = 'data/test/videos/movie_scene_segmentation_test_video.mp4'
+        segmenter = pipeline(Tasks.movie_scene_segmentation)
+        result = segmenter(video_path)
+        # A falsy result is treated as a processing failure.
+        if not result:
+            raise ValueError('process error')
+        print(result)
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        """Call ``compatibility_check``; always skipped by the decorator."""
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_mplug_tasks.py b/tests/pipelines/test_mplug_tasks.py
index 4b8a813a..a3ace62d 100644
--- a/tests/pipelines/test_mplug_tasks.py
+++ b/tests/pipelines/test_mplug_tasks.py
@@ -7,10 +7,15 @@ from modelscope.models import Model
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class MplugTasksTest(unittest.TestCase):
+class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = 'visual-question-answering'
+        self.model_id = 'damo/mplug_visual-question-answering_coco_large_en'
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_image_captioning_with_model(self):
@@ -39,8 +44,8 @@ class MplugTasksTest(unittest.TestCase):
             'damo/mplug_visual-question-answering_coco_large_en')
         pipeline_vqa = pipeline(Tasks.visual_question_answering, model=model)
         image = Image.open('data/test/images/image_mplug_vqa.jpg')
-        question = 'What is the woman doing?'
-        input = {'image': image, 'question': question}
+        text = 'What is the woman doing?'
+        input = {'image': image, 'text': text}
         result = pipeline_vqa(input)
         print(result)
 
@@ -49,11 +54,36 @@ class MplugTasksTest(unittest.TestCase):
         model = 'damo/mplug_visual-question-answering_coco_large_en'
         pipeline_vqa = pipeline(Tasks.visual_question_answering, model=model)
         image = Image.open('data/test/images/image_mplug_vqa.jpg')
-        question = 'What is the woman doing?'
-        input = {'image': image, 'question': question}
+        text = 'What is the woman doing?'
+        input = {'image': image, 'text': text}
         result = pipeline_vqa(input)
         print(result)
 
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_image_text_retrieval_with_model(self):
+        model = Model.from_pretrained(
+            'damo/mplug_image-text-retrieval_flickr30k_large_en')
+        pipeline_retrieval = pipeline(Tasks.image_text_retrieval, model=model)
+        image = Image.open('data/test/images/image-text-retrieval.jpg')
+        text = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.'
+        input = {'image': image, 'text': text}
+        result = pipeline_retrieval(input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_image_text_retrieval_with_name(self):
+        model = 'damo/mplug_image-text-retrieval_flickr30k_large_en'
+        pipeline_retrieval = pipeline(Tasks.image_text_retrieval, model=model)
+        image = Image.open('data/test/images/image-text-retrieval.jpg')
+        text = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.'
+        input = {'image': image, 'text': text}
+        result = pipeline_retrieval(input)
+        print(result)
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_mtcnn_face_detection.py b/tests/pipelines/test_mtcnn_face_detection.py
new file mode 100644
index 00000000..5afb5588
--- /dev/null
+++ b/tests/pipelines/test_mtcnn_face_detection.py
@@ -0,0 +1,38 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+import cv2
+from PIL import Image
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import draw_face_detection_result
+from modelscope.utils.test_utils import test_level
+
+
+class MtcnnFaceDetectionTest(unittest.TestCase):
+    """Smoke test for the MTCNN face-detection pipeline."""
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_manual_face-detection_mtcnn'
+
+    def show_result(self, img_path, detection_result):
+        """Write the rendered detections to result.png and print its path."""
+        canvas = draw_face_detection_result(img_path, detection_result)
+        cv2.imwrite('result.png', canvas)
+        print(f'output written to {osp.abspath("result.png")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        """Run detection on one image given as a path and as a PIL image."""
+        face_detection = pipeline(Tasks.face_detection, model=self.model_id)
+        img_path = 'data/test/images/mtcnn_face_detection.jpg'
+
+        # Same picture in two input forms: a file path, then a PIL image.
+        for image_input in (img_path, Image.open(img_path)):
+            self.show_result(img_path, face_detection(image_input))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_multi_modal_embedding.py b/tests/pipelines/test_multi_modal_embedding.py
index 6152f279..23954c27 100644
--- a/tests/pipelines/test_multi_modal_embedding.py
+++ b/tests/pipelines/test_multi_modal_embedding.py
@@ -8,11 +8,16 @@ from modelscope.models import Model
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class MultiModalEmbeddingTest(unittest.TestCase):
-    model_id = 'damo/multi-modal_clip-vit-base-patch16_zh'
+class MultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.multi_modal_embedding
+        self.model_id = 'damo/multi-modal_clip-vit-base-patch16_zh'
+
     test_input = {'text': '皮卡丘'}
     model_version = 'dev'
 
@@ -31,11 +36,10 @@ class MultiModalEmbeddingTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
-        model = Model.from_pretrained(self.model_id)
+        model = Model.from_pretrained(
+            self.model_id, revision=self.model_version)
         pipeline_multi_modal_embedding = pipeline(
-            task=Tasks.multi_modal_embedding,
-            model=model,
-            model_revision=self.model_version)
+            task=Tasks.multi_modal_embedding, model=model)
         text_embedding = pipeline_multi_modal_embedding(
             self.test_input)[OutputKeys.TEXT_EMBEDDING]
         print('l1-norm: {}'.format(
@@ -55,6 +59,10 @@ class MultiModalEmbeddingTest(unittest.TestCase):
         print('l2-norm: {}'.format(torch.norm(text_embedding,
                                               dim=-1).item()))  # should be 1.0
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_multi_modal_similarity.py b/tests/pipelines/test_multi_modal_similarity.py
new file mode 100644
index 00000000..a54fbcf0
--- /dev/null
+++ b/tests/pipelines/test_multi_modal_similarity.py
@@ -0,0 +1,48 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+import unittest
+
+from modelscope.models import Model
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class MultiModalSimilarityTest(unittest.TestCase):
+    """Pipeline tests for the multi-modal similarity task."""
+
+    model_id = 'damo/multi-modal_team-vit-large-patch14_multi-modal-similarity'
+    test_img = 'data/test/images/multimodal_similarity.jpg'
+    test_str1 = '一个上了年纪的女人在城镇中骑着自行车一个黄色出租车正要从她身边驶过'
+    test_str2 = '穿着蓝色连衣裙的那个女人正冲着行来的车辆伸出她的手'
+
+    def infer_pipeline(self, multi_modal_similarity_pipeline):
+        """Run both image/text pairs through the pipeline, print scores."""
+        texts = (self.test_str1, self.test_str2)
+        inputs = [{'img': self.test_img, 'text': text} for text in texts]
+        outputs = [multi_modal_similarity_pipeline(item) for item in inputs]
+        for text, output in zip(texts, outputs):
+            print('image: {}, text: {}, similarity: {}'.format(
+                self.test_img, text, output['scores']))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run(self):
+        """Build the pipeline from the model id string."""
+        self.infer_pipeline(
+            pipeline(Tasks.multi_modal_similarity, model=self.model_id))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        """Build the pipeline from the task alone (no model given)."""
+        self.infer_pipeline(pipeline(task=Tasks.multi_modal_similarity))
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        """Build the pipeline from a ``Model`` loaded via from_pretrained."""
+        model = Model.from_pretrained(self.model_id)
+        self.infer_pipeline(
+            pipeline(task=Tasks.multi_modal_similarity, model=model))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_multi_stage_diffusion.py b/tests/pipelines/test_multi_stage_diffusion.py
new file mode 100644
index 00000000..f4e63ce0
--- /dev/null
+++ b/tests/pipelines/test_multi_stage_diffusion.py
@@ -0,0 +1,45 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+
+import numpy as np
+import torch
+
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class MultiStageDiffusionTest(unittest.TestCase):
+    """Text-to-image synthesis pipeline tests (both currently skipped)."""
+    model_id = 'damo/cv_diffusion_text-to-image-synthesis'
+    test_text = {'text': 'Photograph of a baby chicken wearing sunglasses'}
+
+    @unittest.skip(
+        'skip test since the pretrained model is not publicly available')
+    def test_run_with_model_from_modelhub(self):
+        """Build the pipeline from a ``Model`` loaded via from_pretrained."""
+        synthesizer = pipeline(
+            task=Tasks.text_to_image_synthesis,
+            model=Model.from_pretrained(self.model_id))
+        output = synthesizer(self.test_text)
+        img = output[OutputKeys.OUTPUT_IMG]
+        # Print the sum of absolute values of the returned image data.
+        print(np.sum(np.abs(img)))
+
+    @unittest.skip(
+        'skip test since the pretrained model is not publicly available')
+    def test_run_with_model_name(self):
+        """Build the pipeline directly from the model id string."""
+        synthesizer = pipeline(
+            task=Tasks.text_to_image_synthesis, model=self.model_id)
+        output = synthesizer(self.test_text)
+        img = output[OutputKeys.OUTPUT_IMG]
+        # Print the sum of absolute values of the returned image data.
+        print(np.sum(np.abs(img)))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py
index 5ba93f49..3658cf3f 100644
--- a/tests/pipelines/test_named_entity_recognition.py
+++ b/tests/pipelines/test_named_entity_recognition.py
@@ -3,22 +3,30 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp import TransformerCRFForNamedEntityRecognition
+from modelscope.models.nlp import (LSTMCRFForNamedEntityRecognition,
+                                   TransformerCRFForNamedEntityRecognition)
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline
-from modelscope.preprocessors import NERPreprocessor
+from modelscope.preprocessors import TokenClassificationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class NamedEntityRecognitionTest(unittest.TestCase):
-    model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
+class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.named_entity_recognition
+        self.model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
+
+    tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
+    lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news'
     sentence = '这与温岭市新河镇的一个神秘的传说有关。'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run_by_direct_model_download(self):
-        cache_path = snapshot_download(self.model_id)
-        tokenizer = NERPreprocessor(cache_path)
+    def test_run_tcrf_by_direct_model_download(self):
+        cache_path = snapshot_download(self.tcrf_model_id)
+        tokenizer = TokenClassificationPreprocessor(cache_path)
         model = TransformerCRFForNamedEntityRecognition(
             cache_path, tokenizer=tokenizer)
         pipeline1 = NamedEntityRecognitionPipeline(
@@ -32,10 +40,37 @@ class NamedEntityRecognitionTest(unittest.TestCase):
         print()
         print(f'pipeline2: {pipeline2(input=self.sentence)}')
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_lcrf_by_direct_model_download(self):
+        cache_path = snapshot_download(self.lcrf_model_id)
+        tokenizer = TokenClassificationPreprocessor(cache_path)
+        model = LSTMCRFForNamedEntityRecognition(
+            cache_path, tokenizer=tokenizer)
+        pipeline1 = NamedEntityRecognitionPipeline(
+            model, preprocessor=tokenizer)
+        pipeline2 = pipeline(
+            Tasks.named_entity_recognition,
+            model=model,
+            preprocessor=tokenizer)
+        print(f'sentence: {self.sentence}\n'
+              f'pipeline1:{pipeline1(input=self.sentence)}')
+        print()
+        print(f'pipeline2: {pipeline2(input=self.sentence)}')
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_run_with_model_from_modelhub(self):
-        model = Model.from_pretrained(self.model_id)
-        tokenizer = NERPreprocessor(model.model_dir)
+    def test_run_tcrf_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.tcrf_model_id)
+        tokenizer = TokenClassificationPreprocessor(model.model_dir)
+        pipeline_ins = pipeline(
+            task=Tasks.named_entity_recognition,
+            model=model,
+            preprocessor=tokenizer)
+        print(pipeline_ins(input=self.sentence))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_lcrf_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.lcrf_model_id)
+        tokenizer = TokenClassificationPreprocessor(model.model_dir)
         pipeline_ins = pipeline(
             task=Tasks.named_entity_recognition,
             model=model,
@@ -43,9 +78,15 @@ class NamedEntityRecognitionTest(unittest.TestCase):
         print(pipeline_ins(input=self.sentence))
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
-    def test_run_with_model_name(self):
+    def test_run_tcrf_with_model_name(self):
         pipeline_ins = pipeline(
-            task=Tasks.named_entity_recognition, model=self.model_id)
+            task=Tasks.named_entity_recognition, model=self.tcrf_model_id)
+        print(pipeline_ins(input=self.sentence))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_lcrf_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.named_entity_recognition, model=self.lcrf_model_id)
         print(pipeline_ins(input=self.sentence))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -53,6 +94,10 @@ class NamedEntityRecognitionTest(unittest.TestCase):
         pipeline_ins = pipeline(task=Tasks.named_entity_recognition)
         print(pipeline_ins(input=self.sentence))
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py
index 1e259a2e..db4b9912 100644
--- a/tests/pipelines/test_nli.py
+++ b/tests/pipelines/test_nli.py
@@ -5,28 +5,34 @@ from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
 from modelscope.models.nlp import SbertForSequenceClassification
 from modelscope.pipelines import pipeline
-from modelscope.pipelines.nlp import PairSentenceClassificationPipeline
-from modelscope.preprocessors import PairSentenceClassificationPreprocessor
+from modelscope.pipelines.nlp import SequenceClassificationPipeline
+from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.regress_test_utils import MsRegressTool
 from modelscope.utils.test_utils import test_level
 
 
-class NLITest(unittest.TestCase):
-    model_id = 'damo/nlp_structbert_nli_chinese-base'
+class NLITest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.nli
+        self.model_id = 'damo/nlp_structbert_nli_chinese-base'
+
     sentence1 = '四川商务职业学院和四川财经职业学院哪个好？'
     sentence2 = '四川商务职业学院商务管理在哪个校区？'
+    regress_tool = MsRegressTool(baseline=False)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_direct_file_download(self):
         cache_path = snapshot_download(self.model_id)
-        tokenizer = PairSentenceClassificationPreprocessor(cache_path)
+        tokenizer = SequenceClassificationPreprocessor(cache_path)
         model = SbertForSequenceClassification.from_pretrained(cache_path)
-        pipeline1 = PairSentenceClassificationPipeline(
+        pipeline1 = SequenceClassificationPipeline(
             model, preprocessor=tokenizer)
         pipeline2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer)
         print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n'
               f'pipeline1:{pipeline1(input=(self.sentence1, self.sentence2))}')
-        print()
         print(
             f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n'
-            f'pipeline1: {pipeline2(input=(self.sentence1, self.sentence2))}')
+            f'pipeline2: {pipeline2(input=(self.sentence1, self.sentence2))}')
@@ -34,7 +40,7 @@ class NLITest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
-        tokenizer = PairSentenceClassificationPreprocessor(model.model_dir)
+        tokenizer = SequenceClassificationPreprocessor(model.model_dir)
         pipeline_ins = pipeline(
             task=Tasks.nli, model=model, preprocessor=tokenizer)
         print(pipeline_ins(input=(self.sentence1, self.sentence2)))
@@ -42,13 +48,19 @@ class NLITest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
         pipeline_ins = pipeline(task=Tasks.nli, model=self.model_id)
-        print(pipeline_ins(input=(self.sentence1, self.sentence2)))
+        with self.regress_tool.monitor_module_single_forward(
+                pipeline_ins.model, 'sbert_nli'):
+            print(pipeline_ins(input=(self.sentence1, self.sentence2)))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.nli)
         print(pipeline_ins(input=(self.sentence1, self.sentence2)))
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_object_detection.py b/tests/pipelines/test_object_detection.py
index de16aaa1..2a74eb41 100644
--- a/tests/pipelines/test_object_detection.py
+++ b/tests/pipelines/test_object_detection.py
@@ -3,10 +3,15 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ObjectDetectionTest(unittest.TestCase):
+class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.human_detection
+        self.model_id = 'damo/cv_resnet18_human-detection'
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_object_detection(self):
@@ -50,6 +55,10 @@ class ObjectDetectionTest(unittest.TestCase):
         else:
             raise ValueError('process error')
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_ocr_detection.py b/tests/pipelines/test_ocr_detection.py
index a4201512..e0591496 100644
--- a/tests/pipelines/test_ocr_detection.py
+++ b/tests/pipelines/test_ocr_detection.py
@@ -4,14 +4,16 @@ import unittest
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class OCRDetectionTest(unittest.TestCase):
+class OCRDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/cv_resnet18_ocr-detection-line-level_damo'
         self.test_image = 'data/test/images/ocr_detection.jpg'
+        self.task = Tasks.ocr_detection
 
     def pipeline_inference(self, pipeline: Pipeline, input_location: str):
         result = pipeline(input_location)
@@ -28,6 +30,10 @@ class OCRDetectionTest(unittest.TestCase):
         ocr_detection = pipeline(Tasks.ocr_detection)
         self.pipeline_inference(ocr_detection, self.test_image)
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_ocr_recognition.py b/tests/pipelines/test_ocr_recognition.py
index a2e5ba8e..8d48dd7a 100644
--- a/tests/pipelines/test_ocr_recognition.py
+++ b/tests/pipelines/test_ocr_recognition.py
@@ -1,26 +1,21 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os.path as osp
-import shutil
-import sys
-import tempfile
 import unittest
-from typing import Any, Dict, List, Tuple, Union
 
-import cv2
-import numpy as np
 import PIL
 
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class OCRRecognitionTest(unittest.TestCase):
+class OCRRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/cv_convnextTiny_ocr-recognition-general_damo'
         self.test_image = 'data/test/images/ocr_recognition.jpg'
+        self.task = Tasks.ocr_recognition
 
     def pipeline_inference(self, pipeline: Pipeline, input_location: str):
         result = pipeline(input_location)
@@ -42,6 +37,10 @@ class OCRRecognitionTest(unittest.TestCase):
         ocr_recognition = pipeline(Tasks.ocr_recognition)
         self.pipeline_inference(ocr_recognition, self.test_image)
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py
index 69bccac1..e6638dfa 100644
--- a/tests/pipelines/test_ofa_tasks.py
+++ b/tests/pipelines/test_ofa_tasks.py
@@ -11,10 +11,11 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.cv.image_utils import created_boxed_image
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class OfaTasksTest(unittest.TestCase):
+class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.output_dir = 'unittest_output'
@@ -146,8 +147,10 @@ class OfaTasksTest(unittest.TestCase):
         result = ofa_pipe(input)
         print(result)
         image_name = image.split('/')[-2]
-        self.save_img(image, result[OutputKeys.BOXES],
-                      osp.join('large_en_model_' + image_name + '.png'))
+        self.save_img(
+            image,
+            result[OutputKeys.BOXES][0],  # just one box
+            osp.join('large_en_model_' + image_name + '.png'))
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_visual_grounding_with_name(self):
@@ -160,7 +163,7 @@ class OfaTasksTest(unittest.TestCase):
         result = ofa_pipe(input)
         print(result)
         image_name = image.split('/')[-2]
-        self.save_img(image, result[OutputKeys.BOXES],
+        self.save_img(image, result[OutputKeys.BOXES][0],
                       osp.join('large_en_name_' + image_name + '.png'))
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@@ -173,7 +176,7 @@ class OfaTasksTest(unittest.TestCase):
         result = ofa_pipe(input)
         print(result)
         image_name = image.split('/')[-1]
-        self.save_img(image, result[OutputKeys.BOXES],
+        self.save_img(image, result[OutputKeys.BOXES][0],
                       osp.join('large_zh_name_' + image_name))
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
@@ -251,6 +254,10 @@ class OfaTasksTest(unittest.TestCase):
         result[OutputKeys.OUTPUT_IMG].save('result.png')
         print(f'Output written to {osp.abspath("result.png")}')
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_part_of_speech.py b/tests/pipelines/test_part_of_speech.py
new file mode 100644
index 00000000..25f4491c
--- /dev/null
+++ b/tests/pipelines/test_part_of_speech.py
@@ -0,0 +1,55 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import shutil
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.nlp import TokenClassificationModel
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import TokenClassificationPipeline
+from modelscope.preprocessors import TokenClassificationPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class PartOfSpeechTest(unittest.TestCase):
+    model_id = 'damo/nlp_structbert_part-of-speech_chinese-base'
+    sentence = '今天天气不错，适合出去游玩'
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        tokenizer = TokenClassificationPreprocessor(cache_path)
+        model = TokenClassificationModel.from_pretrained(cache_path)
+        pipeline1 = TokenClassificationPipeline(model, preprocessor=tokenizer)
+        pipeline2 = pipeline(
+            Tasks.token_classification, model=model, preprocessor=tokenizer)
+        print(f'sentence: {self.sentence}\n'
+              f'pipeline1:{pipeline1(input=self.sentence)}')
+        print()
+        print(f'pipeline2: {pipeline2(input=self.sentence)}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        tokenizer = TokenClassificationPreprocessor(model.model_dir)
+        pipeline_ins = pipeline(
+            task=Tasks.token_classification,
+            model=model,
+            preprocessor=tokenizer)
+        print(pipeline_ins(input=self.sentence))
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.token_classification, model=self.model_id)
+        print(pipeline_ins(input=self.sentence))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.token_classification)
+        print(pipeline_ins(input=self.sentence))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_passage_ranking.py b/tests/pipelines/test_passage_ranking.py
new file mode 100644
index 00000000..5faa365e
--- /dev/null
+++ b/tests/pipelines/test_passage_ranking.py
@@ -0,0 +1,61 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import shutil
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.nlp import PassageRanking
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import PassageRankingPipeline
+from modelscope.preprocessors import PassageRankingPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class PassageRankingTest(unittest.TestCase):
+    model_id = 'damo/nlp_corom_passage-ranking_english-base'
+    inputs = {
+        'source_sentence': ["how long it take to get a master's degree"],
+        'sentences_to_compare': [
+            "On average, students take about 18 to 24 months to complete a master's degree.",
+            'On the other hand, some students prefer to go at a slower pace and choose to take '
+            'several years to complete their studies.',
+            'It can take anywhere from two semesters'
+        ]
+    }
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        tokenizer = PassageRankingPreprocessor(cache_path)
+        model = PassageRanking.from_pretrained(cache_path)
+        pipeline1 = PassageRankingPipeline(model, preprocessor=tokenizer)
+        pipeline2 = pipeline(
+            Tasks.passage_ranking, model=model, preprocessor=tokenizer)
+        print(f'sentence: {self.inputs}\n'
+              f'pipeline1:{pipeline1(input=self.inputs)}')
+        print()
+        print(f'pipeline2: {pipeline2(input=self.inputs)}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        tokenizer = PassageRankingPreprocessor(model.model_dir)
+        pipeline_ins = pipeline(
+            task=Tasks.passage_ranking, model=model, preprocessor=tokenizer)
+        print(pipeline_ins(input=self.inputs))
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.passage_ranking, model=self.model_id)
+        print(pipeline_ins(input=self.inputs))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.passage_ranking)
+        print(pipeline_ins(input=self.inputs))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_person_image_cartoon.py b/tests/pipelines/test_person_image_cartoon.py
index bdbf8b61..b8549f4f 100644
--- a/tests/pipelines/test_person_image_cartoon.py
+++ b/tests/pipelines/test_person_image_cartoon.py
@@ -1,5 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 import os.path as osp
 import unittest
 
@@ -9,13 +8,19 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ImageCartoonTest(unittest.TestCase):
+class ImageCartoonTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/cv_unet_person-image-cartoon_compound-models'
+        self.model_id_3d = 'damo/cv_unet_person-image-cartoon-3d_compound-models'
+        self.model_id_handdrawn = 'damo/cv_unet_person-image-cartoon-handdrawn_compound-models'
+        self.model_id_sketch = 'damo/cv_unet_person-image-cartoon-sketch_compound-models'
+        self.model_id_artstyle = 'damo/cv_unet_person-image-cartoon-artstyle_compound-models'
+        self.task = Tasks.image_portrait_stylization
         self.test_image = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_cartoon.png'
 
     def pipeline_inference(self, pipeline: Pipeline, input_location: str):
@@ -30,11 +35,39 @@ class ImageCartoonTest(unittest.TestCase):
             Tasks.image_portrait_stylization, model=self.model_id)
         self.pipeline_inference(img_cartoon, self.test_image)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub_3d(self):
+        img_cartoon = pipeline(
+            Tasks.image_portrait_stylization, model=self.model_id_3d)
+        self.pipeline_inference(img_cartoon, self.test_image)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub_handdrawn(self):
+        img_cartoon = pipeline(
+            Tasks.image_portrait_stylization, model=self.model_id_handdrawn)
+        self.pipeline_inference(img_cartoon, self.test_image)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub_sketch(self):
+        img_cartoon = pipeline(
+            Tasks.image_portrait_stylization, model=self.model_id_sketch)
+        self.pipeline_inference(img_cartoon, self.test_image)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub_artstyle(self):
+        img_cartoon = pipeline(
+            Tasks.image_portrait_stylization, model=self.model_id_artstyle)
+        self.pipeline_inference(img_cartoon, self.test_image)
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
         img_cartoon = pipeline(Tasks.image_portrait_stylization)
         self.pipeline_inference(img_cartoon, self.test_image)
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_plug_text_generation.py b/tests/pipelines/test_plug_text_generation.py
new file mode 100644
index 00000000..90b48efa
--- /dev/null
+++ b/tests/pipelines/test_plug_text_generation.py
@@ -0,0 +1,49 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+
+class TextPlugGenerationTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        # please make sure this local path exists.
+        self.model_id = 'damo/nlp_plug_text-generation_27B'
+        self.model_dir = snapshot_download(self.model_id)
+        self.plug_input = '段誉轻挥折扇，摇了摇头，说道：“你师父是你的师父，你师父可不是我的师父。"'
+
+    @unittest.skip('distributed plug, skipped')
+    def test_plug(self):
+        """ The model can be downloaded from the link on
+        https://modelscope.cn/models/damo/nlp_plug_text-generation_27B/summary.
+        After downloading, you should have a plug model structure like this:
+        nlp_plug_text-generation_27B
+            |_ config.json
+            |_ configuration.json
+            |_ ds_zero-offload_10B_config.json
+            |_ vocab.txt
+            |_ model <-- an empty directory
+
+        Model binaries shall be downloaded separately to populate the model directory, so that
+        the model directory would contain the following binaries:
+            |_ model
+                |_ mp_rank_00_model_states.pt
+                |_ mp_rank_01_model_states.pt
+                |_ mp_rank_02_model_states.pt
+                |_ mp_rank_03_model_states.pt
+                |_ mp_rank_04_model_states.pt
+                |_ mp_rank_05_model_states.pt
+                |_ mp_rank_06_model_states.pt
+                |_ mp_rank_07_model_states.pt
+        """
+        # download model binaries to <model_dir>/model
+        pipe = pipeline(Tasks.text_generation, model=self.model_id)
+        print(
+            f'input: {self.plug_input}\noutput: {pipe(self.plug_input, out_length=256)}'
+        )
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_product_retrieval_embedding.py b/tests/pipelines/test_product_retrieval_embedding.py
index c416943e..2483d53a 100644
--- a/tests/pipelines/test_product_retrieval_embedding.py
+++ b/tests/pipelines/test_product_retrieval_embedding.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 import numpy as np
@@ -6,11 +8,16 @@ from modelscope.models import Model
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class ProductRetrievalEmbeddingTest(unittest.TestCase):
-    model_id = 'damo/cv_resnet50_product-bag-embedding-models'
+class ProductRetrievalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.product_retrieval_embedding
+        self.model_id = 'damo/cv_resnet50_product-bag-embedding-models'
+
     img_input = 'data/test/images/product_embed_bag.jpg'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@@ -34,6 +41,10 @@ class ProductRetrievalEmbeddingTest(unittest.TestCase):
         result = product_embed(self.img_input)[OutputKeys.IMG_EMBEDDING]
         print('abs sum value is: {}'.format(np.sum(np.abs(result))))
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_product_segmentation.py b/tests/pipelines/test_product_segmentation.py
new file mode 100644
index 00000000..8f41c13c
--- /dev/null
+++ b/tests/pipelines/test_product_segmentation.py
@@ -0,0 +1,43 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import unittest
+
+import cv2
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+
+logger = get_logger()
+
+
+class ProductSegmentationTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_F3Net_product-segmentation'
+        self.input = {
+            'input_path': 'data/test/images/product_segmentation.jpg'
+        }
+
+    def pipeline_inference(self, pipeline: Pipeline, input: dict):
+        result = pipeline(input)
+        cv2.imwrite('test_product_segmentation_mask.jpg',
+                    result[OutputKeys.MASKS])
+        logger.info('test done')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        product_segmentation = pipeline(
+            Tasks.product_segmentation, model=self.model_id)
+        self.pipeline_inference(product_segmentation, self.input)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_modelhub_default_model(self):
+        product_segmentation = pipeline(Tasks.product_segmentation)
+        self.pipeline_inference(product_segmentation, self.input)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_realtime_object_detection.py b/tests/pipelines/test_realtime_object_detection.py
new file mode 100644
index 00000000..e04f6b5c
--- /dev/null
+++ b/tests/pipelines/test_realtime_object_detection.py
@@ -0,0 +1,56 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import cv2
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import realtime_object_detection_bbox_vis
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class RealtimeObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_cspnet_image-object-detection_yolox'
+        self.model_nano_id = 'damo/cv_cspnet_image-object-detection_yolox_nano_coco'
+        self.test_image = 'data/test/images/keypoints_detect/000000438862.jpg'
+        self.task = Tasks.image_object_detection
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        realtime_object_detection = pipeline(
+            Tasks.image_object_detection, model=self.model_id)
+
+        image = cv2.imread(self.test_image)
+        result = realtime_object_detection(image)
+        if result:
+            bboxes = result[OutputKeys.BOXES].astype(int)
+            image = realtime_object_detection_bbox_vis(image, bboxes)
+            cv2.imwrite('rt_obj_out.jpg', image)
+        else:
+            raise ValueError('process error')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_nano(self):
+        realtime_object_detection = pipeline(
+            Tasks.image_object_detection, model=self.model_nano_id)
+
+        image = cv2.imread(self.test_image)
+        result = realtime_object_detection(image)
+        if result:
+            bboxes = result[OutputKeys.BOXES].astype(int)
+            image = realtime_object_detection_bbox_vis(image, bboxes)
+            cv2.imwrite('rtnano_obj_out.jpg', image)
+        else:
+            raise ValueError('process error')
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_relation_extraction.py b/tests/pipelines/test_relation_extraction.py
new file mode 100644
index 00000000..57d98f66
--- /dev/null
+++ b/tests/pipelines/test_relation_extraction.py
@@ -0,0 +1,64 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.nlp import InformationExtractionModel
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import InformationExtractionPipeline
+from modelscope.preprocessors import RelationExtractionPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class RelationExtractionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.information_extraction
+        self.model_id = 'damo/nlp_bert_relation-extraction_chinese-base'
+
+    sentence = '高捷，祖籍江苏，本科毕业于东南大学'
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        tokenizer = RelationExtractionPreprocessor(cache_path)
+        model = InformationExtractionModel.from_pretrained(cache_path)
+        pipeline1 = InformationExtractionPipeline(
+            model, preprocessor=tokenizer)
+        pipeline2 = pipeline(
+            Tasks.information_extraction, model=model, preprocessor=tokenizer)
+        print(f'sentence: {self.sentence}\n'
+              f'pipeline1:{pipeline1(input=self.sentence)}')
+        print()
+        print(f'pipeline2: {pipeline2(input=self.sentence)}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        tokenizer = RelationExtractionPreprocessor(model.model_dir)
+        pipeline_ins = pipeline(
+            task=Tasks.information_extraction,
+            model=model,
+            preprocessor=tokenizer)
+        print(pipeline_ins(input=self.sentence))
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.information_extraction, model=self.model_id)
+        print(pipeline_ins(input=self.sentence))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.information_extraction)
+        print(pipeline_ins(input=self.sentence))
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_retina_face_detection.py b/tests/pipelines/test_retina_face_detection.py
new file mode 100644
index 00000000..343e1c91
--- /dev/null
+++ b/tests/pipelines/test_retina_face_detection.py
@@ -0,0 +1,33 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+import cv2
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import draw_face_detection_result
+from modelscope.utils.test_utils import test_level
+
+
+class RetinaFaceDetectionTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_resnet50_face-detection_retinaface'
+
+    def show_result(self, img_path, detection_result):
+        img = draw_face_detection_result(img_path, detection_result)
+        cv2.imwrite('result.png', img)
+        print(f'output written to {osp.abspath("result.png")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        face_detection = pipeline(Tasks.face_detection, model=self.model_id)
+        img_path = 'data/test/images/retina_face_detection.jpg'
+
+        result = face_detection(img_path)
+        self.show_result(img_path, result)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_salient_detection.py b/tests/pipelines/test_salient_detection.py
index ec010b17..bcb904e6 100644
--- a/tests/pipelines/test_salient_detection.py
+++ b/tests/pipelines/test_salient_detection.py
@@ -4,21 +4,29 @@ import unittest
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class SalientDetectionTest(unittest.TestCase):
+class SalientDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.semantic_segmentation
+        self.model_id = 'damo/cv_u2net_salient-detection'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_salient_detection(self):
         input_location = 'data/test/images/image_salient_detection.jpg'
         model_id = 'damo/cv_u2net_salient-detection'
-        salient_detect = pipeline(Tasks.image_segmentation, model=model_id)
+        salient_detect = pipeline(Tasks.semantic_segmentation, model=model_id)
         result = salient_detect(input_location)
         import cv2
-        # result[OutputKeys.MASKS] is salient map result,other keys are not used
         cv2.imwrite(input_location + '_salient.jpg', result[OutputKeys.MASKS])
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_sentence_embedding.py b/tests/pipelines/test_sentence_embedding.py
new file mode 100644
index 00000000..739dd7ab
--- /dev/null
+++ b/tests/pipelines/test_sentence_embedding.py
@@ -0,0 +1,82 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import shutil
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.nlp import SentenceEmbedding
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import SentenceEmbeddingPipeline
+from modelscope.preprocessors import SentenceEmbeddingPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class SentenceEmbeddingTest(unittest.TestCase):
+    model_id = 'damo/nlp_corom_sentence-embedding_english-base'
+    inputs = {
+        'source_sentence': ["how long it take to get a master's degree"],
+        'sentences_to_compare': [
+            "On average, students take about 18 to 24 months to complete a master's degree.",
+            'On the other hand, some students prefer to go at a slower pace and choose to take '
+            'several years to complete their studies.',
+            'It can take anywhere from two semesters'
+        ]
+    }
+
+    inputs2 = {
+        'source_sentence': ["how long it take to get a master's degree"],
+        'sentences_to_compare': [
+            "On average, students take about 18 to 24 months to complete a master's degree."
+        ]
+    }
+
+    inputs3 = {
+        'source_sentence': ["how long it take to get a master's degree"],
+        'sentences_to_compare': []
+    }
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        tokenizer = SentenceEmbeddingPreprocessor(cache_path)
+        model = SentenceEmbedding.from_pretrained(cache_path)
+        pipeline1 = SentenceEmbeddingPipeline(model, preprocessor=tokenizer)
+        pipeline2 = pipeline(
+            Tasks.sentence_embedding, model=model, preprocessor=tokenizer)
+        print(f'inputs: {self.inputs}\n'
+              f'pipeline1:{pipeline1(input=self.inputs)}')
+        print()
+        print(f'pipeline2: {pipeline2(input=self.inputs)}')
+        print()
+        print(f'inputs: {self.inputs2}\n'
+              f'pipeline1:{pipeline1(input=self.inputs2)}')
+        print()
+        print(f'pipeline2: {pipeline2(input=self.inputs2)}')
+        print(f'inputs: {self.inputs3}\n'
+              f'pipeline1:{pipeline1(input=self.inputs3)}')
+        print()
+        print(f'pipeline2: {pipeline2(input=self.inputs3)}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        tokenizer = SentenceEmbeddingPreprocessor(model.model_dir)
+        pipeline_ins = pipeline(
+            task=Tasks.sentence_embedding, model=model, preprocessor=tokenizer)
+        print(pipeline_ins(input=self.inputs))
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.sentence_embedding, model=self.model_id)
+        print(pipeline_ins(input=self.inputs))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.sentence_embedding)
+        print(pipeline_ins(input=self.inputs))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py
index d39f6783..288d38c7 100644
--- a/tests/pipelines/test_sentence_similarity.py
+++ b/tests/pipelines/test_sentence_similarity.py
@@ -5,23 +5,30 @@ from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
 from modelscope.models.nlp import SbertForSequenceClassification
 from modelscope.pipelines import pipeline
-from modelscope.pipelines.nlp import PairSentenceClassificationPipeline
-from modelscope.preprocessors import PairSentenceClassificationPreprocessor
+from modelscope.pipelines.nlp import SequenceClassificationPipeline
+from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.regress_test_utils import MsRegressTool
 from modelscope.utils.test_utils import test_level
 
 
-class SentenceSimilarityTest(unittest.TestCase):
-    model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
+class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.sentence_similarity
+        self.model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
+
     sentence1 = '今天气温比昨天高么？'
     sentence2 = '今天湿度比昨天高么？'
+    regress_tool = MsRegressTool(baseline=False)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run(self):
         cache_path = snapshot_download(self.model_id)
-        tokenizer = PairSentenceClassificationPreprocessor(cache_path)
+        tokenizer = SequenceClassificationPreprocessor(cache_path)
         model = SbertForSequenceClassification.from_pretrained(cache_path)
-        pipeline1 = PairSentenceClassificationPipeline(
+        pipeline1 = SequenceClassificationPipeline(
             model, preprocessor=tokenizer)
         pipeline2 = pipeline(
             Tasks.sentence_similarity, model=model, preprocessor=tokenizer)
@@ -36,7 +43,7 @@ class SentenceSimilarityTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
-        tokenizer = PairSentenceClassificationPreprocessor(model.model_dir)
+        tokenizer = SequenceClassificationPreprocessor(model.model_dir)
         pipeline_ins = pipeline(
             task=Tasks.sentence_similarity,
             model=model,
@@ -47,13 +54,19 @@ class SentenceSimilarityTest(unittest.TestCase):
     def test_run_with_model_name(self):
         pipeline_ins = pipeline(
             task=Tasks.sentence_similarity, model=self.model_id)
-        print(pipeline_ins(input=(self.sentence1, self.sentence2)))
+        with self.regress_tool.monitor_module_single_forward(
+                pipeline_ins.model, 'sbert_sen_sim'):
+            print(pipeline_ins(input=(self.sentence1, self.sentence2)))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.sentence_similarity)
         print(pipeline_ins(input=(self.sentence1, self.sentence2)))
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py
index f3bc6981..d0b1b40f 100644
--- a/tests/pipelines/test_sentiment_classification.py
+++ b/tests/pipelines/test_sentiment_classification.py
@@ -6,40 +6,43 @@ from modelscope.models import Model
 from modelscope.models.nlp.task_models.sequence_classification import \
     SequenceClassificationModel
 from modelscope.pipelines import pipeline
-from modelscope.pipelines.nlp import SingleSentenceClassificationPipeline
-from modelscope.preprocessors import SingleSentenceClassificationPreprocessor
+from modelscope.pipelines.nlp import SequenceClassificationPipeline
+from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class SentimentClassificationTaskModelTest(unittest.TestCase):
-    model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base'
+class SentimentClassificationTaskModelTest(unittest.TestCase,
+                                           DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.text_classification
+        self.model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base'
+
     sentence1 = '启动的时候很大声音，然后就会听到1.2秒的卡察的声音，类似齿轮摩擦的声音'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_direct_file_download(self):
-        cache_path = snapshot_download(self.model_id)
-        tokenizer = SingleSentenceClassificationPreprocessor(cache_path)
+        cache_path = snapshot_download(self.model_id, revision='beta')
+        tokenizer = SequenceClassificationPreprocessor(cache_path)
         model = SequenceClassificationModel.from_pretrained(
-            self.model_id, num_labels=2)
-        pipeline1 = SingleSentenceClassificationPipeline(
+            self.model_id, num_labels=2, revision='beta')
+        pipeline1 = SequenceClassificationPipeline(
             model, preprocessor=tokenizer)
         pipeline2 = pipeline(
-            Tasks.sentiment_classification,
-            model=model,
-            preprocessor=tokenizer)
+            Tasks.text_classification, model=model, preprocessor=tokenizer)
         print(f'sentence1: {self.sentence1}\n'
               f'pipeline1:{pipeline1(input=self.sentence1)}')
-        print()
         print(f'sentence1: {self.sentence1}\n'
               f'pipeline1: {pipeline2(input=self.sentence1)}')
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
-        model = Model.from_pretrained(self.model_id)
-        tokenizer = SingleSentenceClassificationPreprocessor(model.model_dir)
+        model = Model.from_pretrained(self.model_id, revision='beta')
+        tokenizer = SequenceClassificationPreprocessor(model.model_dir)
         pipeline_ins = pipeline(
-            task=Tasks.sentiment_classification,
+            task=Tasks.text_classification,
             model=model,
             preprocessor=tokenizer)
         print(pipeline_ins(input=self.sentence1))
@@ -49,18 +52,25 @@ class SentimentClassificationTaskModelTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
         pipeline_ins = pipeline(
-            task=Tasks.sentiment_classification, model=self.model_id)
+            task=Tasks.text_classification,
+            model=self.model_id,
+            model_revision='beta')
         print(pipeline_ins(input=self.sentence1))
         self.assertTrue(
             isinstance(pipeline_ins.model, SequenceClassificationModel))
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_default_model(self):
-        pipeline_ins = pipeline(task=Tasks.sentiment_classification)
+        pipeline_ins = pipeline(
+            task=Tasks.text_classification, model_revision='beta')
         print(pipeline_ins(input=self.sentence1))
         self.assertTrue(
             isinstance(pipeline_ins.model, SequenceClassificationModel))
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_shop_segmentation.py b/tests/pipelines/test_shop_segmentation.py
new file mode 100644
index 00000000..58c56dd7
--- /dev/null
+++ b/tests/pipelines/test_shop_segmentation.py
@@ -0,0 +1,24 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class ShopSegmentationTest(unittest.TestCase):
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_shop_segmentation(self):
+        input_location = 'data/test/images/shop_segmentation.jpg'
+        model_id = 'damo/cv_vitb16_segmentation_shop-seg'
+        shop_seg = pipeline(Tasks.shop_segmentation, model=model_id)
+        result = shop_seg(input_location)
+        import cv2
+        # Only result[OutputKeys.MASKS] (the segmentation map) is used.
+        cv2.imwrite(input_location + '_shopseg.jpg', result[OutputKeys.MASKS])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_skin_retouching.py b/tests/pipelines/test_skin_retouching.py
index c6dbee2c..db8d89ed 100644
--- a/tests/pipelines/test_skin_retouching.py
+++ b/tests/pipelines/test_skin_retouching.py
@@ -9,12 +9,14 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class SkinRetouchingTest(unittest.TestCase):
+class SkinRetouchingTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.skin_retouching
         self.model_id = 'damo/cv_unet_skin-retouching'
         self.test_image = 'data/test/images/skin_retouching.png'
 
@@ -39,6 +41,10 @@ class SkinRetouchingTest(unittest.TestCase):
         skin_retouching = pipeline(Tasks.skin_retouching)
         self.pipeline_inference(skin_retouching, self.test_image)
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_speech_signal_process.py b/tests/pipelines/test_speech_signal_process.py
index 007e6c73..e5f97c02 100644
--- a/tests/pipelines/test_speech_signal_process.py
+++ b/tests/pipelines/test_speech_signal_process.py
@@ -1,25 +1,35 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import os.path
-import shutil
 import unittest
 
-from modelscope.fileio import File
 from modelscope.metainfo import Pipelines
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 NEAREND_MIC_FILE = 'data/test/audios/nearend_mic.wav'
 FAREND_SPEECH_FILE = 'data/test/audios/farend_speech.wav'
+NEAREND_MIC_URL = 'https://modelscope.cn/api/v1/models/damo/' \
+                  'speech_dfsmn_aec_psm_16k/repo?Revision=master' \
+                  '&FilePath=examples/nearend_mic.wav'
+FAREND_SPEECH_URL = 'https://modelscope.cn/api/v1/models/damo/' \
+                    'speech_dfsmn_aec_psm_16k/repo?Revision=master' \
+                    '&FilePath=examples/farend_speech.wav'
 
 NOISE_SPEECH_FILE = 'data/test/audios/speech_with_noise.wav'
+NOISE_SPEECH_URL = 'https://modelscope.cn/api/v1/models/damo/' \
+                   'speech_frcrn_ans_cirm_16k/repo?Revision=master' \
+                   '&FilePath=examples/speech_with_noise.wav'
 
 
-class SpeechSignalProcessTest(unittest.TestCase):
+class SpeechSignalProcessTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         pass
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_aec(self):
         model_id = 'damo/speech_dfsmn_aec_psm_16k'
         input = {
@@ -31,6 +41,18 @@ class SpeechSignalProcessTest(unittest.TestCase):
         aec(input, output_path=output_path)
         print(f'Processed audio saved to {output_path}')
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_aec_url(self):
+        model_id = 'damo/speech_dfsmn_aec_psm_16k'
+        input = {
+            'nearend_mic': NEAREND_MIC_URL,
+            'farend_speech': FAREND_SPEECH_URL
+        }
+        aec = pipeline(Tasks.acoustic_echo_cancellation, model=model_id)
+        output_path = os.path.abspath('output.wav')
+        aec(input, output_path=output_path)
+        print(f'Processed audio saved to {output_path}')
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_aec_bytes(self):
         model_id = 'damo/speech_dfsmn_aec_psm_16k'
@@ -63,7 +85,7 @@ class SpeechSignalProcessTest(unittest.TestCase):
         aec(inputs, output_path=output_path)
         print(f'Processed audio saved to {output_path}')
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_ans(self):
         model_id = 'damo/speech_frcrn_ans_cirm_16k'
         ans = pipeline(Tasks.acoustic_noise_suppression, model=model_id)
@@ -72,6 +94,14 @@ class SpeechSignalProcessTest(unittest.TestCase):
             output_path=output_path)
         print(f'Processed audio saved to {output_path}')
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_ans_url(self):
+        model_id = 'damo/speech_frcrn_ans_cirm_16k'
+        ans = pipeline(Tasks.acoustic_noise_suppression, model=model_id)
+        output_path = os.path.abspath('output.wav')
+        ans(NOISE_SPEECH_URL, output_path=output_path)
+        print(f'Processed audio saved to {output_path}')
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_ans_bytes(self):
         model_id = 'damo/speech_frcrn_ans_cirm_16k'
@@ -85,6 +115,10 @@ class SpeechSignalProcessTest(unittest.TestCase):
             ans(data, output_path=output_path)
         print(f'Processed audio saved to {output_path}')
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_table_question_answering.py b/tests/pipelines/test_table_question_answering.py
new file mode 100644
index 00000000..7ea28725
--- /dev/null
+++ b/tests/pipelines/test_table_question_answering.py
@@ -0,0 +1,75 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import unittest
+
+from transformers import BertTokenizer
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import TableQuestionAnsweringPipeline
+from modelscope.preprocessors import TableQuestionAnsweringPreprocessor
+from modelscope.preprocessors.star3.fields.database import Database
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.nlp.nlp_utils import tableqa_tracking_and_print_results
+from modelscope.utils.test_utils import test_level
+
+
+class TableQuestionAnswering(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.task = Tasks.table_question_answering
+        self.model_id = 'damo/nlp_convai_text2sql_pretrain_cn'
+
+    model_id = 'damo/nlp_convai_text2sql_pretrain_cn'
+    test_case = {
+        'utterance':
+        ['长江流域的小(2)型水库的库容总量是多少？', '那平均值是多少？', '那水库的名称呢？', '换成中型的呢？']
+    }
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        preprocessor = TableQuestionAnsweringPreprocessor(model_dir=cache_path)
+        pipelines = [
+            TableQuestionAnsweringPipeline(
+                model=cache_path, preprocessor=preprocessor)
+        ]
+        tableqa_tracking_and_print_results(self.test_case, pipelines)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        preprocessor = TableQuestionAnsweringPreprocessor(
+            model_dir=model.model_dir)
+        pipelines = [
+            TableQuestionAnsweringPipeline(
+                model=model, preprocessor=preprocessor)
+        ]
+        tableqa_tracking_and_print_results(self.test_case, pipelines)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_task(self):
+        pipelines = [pipeline(Tasks.table_question_answering, self.model_id)]
+        tableqa_tracking_and_print_results(self.test_case, pipelines)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_model_from_modelhub_with_other_classes(self):
+        model = Model.from_pretrained(self.model_id)
+        self.tokenizer = BertTokenizer(
+            os.path.join(model.model_dir, ModelFile.VOCAB_FILE))
+        db = Database(
+            tokenizer=self.tokenizer,
+            table_file_path=os.path.join(model.model_dir, 'table.json'),
+            syn_dict_file_path=os.path.join(model.model_dir, 'synonym.txt'))
+        preprocessor = TableQuestionAnsweringPreprocessor(
+            model_dir=model.model_dir, db=db)
+        pipelines = [
+            TableQuestionAnsweringPipeline(
+                model=model, preprocessor=preprocessor, db=db)
+        ]
+        tableqa_tracking_and_print_results(self.test_case, pipelines)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_text2text_generation.py b/tests/pipelines/test_text2text_generation.py
new file mode 100644
index 00000000..a39562f5
--- /dev/null
+++ b/tests/pipelines/test_text2text_generation.py
@@ -0,0 +1,64 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.nlp import T5ForConditionalGeneration
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import Text2TextGenerationPipeline
+from modelscope.preprocessors import Text2TextGenerationPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
+    """Pipeline tests for the text2text-generation task (T5 test model)."""
+
+    def setUp(self) -> None:
+        # Sibling demo-compatibility tests set self.task; mirror them here.
+        self.task = Tasks.text2text_generation
+        self.model_id = 'damo/t5-cn-base-test'
+        self.input = '中国的首都位于<extra_id_0>。'
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_T5(self):
+        cache_path = snapshot_download(self.model_id)
+        model = T5ForConditionalGeneration(cache_path)
+        preprocessor = Text2TextGenerationPreprocessor(cache_path)
+        pipeline1 = Text2TextGenerationPipeline(model, preprocessor)
+        pipeline2 = pipeline(
+            Tasks.text2text_generation, model=model, preprocessor=preprocessor)
+        print(
+            f'pipeline1: {pipeline1(self.input)}\npipeline2: {pipeline2(self.input)}'
+        )
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_pipeline_with_model_instance(self):
+        model = Model.from_pretrained(self.model_id)
+        preprocessor = Text2TextGenerationPreprocessor(model.model_dir)
+        pipeline_ins = pipeline(
+            task=Tasks.text2text_generation,
+            model=model,
+            preprocessor=preprocessor)
+        print(pipeline_ins(self.input))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_pipeline_with_model_id(self):
+        pipeline_ins = pipeline(
+            task=Tasks.text2text_generation, model=self.model_id)
+        print(pipeline_ins(self.input))
+
+    @unittest.skip(
+        'only for test cases, there is no default official model yet')
+    def test_run_pipeline_without_model_id(self):
+        pipeline_ins = pipeline(task=Tasks.text2text_generation)
+        print(pipeline_ins(self.input))
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py
index 542568d1..39dbac99 100644
--- a/tests/pipelines/test_text_classification.py
+++ b/tests/pipelines/test_text_classification.py
@@ -6,14 +6,17 @@ from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import SequenceClassificationPipeline
 from modelscope.preprocessors import SequenceClassificationPreprocessor
-from modelscope.utils.constant import Hubs, Tasks
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class SequenceClassificationTest(unittest.TestCase):
+class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
+    sentence1 = 'i like this wonderful place'
 
     def setUp(self) -> None:
         self.model_id = 'damo/bert-base-sst2'
+        self.task = Tasks.text_classification
 
     def predict(self, pipeline_ins: SequenceClassificationPipeline):
         from easynlp.appzoo import load_dataset
@@ -44,7 +47,8 @@ class SequenceClassificationTest(unittest.TestCase):
             task=Tasks.text_classification,
             model=model,
             preprocessor=preprocessor)
-        self.predict(pipeline_ins)
+        print(f'sentence1: {self.sentence1}\n'
+              f'pipeline1:{pipeline_ins(input=self.sentence1)}')
 
     # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     @unittest.skip('nlp model does not support tensor input, skipped')
@@ -87,6 +91,10 @@ class SequenceClassificationTest(unittest.TestCase):
         result = text_classification(dataset)
         self.printDataset(result)
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_text_driven_segmentation.py b/tests/pipelines/test_text_driven_segmentation.py
new file mode 100644
index 00000000..a67729ff
--- /dev/null
+++ b/tests/pipelines/test_text_driven_segmentation.py
@@ -0,0 +1,39 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class TextDrivenSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
+    """Pipeline test for the text-driven segmentation task."""
+
+    def setUp(self) -> None:
+        # Same attributes the sibling demo-compatibility tests define.
+        self.task = Tasks.text_driven_segmentation
+        self.model_id = 'damo/cv_vitl16_segmentation_text-driven-seg'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_text_driven_segmentation(self):
+        input_location = 'data/test/images/text_driven_segmentation.jpg'
+        test_input = {
+            'image': input_location,
+            'text': 'bear',
+        }
+        text_seg = pipeline(self.task, model=self.model_id)
+        result = text_seg(test_input)
+        import cv2
+        # Only result[OutputKeys.MASKS] (the segmentation map) is used.
+        cv2.imwrite(input_location + '_lseg.jpg', result[OutputKeys.MASKS])
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_text_error_correction.py b/tests/pipelines/test_text_error_correction.py
index 5a1890ce..a714d3d0 100644
--- a/tests/pipelines/test_text_error_correction.py
+++ b/tests/pipelines/test_text_error_correction.py
@@ -8,11 +8,16 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import TextErrorCorrectionPipeline
 from modelscope.preprocessors import TextErrorCorrectionPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class TextErrorCorrectionTest(unittest.TestCase):
-    model_id = 'damo/nlp_bart_text-error-correction_chinese'
+class TextErrorCorrectionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.text_error_correction
+        self.model_id = 'damo/nlp_bart_text-error-correction_chinese'
+
     input = '随着中国经济突飞猛近，建造工业与日俱增'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -50,6 +55,10 @@ class TextErrorCorrectionTest(unittest.TestCase):
         pipeline_ins = pipeline(task=Tasks.text_error_correction)
         print(pipeline_ins(self.input))
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py
index c08209a4..66f9c9da 100644
--- a/tests/pipelines/test_text_generation.py
+++ b/tests/pipelines/test_text_generation.py
@@ -8,10 +8,11 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import TextGenerationPipeline
 from modelscope.preprocessors import TextGenerationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class TextGenerationTest(unittest.TestCase):
+class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.palm_model_id_zh = 'damo/nlp_palm2.0_text-generation_chinese-base'
@@ -128,6 +129,10 @@ class TextGenerationTest(unittest.TestCase):
         pipeline_ins = pipeline(task=Tasks.text_generation)
         print(pipeline_ins(self.palm_input_zh))
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_text_to_image_synthesis.py b/tests/pipelines/test_text_to_image_synthesis.py
index 32778ffb..0da6768a 100644
--- a/tests/pipelines/test_text_to_image_synthesis.py
+++ b/tests/pipelines/test_text_to_image_synthesis.py
@@ -8,11 +8,16 @@ from modelscope.models import Model
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class TextToImageSynthesisTest(unittest.TestCase):
-    model_id = 'damo/cv_diffusion_text-to-image-synthesis_tiny'
+class TextToImageSynthesisTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.text_to_image_synthesis
+        self.model_id = 'damo/cv_diffusion_text-to-image-synthesis_tiny'
+
     test_text = {
         'text': '宇航员',
         'generator_ddim_timesteps': 2,
@@ -46,6 +51,10 @@ class TextToImageSynthesisTest(unittest.TestCase):
             self.test_text)[OutputKeys.OUTPUT_IMG]
         print(np.sum(np.abs(img)))
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py
index 74cab01f..f659e59b 100644
--- a/tests/pipelines/test_text_to_speech.py
+++ b/tests/pipelines/test_text_to_speech.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 # NOTICE: Tensorflow 1.15 seems not so compatible with pytorch.
@@ -7,9 +9,11 @@ import unittest
 import torch
 from scipy.io.wavfile import write
 
+from modelscope.models import Model
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.logger import get_logger
 from modelscope.utils.test_utils import test_level
 
@@ -18,22 +22,31 @@ import tensorflow as tf  # isort:skip
 logger = get_logger()
 
 
-class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase):
+class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase,
+                                                DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.text_to_speech
+        self.model_id = 'damo/speech_sambert-hifigan_tts_zhitian_emo_zh-cn_16k'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_pipeline(self):
         text = '今天北京天气怎么样？'
-        model_id = 'damo/speech_sambert-hifigan_tts_zhitian_emo_zh-cn_16k'
         voice = 'zhitian_emo'
 
-        sambert_hifigan_tts = pipeline(
-            task=Tasks.text_to_speech, model=model_id)
+        model = Model.from_pretrained(
+            model_name_or_path=self.model_id, revision='pytorch_am')
+        sambert_hifigan_tts = pipeline(task=self.task, model=model)
         self.assertTrue(sambert_hifigan_tts is not None)
         output = sambert_hifigan_tts(input=text, voice=voice)
         self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM])
         pcm = output[OutputKeys.OUTPUT_PCM]
         write('output.wav', 16000, pcm)
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_tinynas_classification.py b/tests/pipelines/test_tinynas_classification.py
index d64b5bc0..ebc6b722 100644
--- a/tests/pipelines/test_tinynas_classification.py
+++ b/tests/pipelines/test_tinynas_classification.py
@@ -1,11 +1,18 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class TinyNASClassificationTest(unittest.TestCase):
+class TinyNASClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_classification
+        self.model_id = 'damo/cv_tinynas_classification'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run(self):
@@ -14,6 +21,10 @@ class TinyNASClassificationTest(unittest.TestCase):
         result = tinynas_classification('data/test/images/image_wolf.jpeg')
         print(result)
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_tinynas_detection.py b/tests/pipelines/test_tinynas_detection.py
new file mode 100644
index 00000000..63db9145
--- /dev/null
+++ b/tests/pipelines/test_tinynas_detection.py
@@ -0,0 +1,32 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class TinynasObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
+    """Pipeline test for TinyNAS object detection."""
+
+    def setUp(self) -> None:
+        # Same attributes the sibling demo-compatibility tests define.
+        self.task = Tasks.image_object_detection
+        self.model_id = 'damo/cv_tinynas_detection'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run(self):
+        tinynas_object_detection = pipeline(self.task, model=self.model_id)
+        result = tinynas_object_detection(
+            'data/test/images/image_detection.jpg')
+        print(result)
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_ulfd_face_detection.py b/tests/pipelines/test_ulfd_face_detection.py
new file mode 100644
index 00000000..0ffa688c
--- /dev/null
+++ b/tests/pipelines/test_ulfd_face_detection.py
@@ -0,0 +1,36 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+import cv2
+import numpy as np
+
+from modelscope.msdatasets import MsDataset
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import draw_face_detection_no_lm_result
+from modelscope.utils.test_utils import test_level
+
+
+class UlfdFaceDetectionTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_manual_face-detection_ulfd'
+
+    def show_result(self, img_path, detection_result):
+        img = draw_face_detection_no_lm_result(img_path, detection_result)
+        cv2.imwrite('result.png', img)
+        print(f'output written to {osp.abspath("result.png")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        face_detection = pipeline(Tasks.face_detection, model=self.model_id)
+        img_path = 'data/test/images/ulfd_face_detection.jpg'
+
+        result = face_detection(img_path)
+        self.show_result(img_path, result)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_video_category.py b/tests/pipelines/test_video_category.py
index aba56676..660196b8 100644
--- a/tests/pipelines/test_video_category.py
+++ b/tests/pipelines/test_video_category.py
@@ -3,20 +3,28 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class VideoCategoryTest(unittest.TestCase):
+class VideoCategoryTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.video_category
+        self.model_id = 'damo/cv_resnet50_video-category'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub(self):
-        category_pipeline = pipeline(
-            Tasks.video_category, model='damo/cv_resnet50_video-category')
+        category_pipeline = pipeline(Tasks.video_category, self.model_id)
         result = category_pipeline(
             'data/test/videos/video_category_test_video.mp4')
 
         print(f'video category output: {result}.')
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_video_inpainting.py b/tests/pipelines/test_video_inpainting.py
new file mode 100644
index 00000000..8364b1b3
--- /dev/null
+++ b/tests/pipelines/test_video_inpainting.py
@@ -0,0 +1,39 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class VideoInpaintingTest(unittest.TestCase):
+    """Runs the video inpainting pipeline with an explicit/default model."""
+
+    def setUp(self) -> None:
+        self.model = 'damo/cv_video-inpainting'
+        self.mask_dir = 'data/test/videos/mask_dir'
+        self.video_in = 'data/test/videos/video_inpainting_test.mp4'
+        self.video_out = 'out.mp4'
+        self.input = {
+            'video_input_path': self.video_in,
+            'video_output_path': self.video_out,
+            'mask_path': self.mask_dir
+        }
+
+    def pipeline_inference(self, pipeline: Pipeline, input: dict):
+        print(pipeline(input))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        video_inpainting = pipeline(Tasks.video_inpainting, model=self.model)
+        self.pipeline_inference(video_inpainting, self.input)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_modelhub_default_model(self):
+        video_inpainting = pipeline(Tasks.video_inpainting)
+        self.pipeline_inference(video_inpainting, self.input)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_video_multi_modal_embedding.py b/tests/pipelines/test_video_multi_modal_embedding.py
index b33ba56c..f4aa4d24 100644
--- a/tests/pipelines/test_video_multi_modal_embedding.py
+++ b/tests/pipelines/test_video_multi_modal_embedding.py
@@ -4,15 +4,19 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.logger import get_logger
 from modelscope.utils.test_utils import test_level
 
 logger = get_logger()
 
 
-class VideoMultiModalEmbeddingTest(unittest.TestCase):
+class VideoMultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.video_multi_modal_embedding
+        self.model_id = 'damo/multi_modal_clip_vtretrival_msrvtt_53'
 
-    model_id = 'damo/multi_modal_clip_vtretrival_msrvtt_53'
     video_path = 'data/test/videos/multi_modal_test_video_9770.mp4'
     caption = ('a person is connecting something to system', None, None)
     _input = {'video': video_path, 'text': caption}
@@ -37,6 +41,10 @@ class VideoMultiModalEmbeddingTest(unittest.TestCase):
         logger.info('video feature: {}'.format(
             output['video_embedding'][0][0][0]))
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_video_single_object_tracking.py b/tests/pipelines/test_video_single_object_tracking.py
index fc228cd8..7f3a9226 100644
--- a/tests/pipelines/test_video_single_object_tracking.py
+++ b/tests/pipelines/test_video_single_object_tracking.py
@@ -5,12 +5,14 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.cv.image_utils import show_video_tracking_result
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class SingleObjectTracking(unittest.TestCase):
+class SingleObjectTracking(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
+        self.task = Tasks.video_single_object_tracking
         self.model_id = 'damo/cv_vitb_video-single-object-tracking_ostrack'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@@ -33,6 +35,10 @@ class SingleObjectTracking(unittest.TestCase):
         result = video_single_object_tracking((video_path, init_bbox))
         print('result is : ', result[OutputKeys.BOXES])
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_video_summarization.py b/tests/pipelines/test_video_summarization.py
index 36724332..6dcc31e9 100644
--- a/tests/pipelines/test_video_summarization.py
+++ b/tests/pipelines/test_video_summarization.py
@@ -3,29 +3,39 @@ import unittest
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import show_video_summarization_result
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class VideoSummarizationTest(unittest.TestCase):
+class VideoSummarizationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.video_summarization
+        self.model_id = 'damo/cv_googlenet_pgl-video-summarization'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub(self):
-
+        video_path = 'data/test/videos/video_category_test_video.mp4'
         summarization_pipeline = pipeline(
-            Tasks.video_summarization,
-            model='damo/cv_googlenet_pgl-video-summarization')
-        result = summarization_pipeline(
-            'data/test/videos/video_category_test_video.mp4')
+            Tasks.video_summarization, model=self.model_id)
+        result = summarization_pipeline(video_path)
 
-        print(f'video summarization output: {result}.')
+        print(f'video summarization output: \n{result}.')
+        show_video_summarization_result(video_path, result,
+                                        './summarization_result.avi')
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
+        video_path = 'data/test/videos/video_category_test_video.mp4'
         summarization_pipeline = pipeline(Tasks.video_summarization)
-        result = summarization_pipeline(
-            'data/test/videos/video_category_test_video.mp4')
+        result = summarization_pipeline(video_path)
 
-        print(f'video summarization output: {result}.')
+        print(f'video summarization output:\n {result}.')
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
 
 
 if __name__ == '__main__':
diff --git a/tests/pipelines/test_virtual_try_on.py b/tests/pipelines/test_virtual_try_on.py
index 1979c9b8..5c18dcc4 100644
--- a/tests/pipelines/test_virtual_try_on.py
+++ b/tests/pipelines/test_virtual_try_on.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import unittest
 
 import cv2
@@ -6,11 +8,16 @@ from PIL import Image
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
 
 
-class VirtualTryonTest(unittest.TestCase):
-    model_id = 'damo/cv_daflow_virtual-try-on_base'
+class VirtualTryonTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.virtual_try_on
+        self.model_id = 'damo/cv_daflow_virtual-try-on_base'
+
     masked_model = Image.open('data/test/images/virtual_tryon_model.jpg')
     pose = Image.open('data/test/images/virtual_tryon_pose.jpg')
     cloth = Image.open('data/test/images/virtual_tryon_cloth.jpg')
@@ -29,6 +36,10 @@ class VirtualTryonTest(unittest.TestCase):
         img = pipeline_virtual_tryon(self.input_imgs)[OutputKeys.OUTPUT_IMG]
         cv2.imwrite('demo.jpg', img[:, :, ::-1])
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_word_segmentation.py b/tests/pipelines/test_word_segmentation.py
index c332d987..cd01b98f 100644
--- a/tests/pipelines/test_word_segmentation.py
+++ b/tests/pipelines/test_word_segmentation.py
@@ -1,5 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import shutil
 import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
@@ -9,13 +8,20 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import WordSegmentationPipeline
 from modelscope.preprocessors import TokenClassificationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.regress_test_utils import MsRegressTool
 from modelscope.utils.test_utils import test_level
 
 
-class WordSegmentationTest(unittest.TestCase):
-    model_id = 'damo/nlp_structbert_word-segmentation_chinese-base'
+class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.word_segmentation
+        self.model_id = 'damo/nlp_structbert_word-segmentation_chinese-base'
+
     sentence = '今天天气不错，适合出去游玩'
     sentence_eng = 'I am a program.'
+    regress_tool = MsRegressTool(baseline=False)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_by_direct_model_download(self):
@@ -27,7 +33,6 @@ class WordSegmentationTest(unittest.TestCase):
             Tasks.word_segmentation, model=model, preprocessor=tokenizer)
         print(f'sentence: {self.sentence}\n'
               f'pipeline1:{pipeline1(input=self.sentence)}')
-        print()
         print(f'pipeline2: {pipeline2(input=self.sentence)}')
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@@ -42,14 +47,22 @@ class WordSegmentationTest(unittest.TestCase):
     def test_run_with_model_name(self):
         pipeline_ins = pipeline(
             task=Tasks.word_segmentation, model=self.model_id)
-        print(pipeline_ins(input=self.sentence))
-        print(pipeline_ins(input=self.sentence_eng))
+        with self.regress_tool.monitor_module_single_forward(
+                pipeline_ins.model, 'sbert_ws_zh'):
+            print(pipeline_ins(input=self.sentence))
+        with self.regress_tool.monitor_module_single_forward(
+                pipeline_ins.model, 'sbert_ws_en'):
+            print(pipeline_ins(input=self.sentence_eng))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.word_segmentation)
         print(pipeline_ins(input=self.sentence))
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_zero_shot_classification.py b/tests/pipelines/test_zero_shot_classification.py
index 7620a0ed..da1854c9 100644
--- a/tests/pipelines/test_zero_shot_classification.py
+++ b/tests/pipelines/test_zero_shot_classification.py
@@ -8,14 +8,21 @@ from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import ZeroShotClassificationPipeline
 from modelscope.preprocessors import ZeroShotClassificationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.regress_test_utils import MsRegressTool
 from modelscope.utils.test_utils import test_level
 
 
-class ZeroShotClassificationTest(unittest.TestCase):
-    model_id = 'damo/nlp_structbert_zero-shot-classification_chinese-base'
+class ZeroShotClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.zero_shot_classification
+        self.model_id = 'damo/nlp_structbert_zero-shot-classification_chinese-base'
+
     sentence = '全新突破 解放军运20版空中加油机曝光'
     labels = ['文化', '体育', '娱乐', '财经', '家居', '汽车', '教育', '科技', '军事']
     template = '这篇文章的标题是{}'
+    regress_tool = MsRegressTool(baseline=False)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_direct_file_download(self):
@@ -33,7 +40,6 @@ class ZeroShotClassificationTest(unittest.TestCase):
             f'sentence: {self.sentence}\n'
             f'pipeline1:{pipeline1(input=self.sentence,candidate_labels=self.labels)}'
         )
-        print()
         print(
             f'sentence: {self.sentence}\n'
             f'pipeline2: {pipeline2(self.sentence,candidate_labels=self.labels,hypothesis_template=self.template)}'
@@ -53,13 +59,21 @@ class ZeroShotClassificationTest(unittest.TestCase):
     def test_run_with_model_name(self):
         pipeline_ins = pipeline(
             task=Tasks.zero_shot_classification, model=self.model_id)
-        print(pipeline_ins(input=self.sentence, candidate_labels=self.labels))
+        with self.regress_tool.monitor_module_single_forward(
+                pipeline_ins.model, 'sbert_zero_shot'):
+            print(
+                pipeline_ins(
+                    input=self.sentence, candidate_labels=self.labels))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.zero_shot_classification)
         print(pipeline_ins(input=self.sentence, candidate_labels=self.labels))
 
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/preprocessors/test_nlp.py b/tests/preprocessors/test_nlp.py
index 4271e201..f9f4d93f 100644
--- a/tests/preprocessors/test_nlp.py
+++ b/tests/preprocessors/test_nlp.py
@@ -32,6 +32,82 @@ class NLPPreprocessorTest(unittest.TestCase):
             output['attention_mask'],
             [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
 
+    def test_token_classification_tokenize(self):
+        """Check the token-cls-tokenizer output for a bert and a roberta dir.
+
+        Both sub-tests feed the same sentence and label map and compare
+        input_ids, attention_mask, label_mask and offset_mapping.
+        """
+        # shared by both sub-tests; each one receives its own copy of the map
+        label2id = {'O': 0, 'B': 1, 'I': 2}
+        sentence = 'Do not meddle in the affairs of wizards, ' \
+                   'for they are subtle and quick to anger.'
+
+        with self.subTest(tokenizer_type='bert'):
+            cfg = dict(
+                type='token-cls-tokenizer',
+                model_dir='bert-base-cased',
+                label2id=dict(label2id),
+            )
+            preprocessor = build_preprocessor(cfg, Fields.nlp)
+            output = preprocessor(sentence)
+            self.assertTrue(InputFields.text in output)
+            self.assertEqual(output['input_ids'].tolist()[0], [
+                101, 2091, 1136, 1143, 13002, 1107, 1103, 5707, 1104, 16678,
+                1116, 117, 1111, 1152, 1132, 11515, 1105, 3613, 1106, 4470,
+                119, 102
+            ])
+            self.assertEqual(output['attention_mask'].tolist()[0], [
+                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                1
+            ])
+            self.assertEqual(output['label_mask'].tolist()[0], [
+                False, True, True, True, False, True, True, True, True, True,
+                False, True, True, True, True, True, True, True, True, True,
+                True, False
+            ])
+            self.assertEqual(output['offset_mapping'], [(0, 2), (3, 6),
+                                                        (7, 13), (14, 16),
+                                                        (17, 20), (21, 28),
+                                                        (29, 31), (32, 39),
+                                                        (39, 40), (41, 44),
+                                                        (45, 49), (50, 53),
+                                                        (54, 60), (61, 64),
+                                                        (65, 70), (71, 73),
+                                                        (74, 79), (79, 80)])
+
+        with self.subTest(tokenizer_type='roberta'):
+            cfg = dict(
+                type='token-cls-tokenizer',
+                model_dir='xlm-roberta-base',
+                label2id=dict(label2id),
+            )
+            preprocessor = build_preprocessor(cfg, Fields.nlp)
+            output = preprocessor(sentence)
+            self.assertTrue(InputFields.text in output)
+            self.assertEqual(output['input_ids'].tolist()[0], [
+                0, 984, 959, 128, 19298, 23, 70, 103086, 7, 111, 6, 44239,
+                99397, 4, 100, 1836, 621, 1614, 17991, 136, 63773, 47, 348, 56,
+                5, 2
+            ])
+            self.assertEqual(output['attention_mask'].tolist()[0], [
+                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                1, 1, 1, 1, 1
+            ])
+            self.assertEqual(output['label_mask'].tolist()[0], [
+                False, True, True, True, False, True, True, True, False, True,
+                True, False, False, False, True, True, True, True, False, True,
+                True, True, True, False, False, False
+            ])
+            self.assertEqual(output['offset_mapping'], [(0, 2), (3, 6),
+                                                        (7, 13), (14, 16),
+                                                        (17, 20), (21, 28),
+                                                        (29, 31), (32, 40),
+                                                        (41, 44), (45, 49),
+                                                        (50, 53), (54, 60),
+                                                        (61, 64), (65, 70),
+                                                        (71, 73), (74, 80)])
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/run.py b/tests/run.py
index 27af7fe5..b286ecb5 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -2,23 +2,334 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import argparse
+import datetime
+import multiprocessing
 import os
+import subprocess
 import sys
+import tempfile
 import unittest
 from fnmatch import fnmatch
+from multiprocessing.managers import BaseManager
+from pathlib import Path
+from turtle import shape
+from unittest import TestResult, TextTestResult
 
+import pandas
 # NOTICE: Tensorflow 1.15 seems not so compatible with pytorch.
 #         A segmentation fault may be raise by pytorch cpp library
 #         if 'import tensorflow' in front of 'import torch'.
 #         Puting a 'import torch' here can bypass this incompatibility.
 import torch
+import yaml
 
 from modelscope.utils.logger import get_logger
-from modelscope.utils.test_utils import set_test_level, test_level
+from modelscope.utils.model_tag import ModelTag, commit_model_ut_result
+from modelscope.utils.test_utils import (get_case_model_info, set_test_level,
+                                         test_level)
 
 logger = get_logger()
 
 
+def test_cases_result_to_df(result_list):
+    """Build a DataFrame of case results ordered by ascending start time."""
+    # one column per field of the row tuples held in result_list
+    table_header = [
+        'Name', 'Result', 'Info', 'Start time', 'Stop time',
+        'Time cost(seconds)'
+    ]
+    unsorted_df = pandas.DataFrame(result_list, columns=table_header)
+    return unsorted_df.sort_values(by=['Start time'], ascending=True)
+
+
+def statistics_test_result(df):
+    """Print the run summary, commit per-model results, exit 1 on failure.
+
+    df is the result table; its 'Name' and 'Result' columns are read.
+    """
+    total_cases = df.shape[0]
+    # yapf: disable
+    success_cases = df.loc[df['Result'] == 'Success'].shape[0]
+    error_cases = df.loc[df['Result'] == 'Error'].shape[0]
+    failures_cases = df.loc[df['Result'] == 'Failures'].shape[0]
+    expected_failure_cases = df.loc[df['Result'] == 'ExpectedFailures'].shape[0]
+    unexpected_success_cases = df.loc[df['Result'] == 'UnexpectedSuccesses'].shape[0]
+    skipped_cases = df.loc[df['Result'] == 'Skipped'].shape[0]
+    # yapf: enable
+    failed = failures_cases + error_cases + unexpected_success_cases > 0
+    final_result = 'FAILED' if failed else 'SUCCESS'
+    # Adjacent literals keep source indentation out of the printed message.
+    result_msg = ('%s (Runs=%s,success=%s,failures=%s,errors=%s,'
+                  'skipped=%s,expected failures=%s,unexpected successes=%s)'
+                  % (final_result, total_cases, success_cases,
+                     failures_cases, error_cases, skipped_cases,
+                     expected_failure_cases, unexpected_success_cases))
+
+    for model_name, case_info in get_case_model_info().items():
+        cases = df.loc[df['Name'].str.contains('|'.join(list(case_info)))]
+        results = cases['Result']
+        result = None
+        if any(results == 'Error') or any(results == 'Failures') or any(
+                results == 'UnexpectedSuccesses'):
+            result = ModelTag.MODEL_FAIL
+        elif any(results == 'Success'):
+            result = ModelTag.MODEL_PASS
+        elif all(results == 'Skipped'):
+            result = ModelTag.MODEL_SKIP
+        else:
+            # report the offending column; result itself is still None here
+            print(f'invalid results for {model_name} \n{results}')
+        if result is not None:
+            commit_model_ut_result(model_name, result)
+    print('Testing result summary.')
+    print(result_msg)
+    if final_result == 'FAILED':
+        sys.exit(1)
+
+
+def gather_test_suites_in_files(test_dir, case_file_list, list_tests):
+    """Merge the tests discovered for each file pattern into one TestSuite.
+
+    When list_tests is truthy, what was discovered is printed as well.
+    """
+    merged_suite = unittest.TestSuite()
+    for file_pattern in case_file_list:
+        found = unittest.defaultTestLoader.discover(
+            start_dir=test_dir, pattern=file_pattern)
+        merged_suite.addTest(found)
+        if list_tests:
+            for item in (found if hasattr(found, '__iter__') else [found]):
+                print(item)
+    return merged_suite
+
+
+def gather_test_suites_files(test_dir, pattern):
+    """Return base names of the files under test_dir that match pattern."""
+    # the walk is recursive; files are kept in the order os.walk yields them
+    return [
+        file_name
+        for _, _, file_names in os.walk(test_dir)
+        for file_name in file_names if fnmatch(file_name, pattern)
+    ]
+
+
+def collect_test_results(case_results):
+    """Flatten a TimeCostTextTestResult into a list of row tuples.
+
+    Args:
+        case_results: result object exposing successes, errors, skipped,
+            expectedFailures, failures and unexpectedSuccesses.
+
+    Returns:
+        list of (name, result, info, start time, stop time, time cost).
+
+    Placeholders that unittest records for class/module level fixture errors
+    never pass through startTest/stopTest; they get str(case) as name and
+    None timings instead of raising AttributeError.
+    """
+
+    def to_row(case, status, info=''):
+        return (getattr(case, 'test_full_name', str(case)), status, info,
+                getattr(case, 'start_time', None),
+                getattr(case, 'stop_time', None),
+                getattr(case, 'time_cost', None))
+
+    result_list = [to_row(case, 'Success') for case in case_results.successes]
+    # these four collections hold (case, info) pairs
+    for status, pairs in (('Error', case_results.errors),
+                          ('Skipped', case_results.skipped),
+                          ('ExpectedFailures', case_results.expectedFailures),
+                          ('Failures', case_results.failures)):
+        result_list.extend(to_row(case, status, info) for case, info in pairs)
+    result_list.extend(
+        to_row(case, 'UnexpectedSuccesses')
+        for case in case_results.unexpectedSuccesses)
+    return result_list
+
+
+def run_command_with_popen(cmd):
+    """Run cmd, copying its merged stdout/stderr to sys.stdout per line."""
+    popen_kwargs = dict(
+        stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+        bufsize=1,
+        encoding='utf8')
+    with subprocess.Popen(cmd, **popen_kwargs) as sub_process:
+        for line in iter(sub_process.stdout.readline, ''):
+            sys.stdout.write(line)
+
+
+def save_test_result(df, args):
+    """Pickle df into args.result_dir under a millisecond-timestamp name."""
+    if args.result_dir is not None:
+        stamp = str(int(datetime.datetime.now().timestamp() * 1000))
+        os.umask(0)  # let the modes requested below apply unmasked
+        Path(args.result_dir).mkdir(mode=0o777, parents=True, exist_ok=True)
+        Path(args.result_dir, stamp).touch(mode=0o666, exist_ok=True)
+        df.to_pickle(os.path.join(args.result_dir, stamp))
+
+
+def run_command(cmd):
+    """Run cmd; log its stdout on success, stdout and stderr on failure."""
+    logger.info('Running command: %s' % ' '.join(cmd))
+    response = subprocess.run(
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    # a non-zero exit status is only logged, nothing is raised to the caller
+    if response.returncode == 0:
+        logger.info(response.stdout.decode('utf8'))
+    else:
+        logger.error('stdout: %s, stderr: %s' % (
+            response.stdout.decode('utf8'), response.stderr.decode('utf8')))
+
+
+def install_packages(pkgs):
+    """Install the given packages with one pip call on this interpreter."""
+    # 'sys.executable -m pip' is the pip of the interpreter running this file
+    pip_install = [sys.executable, '-m', 'pip', 'install']
+    pip_install.extend(pkgs)
+    run_command(pip_install)
+
+
+def install_requirements(requirements):
+    """pip-install each requirements/<name> file, one pip call per file."""
+    find_links = (
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html')
+    pip_install = [sys.executable, '-m', 'pip', 'install']
+    for req in requirements:
+        req_file = 'requirements/%s' % req
+        run_command(pip_install + ['-r', req_file, '-f', find_links])
+
+
+def run_case_in_env(env_name, env, test_suite_env_map, isolated_cases,
+                    result_dir):
+    """Prepare env, then run the suites mapped to env_name in subprocesses.
+
+    Args:
+        env_name: name of the env whose suites are run.
+        env: mapping that may hold 'requirements' and 'dependencies' lists.
+        test_suite_env_map: suite file name -> env name.
+        isolated_cases: suite files that each get a subprocess of their own;
+            the other suites of the env share a single subprocess.
+        result_dir: directory handed to the children through --result_dir.
+    """
+    # install requirements and deps
+    if 'requirements' in env:
+        install_requirements(env['requirements'])
+    if 'dependencies' in env:
+        install_packages(env['dependencies'])
+
+    # sys.executable (not whatever 'python' is on PATH) so the children use
+    # the interpreter the packages above were just installed for.
+    run_py = [sys.executable, 'tests/run.py', '--result_dir', result_dir]
+    for test_suite_file in isolated_cases:  # run case in subprocess
+        if test_suite_file in test_suite_env_map and test_suite_env_map[
+                test_suite_file] == env_name:
+            run_command_with_popen(run_py + ['--pattern', test_suite_file])
+
+    # run remain cases in a process.
+    remain_suite_files = [
+        suite for suite, suite_env in test_suite_env_map.items()
+        if suite not in isolated_cases and suite_env == env_name
+    ]
+    if not remain_suite_files:
+        return
+    run_command_with_popen(run_py + ['--suites'] + remain_suite_files)
+
+
+def run_in_subprocess(args):
+    """Run the suites matching args.pattern in subprocesses, env by env.
+
+    Works without a run config too: a missing config file, or an env that
+    has no entry in it, is treated as an empty env configuration.
+    """
+    # isolated suites get a subprocess each, the others share one per env
+    test_suite_files = gather_test_suites_files(
+        os.path.abspath(args.test_dir), args.pattern)
+    run_config = {}
+    isolated_cases = []
+    # put all the case in default env.
+    test_suite_env_map = {suite: 'default' for suite in test_suite_files}
+
+    if args.run_config is not None and Path(args.run_config).exists():
+        with open(args.run_config) as f:
+            run_config = yaml.load(f, Loader=yaml.FullLoader) or {}
+        isolated_cases = run_config.get('isolated') or []
+    envs = run_config.get('envs') or {}
+    for env, env_config in envs.items():
+        if env == 'default':
+            continue
+        for test_suite in (env_config or {}).get('tests') or []:
+            if test_suite in test_suite_env_map:
+                test_suite_env_map[test_suite] = env
+
+    if args.subprocess:  # run all case in subprocess
+        isolated_cases = test_suite_files
+
+    with tempfile.TemporaryDirectory() as temp_result_dir:
+        for env in set(test_suite_env_map.values()):
+            run_case_in_env(env, envs.get(env) or {}, test_suite_env_map,
+                            isolated_cases, temp_result_dir)
+
+        results = Path(temp_result_dir).iterdir()
+        result_dfs = [pandas.read_pickle(p) for p in results if p.is_file()]
+        if not result_dfs:  # pandas.concat raises on an empty list
+            logger.error('No test result was produced, nothing to report.')
+            sys.exit(1)
+        result_pd = pandas.concat(result_dfs)  # merge result of all suites
+        print_table_result(result_pd)
+        print_abnormal_case_info(result_pd)
+        statistics_test_result(result_pd)
+
+
+def get_object_full_name(obj):
+    """Return obj's class as 'module.QualName', or 'QualName' for builtins."""
+    owner = obj.__class__
+    if owner.__module__ != 'builtins':
+        return owner.__module__ + '.' + owner.__qualname__
+    return owner.__qualname__
+
+
+class TimeCostTextTestResult(TextTestResult):
+    """Text result that times each case and remembers the successful ones."""
+
+    def __init__(self, stream, descriptions, verbosity):
+        self.successes = []
+        super(TimeCostTextTestResult,
+              self).__init__(stream, descriptions, verbosity)
+
+    def startTest(self, test):
+        test.start_time = datetime.datetime.now()
+        test.test_full_name = get_object_full_name(
+            test) + '.' + test._testMethodName
+        self.stream.writeln('Test case:  %s start at: %s' %
+                            (test.test_full_name, test.start_time))
+        return super(TimeCostTextTestResult, self).startTest(test)
+
+    def stopTest(self, test):
+        # a single call to the base stopTest is enough
+        super(TimeCostTextTestResult, self).stopTest(test)
+        test.stop_time = datetime.datetime.now()
+        test.time_cost = (test.stop_time - test.start_time).total_seconds()
+        self.stream.writeln(
+            'Test case: %s stop at: %s, cost time: %s(seconds)' %
+            (test.test_full_name, test.stop_time, test.time_cost))
+
+    def addSuccess(self, test):
+        self.successes.append(test)
+        # NOTE(review): skips TextTestResult.addSuccess, presumably on purpose
+        super(TextTestResult, self).addSuccess(test)
+
+
+class TimeCostTextTestRunner(unittest.runner.TextTestRunner):
+    """TextTestRunner that reports through TimeCostTextTestResult."""
+    resultclass = TimeCostTextTestResult
+
+    def run(self, test):
+        return super(TimeCostTextTestRunner, self).run(test)
+
+    def _makeResult(self):
+        return super(TimeCostTextTestRunner, self)._makeResult()
+
+
 def gather_test_cases(test_dir, pattern, list_tests):
     case_list = []
     for dirpath, dirnames, filenames in os.walk(test_dir):
@@ -42,16 +353,44 @@ def gather_test_cases(test_dir, pattern, list_tests):
     return test_suite
 
 
+def print_abnormal_case_info(df):
+    """Print name, result and message of every Error/Failures row of df."""
+    for _, row in df[df['Result'].isin(['Error', 'Failures'])].iterrows():
+        print('Case %s run result: %s, msg:\n%s' %
+              (row['Name'], row['Result'], row['Info']))
+
+
+def print_table_result(df):
+    """Print all non-skipped rows, minus 'Info', as a left-aligned table."""
+    df = df.loc[df['Result'] != 'Skipped'].drop('Info', axis=1)
+    formatters = {
+        col: '{{:<{}s}}'.format(df[col].str.len().max()).format
+        for col in ('Name', 'Result')
+    }
+    with pandas.option_context('display.max_rows', None, 'display.max_columns',
+                               None, 'display.width', None):
+        print(df.to_string(justify='left', formatters=formatters, index=False))
+
+
 def main(args):
-    runner = unittest.TextTestRunner()
-    test_suite = gather_test_cases(
-        os.path.abspath(args.test_dir), args.pattern, args.list_tests)
+    runner = TimeCostTextTestRunner()
+    if args.suites is not None and len(args.suites) > 0:
+        logger.info('Running: %s' % ' '.join(args.suites))
+        test_suite = gather_test_suites_in_files(args.test_dir, args.suites,
+                                                 args.list_tests)
+    else:
+        test_suite = gather_test_cases(
+            os.path.abspath(args.test_dir), args.pattern, args.list_tests)
     if not args.list_tests:
         result = runner.run(test_suite)
-        if len(result.failures) > 0:
-            sys.exit(len(result.failures))
-        if len(result.errors) > 0:
-            sys.exit(len(result.errors))
+        result = collect_test_results(result)
+        df = test_cases_result_to_df(result)
+        if args.result_dir is not None:
+            save_test_result(df, args)
+        else:
+            print_table_result(df)
+            print_abnormal_case_info(df)
+            statistics_test_result(df)
 
 
 if __name__ == '__main__':
@@ -66,11 +405,31 @@ if __name__ == '__main__':
         '--level', default=0, type=int, help='2 -- all, 1 -- p1, 0 -- p0')
     parser.add_argument(
         '--disable_profile', action='store_true', help='disable profiling')
+    parser.add_argument(
+        '--run_config',
+        default=None,
+        help='specified case run config file(yaml file)')
+    parser.add_argument(
+        '--subprocess',
+        action='store_true',
+        help='run all test suite in subprocess')
+    parser.add_argument(
+        '--result_dir',
+        default=None,
+        help='Save result to directory, internal use only')
+    parser.add_argument(
+        '--suites',
+        nargs='*',
+        help='Run specified test suites(test suite files list split by space)')
     args = parser.parse_args()
     set_test_level(args.level)
+    os.environ['REGRESSION_BASELINE'] = '1'
     logger.info(f'TEST LEVEL: {test_level()}')
     if not args.disable_profile:
         from utils import profiler
         logger.info('enable profile ...')
         profiler.enable()
-    main(args)
+    if args.run_config is not None or args.subprocess:
+        run_in_subprocess(args)
+    else:
+        main(args)
diff --git a/tests/run_config.yaml b/tests/run_config.yaml
new file mode 100644
index 00000000..4c571b7f
--- /dev/null
+++ b/tests/run_config.yaml
@@ -0,0 +1,32 @@
+# isolate cases in env, we can install different dependencies in each env.
+isolated:  # test cases that may require an excessive amount of GPU memory, which will be executed in a dedicated process.
+  - test_text_to_speech.py
+  - test_multi_modal_embedding.py
+  - test_ofa_tasks.py
+  - test_video_summarization.py
+  - test_dialog_modeling.py
+  - test_csanmt_translation.py
+  - test_image_super_resolution.py
+  - test_easycv_trainer.py
+  - test_segformer.py
+  - test_segmentation_pipeline.py
+
+envs:
+  default: # default env (pytorch); cases not listed in any other env run here.
+    dependencies: # required packages, pip installed before the test cases run.
+      - numpy>=1.20
+  tensorflow1x: # cases executed with the tensorflow 1.x framework.
+    requirements: # requirements files run before test case run.
+      - tensorflow1x.txt
+    dependencies: # required packages, pip installed before the test cases run.
+      - numpy==1.18.5
+    tests:
+      - test_text_to_speech.py
+      - test_csanmt_translation.py
+      - test_translation_trainer.py
+      - test_ocr_detection.py
+      - test_automatic_speech_recognition.py
+      - test_image_matting.py
+      - test_person_image_cartoon.py
+      - test_skin_retouching.py
+      - test_image_style_transfer.py
diff --git a/tests/trainers/audio/__init__.py b/tests/trainers/audio/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/trainers/audio/test_ans_trainer.py b/tests/trainers/audio/test_ans_trainer.py
new file mode 100644
index 00000000..c0860529
--- /dev/null
+++ b/tests/trainers/audio/test_ans_trainer.py
@@ -0,0 +1,66 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+import shutil
+import tempfile
+import unittest
+from functools import partial
+
+from modelscope.metainfo import Trainers
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers import build_trainer
+from modelscope.utils.audio.audio_utils import to_segment
+from modelscope.utils.hub import read_config
+from modelscope.utils.test_utils import test_level
+
+SEGMENT_LENGTH_TEST = 640
+
+
+class TestANSTrainer(unittest.TestCase):
+    REVISION = 'beta'
+
+    def setUp(self):
+        # mkdtemp() keeps the directory until tearDown removes it, whereas a
+        # discarded TemporaryDirectory deletes its directory when finalized.
+        self.tmp_dir = tempfile.mkdtemp()
+
+        self.model_id = 'damo/speech_frcrn_ans_cirm_16k'
+        cfg = read_config(self.model_id, revision=self.REVISION)
+        cfg.train.max_epochs = 2
+        cfg.train.dataloader.batch_size_per_gpu = 1
+        self.cfg_file = os.path.join(self.tmp_dir, 'train_config.json')
+        cfg.dump(self.cfg_file)
+
+        hf_ds = MsDataset.load(
+            'ICASSP_2021_DNS_Challenge', split='test').to_hf_dataset()
+        mapped_ds = hf_ds.map(
+            partial(to_segment, segment_length=SEGMENT_LENGTH_TEST),
+            remove_columns=['duration'],
+            batched=True,
+            batch_size=2)
+        self.dataset = MsDataset.from_hf_dataset(mapped_ds)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir, ignore_errors=True)
+        super().tearDown()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer(self):
+        kwargs = dict(
+            model=self.model_id,
+            model_revision=self.REVISION,
+            train_dataset=self.dataset,
+            eval_dataset=self.dataset,
+            max_epochs=2,
+            train_iters_per_epoch=2,
+            val_iters_per_epoch=1,
+            cfg_file=self.cfg_file,
+            work_dir=self.tmp_dir)
+
+        trainer = build_trainer(
+            Trainers.speech_frcrn_ans_cirm_16k, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(2):
+            self.assertIn(f'epoch_{i + 1}.pth', results_files)
diff --git a/tests/trainers/easycv/__init__.py b/tests/trainers/easycv/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/trainers/easycv/test_easycv_trainer.py b/tests/trainers/easycv/test_easycv_trainer.py
new file mode 100644
index 00000000..4bd63c55
--- /dev/null
+++ b/tests/trainers/easycv/test_easycv_trainer.py
@@ -0,0 +1,237 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
+import os
+import shutil
+import tempfile
+import unittest
+
+import json
+import torch
+
+from modelscope.metainfo import Models, Pipelines, Trainers
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers import build_trainer
+from modelscope.utils.config import Config
+from modelscope.utils.constant import LogKeys, ModeKeys, Tasks
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import DistributedTestCase, test_level
+from modelscope.utils.torch_utils import is_master
+
+
+def train_func(work_dir, dist=False, log_interval=3, imgs_per_gpu=4):
+    import easycv
+    config_path = os.path.join(
+        os.path.dirname(easycv.__file__),
+        'configs/detection/yolox/yolox_s_8xb16_300e_coco.py')
+
+    cfg = Config.from_file(config_path)
+
+    cfg.log_config.update(
+        dict(hooks=[
+            dict(type='TextLoggerHook'),
+            dict(type='TensorboardLoggerHook')
+        ]))  # not support TensorboardLoggerHookV2
+
+    ms_cfg_file = os.path.join(work_dir, 'ms_yolox_s_8xb16_300e_coco.json')
+    from easycv.utils.ms_utils import to_ms_config
+
+    if is_master():
+        to_ms_config(
+            cfg,
+            dump=True,
+            task=Tasks.image_object_detection,
+            ms_model_name=Models.yolox,
+            pipeline_name=Pipelines.easycv_detection,
+            save_path=ms_cfg_file)
+
+    trainer_name = Trainers.easycv
+    train_dataset = MsDataset.load(
+        dataset_name='small_coco_for_test', namespace='EasyCV', split='train')
+    eval_dataset = MsDataset.load(
+        dataset_name='small_coco_for_test',
+        namespace='EasyCV',
+        split='validation')
+
+    cfg_options = {
+        'train.max_epochs':
+        2,
+        'train.dataloader.batch_size_per_gpu':
+        imgs_per_gpu,
+        'evaluation.dataloader.batch_size_per_gpu':
+        2,
+        'train.hooks': [
+            {
+                'type': 'CheckpointHook',
+                'interval': 1
+            },
+            {
+                'type': 'EvaluationHook',
+                'interval': 1
+            },
+            {
+                'type': 'TextLoggerHook',
+                'interval': log_interval
+            },
+        ]
+    }
+    kwargs = dict(
+        cfg_file=ms_cfg_file,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        work_dir=work_dir,
+        cfg_options=cfg_options,
+        launcher='pytorch' if dist else None)
+
+    trainer = build_trainer(trainer_name, kwargs)
+    trainer.train()
+
+
+@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest')
+class EasyCVTrainerTestSingleGpu(unittest.TestCase):
+
+    def setUp(self):
+        self.logger = get_logger()
+        self.logger.info(('Testing %s.%s' %
+                          (type(self).__name__, self._testMethodName)))
+        # mkdtemp() keeps the directory until tearDown removes it, whereas a
+        # discarded TemporaryDirectory deletes its directory when finalized.
+        self.tmp_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        super().tearDown()
+        shutil.rmtree(self.tmp_dir, ignore_errors=True)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_single_gpu(self):
+        train_func(self.tmp_dir)
+
+        results_files = os.listdir(self.tmp_dir)
+        json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json'))
+        self.assertEqual(len(json_files), 1)
+
+        with open(json_files[0], 'r') as f:
+            lines = [i.strip() for i in f.readlines()]
+
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.TRAIN,
+                LogKeys.EPOCH: 1,
+                LogKeys.ITER: 3,
+                LogKeys.LR: 0.00013
+            }, json.loads(lines[0]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.EVAL,
+                LogKeys.EPOCH: 1,
+                LogKeys.ITER: 10
+            }, json.loads(lines[1]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.TRAIN,
+                LogKeys.EPOCH: 2,
+                LogKeys.ITER: 3,
+                LogKeys.LR: 0.00157
+            }, json.loads(lines[2]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.EVAL,
+                LogKeys.EPOCH: 2,
+                LogKeys.ITER: 10
+            }, json.loads(lines[3]))
+        self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
+        for i in [0, 2]:
+            self.assertIn(LogKeys.DATA_LOAD_TIME, lines[i])
+            self.assertIn(LogKeys.ITER_TIME, lines[i])
+            self.assertIn(LogKeys.MEMORY, lines[i])
+            self.assertIn('total_loss', lines[i])
+        for i in [1, 3]:
+            self.assertIn(
+                'CocoDetectionEvaluator_DetectionBoxes_Precision/mAP',
+                lines[i])
+            self.assertIn('DetectionBoxes_Precision/mAP', lines[i])
+            self.assertIn('DetectionBoxes_Precision/mAP@.50IOU', lines[i])
+            self.assertIn('DetectionBoxes_Precision/mAP@.75IOU', lines[i])
+            self.assertIn('DetectionBoxes_Precision/mAP (small)', lines[i])
+
+
+@unittest.skipIf(not torch.cuda.is_available()
+                 or torch.cuda.device_count() <= 1, 'distributed unittest')
+class EasyCVTrainerTestMultiGpus(DistributedTestCase):
+
+    def setUp(self):
+        self.logger = get_logger()
+        self.logger.info(('Testing %s.%s' %
+                          (type(self).__name__, self._testMethodName)))
+        # mkdtemp() keeps the directory until tearDown removes it, whereas a
+        # discarded TemporaryDirectory deletes its directory when finalized.
+        self.tmp_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        super().tearDown()
+        shutil.rmtree(self.tmp_dir, ignore_errors=True)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_multi_gpus(self):
+        self.start(
+            train_func,
+            num_gpus=2,
+            work_dir=self.tmp_dir,
+            dist=True,
+            log_interval=2,
+            imgs_per_gpu=5)
+
+        results_files = os.listdir(self.tmp_dir)
+        json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json'))
+        self.assertEqual(len(json_files), 1)
+
+        with open(json_files[0], 'r') as f:
+            lines = [i.strip() for i in f.readlines()]
+
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.TRAIN,
+                LogKeys.EPOCH: 1,
+                LogKeys.ITER: 2,
+                LogKeys.LR: 0.0002
+            }, json.loads(lines[0]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.EVAL,
+                LogKeys.EPOCH: 1,
+                LogKeys.ITER: 5
+            }, json.loads(lines[1]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.TRAIN,
+                LogKeys.EPOCH: 2,
+                LogKeys.ITER: 2,
+                LogKeys.LR: 0.0018
+            }, json.loads(lines[2]))
+        self.assertDictContainsSubset(
+            {
+                LogKeys.MODE: ModeKeys.EVAL,
+                LogKeys.EPOCH: 2,
+                LogKeys.ITER: 5
+            }, json.loads(lines[3]))
+
+        self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
+
+        for i in [0, 2]:
+            self.assertIn(LogKeys.DATA_LOAD_TIME, lines[i])
+            self.assertIn(LogKeys.ITER_TIME, lines[i])
+            self.assertIn(LogKeys.MEMORY, lines[i])
+            self.assertIn('total_loss', lines[i])
+        for i in [1, 3]:
+            self.assertIn(
+                'CocoDetectionEvaluator_DetectionBoxes_Precision/mAP',
+                lines[i])
+            self.assertIn('DetectionBoxes_Precision/mAP', lines[i])
+            self.assertIn('DetectionBoxes_Precision/mAP@.50IOU', lines[i])
+            self.assertIn('DetectionBoxes_Precision/mAP@.75IOU', lines[i])
+            self.assertIn('DetectionBoxes_Precision/mAP (small)', lines[i])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/easycv/test_segformer.py b/tests/trainers/easycv/test_segformer.py
new file mode 100644
index 00000000..90a66635
--- /dev/null
+++ b/tests/trainers/easycv/test_segformer.py
@@ -0,0 +1,72 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
+import os
+import shutil
+import tempfile
+import unittest
+
+import torch
+
+from modelscope.metainfo import Trainers
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers import build_trainer
+from modelscope.utils.constant import LogKeys, Tasks
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+
+
+@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest')
+class EasyCVTrainerTestSegformer(unittest.TestCase):
+
+    def setUp(self):
+        self.logger = get_logger()
+        self.logger.info(('Testing %s.%s' %
+                          (type(self).__name__, self._testMethodName)))
+        # mkdtemp() keeps the directory until tearDown removes it, whereas a
+        # discarded TemporaryDirectory deletes its directory when finalized.
+        self.tmp_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        super().tearDown()
+        shutil.rmtree(self.tmp_dir, ignore_errors=True)
+
+    def _train(self):
+
+        cfg_options = {
+            'train.max_epochs': 2,
+            'model.decode_head.norm_cfg.type': 'BN'
+        }
+
+        trainer_name = Trainers.easycv
+        train_dataset = MsDataset.load(
+            dataset_name='small_coco_stuff164k',
+            namespace='EasyCV',
+            split='train')
+        eval_dataset = MsDataset.load(
+            dataset_name='small_coco_stuff164k',
+            namespace='EasyCV',
+            split='validation')
+        kwargs = dict(
+            model=
+            'damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k',
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            work_dir=self.tmp_dir,
+            cfg_options=cfg_options)
+
+        trainer = build_trainer(trainer_name, kwargs)
+        trainer.train()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_single_gpu_segformer(self):
+        self._train()
+
+        results_files = os.listdir(self.tmp_dir)
+        json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json'))
+        self.assertEqual(len(json_files), 1)
+        self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/hooks/logger/test_tensorboard_hook.py b/tests/trainers/hooks/logger/test_tensorboard_hook.py
index 54c31056..67b1aa63 100644
--- a/tests/trainers/hooks/logger/test_tensorboard_hook.py
+++ b/tests/trainers/hooks/logger/test_tensorboard_hook.py
@@ -11,6 +11,7 @@ import torch
 from torch import nn
 
 from modelscope.metainfo import Trainers
+from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, ModelFile
 from modelscope.utils.test_utils import create_dummy_test_dataset
@@ -19,7 +20,7 @@ dummy_dataset = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20)
 
 
-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):
 
     def __init__(self):
         super().__init__()
diff --git a/tests/trainers/hooks/test_checkpoint_hook.py b/tests/trainers/hooks/test_checkpoint_hook.py
index 1c81d057..e7f2d33c 100644
--- a/tests/trainers/hooks/test_checkpoint_hook.py
+++ b/tests/trainers/hooks/test_checkpoint_hook.py
@@ -11,11 +11,14 @@ from torch import nn
 
 from modelscope.metainfo import Trainers
 from modelscope.metrics.builder import METRICS, MetricKeys
+from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, ModelFile
 from modelscope.utils.registry import default_group
 from modelscope.utils.test_utils import create_dummy_test_dataset
 
+SRC_DIR = os.path.dirname(__file__)
+
 
 def create_dummy_metric():
     _global_iter = 0
@@ -39,12 +42,13 @@ dummy_dataset = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20)
 
 
-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):
 
     def __init__(self):
         super().__init__()
         self.linear = nn.Linear(5, 4)
         self.bn = nn.BatchNorm1d(4)
+        self.model_dir = SRC_DIR
 
     def forward(self, feat, labels):
         x = self.linear(feat)
@@ -123,6 +127,14 @@ class CheckpointHookTest(unittest.TestCase):
         self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
 
+        output_files = os.listdir(
+            os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR))
+        self.assertIn(ModelFile.CONFIGURATION, output_files)
+        self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, output_files)
+        copy_src_files = os.listdir(SRC_DIR)
+        self.assertIn(copy_src_files[0], output_files)
+        self.assertIn(copy_src_files[-1], output_files)
+
 
 class BestCkptSaverHookTest(unittest.TestCase):
 
@@ -192,12 +204,17 @@ class BestCkptSaverHookTest(unittest.TestCase):
         trainer = build_trainer(trainer_name, kwargs)
         trainer.train()
         results_files = os.listdir(self.tmp_dir)
-        self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
-        self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
-        self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
         self.assertIn(f'best_{LogKeys.EPOCH}1_{MetricKeys.ACCURACY}0.1.pth',
                       results_files)
 
+        output_files = os.listdir(
+            os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR))
+        self.assertIn(ModelFile.CONFIGURATION, output_files)
+        self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, output_files)
+        copy_src_files = os.listdir(SRC_DIR)
+        self.assertIn(copy_src_files[0], output_files)
+        self.assertIn(copy_src_files[-1], output_files)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/trainers/hooks/test_evaluation_hook.py b/tests/trainers/hooks/test_evaluation_hook.py
index 1338bb2c..2c71e790 100644
--- a/tests/trainers/hooks/test_evaluation_hook.py
+++ b/tests/trainers/hooks/test_evaluation_hook.py
@@ -11,6 +11,7 @@ from torch import nn
 
 from modelscope.metainfo import Trainers
 from modelscope.metrics.builder import METRICS, MetricKeys
+from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import ModelFile
 from modelscope.utils.registry import default_group
@@ -34,7 +35,7 @@ dummy_dataset = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20)
 
 
-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):
 
     def __init__(self):
         super().__init__()
diff --git a/tests/trainers/hooks/test_lr_scheduler_hook.py b/tests/trainers/hooks/test_lr_scheduler_hook.py
index 86d53ecc..7a1ff220 100644
--- a/tests/trainers/hooks/test_lr_scheduler_hook.py
+++ b/tests/trainers/hooks/test_lr_scheduler_hook.py
@@ -13,6 +13,7 @@ from torch.optim.lr_scheduler import MultiStepLR
 
 from modelscope.metainfo import Trainers
 from modelscope.metrics.builder import METRICS, MetricKeys
+from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, ModelFile, TrainerStages
 from modelscope.utils.registry import default_group
@@ -40,7 +41,7 @@ def create_dummy_metric():
             return {MetricKeys.ACCURACY: self._fake_acc_by_epoch[_global_iter]}
 
 
-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):
 
     def __init__(self):
         super().__init__()
diff --git a/tests/trainers/hooks/test_optimizer_hook.py b/tests/trainers/hooks/test_optimizer_hook.py
index 25457c1c..84c783b5 100644
--- a/tests/trainers/hooks/test_optimizer_hook.py
+++ b/tests/trainers/hooks/test_optimizer_hook.py
@@ -12,6 +12,7 @@ from torch.optim import SGD
 from torch.optim.lr_scheduler import MultiStepLR
 
 from modelscope.metainfo import Trainers
+from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import ModelFile, TrainerStages
 from modelscope.utils.test_utils import create_dummy_test_dataset
@@ -20,7 +21,7 @@ dummy_dataset = create_dummy_test_dataset(
     np.random.random(size=(2, )), np.random.randint(0, 2, (1, )), 10)
 
 
-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):
 
     def __init__(self):
         super().__init__()
diff --git a/tests/trainers/hooks/test_timer_hook.py b/tests/trainers/hooks/test_timer_hook.py
index 614f7688..9fb79c77 100644
--- a/tests/trainers/hooks/test_timer_hook.py
+++ b/tests/trainers/hooks/test_timer_hook.py
@@ -12,6 +12,7 @@ from torch.optim import SGD
 from torch.optim.lr_scheduler import MultiStepLR
 
 from modelscope.metainfo import Trainers
+from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, ModelFile, TrainerStages
 from modelscope.utils.test_utils import create_dummy_test_dataset
@@ -20,7 +21,7 @@ dummy_dataset = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 10)
 
 
-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):
 
     def __init__(self):
         super().__init__()
@@ -83,8 +84,8 @@ class IterTimerHookTest(unittest.TestCase):
             trainer.train_dataset, **trainer.cfg.train.get('dataloader', {}))
         trainer.register_optimizers_hook()
         trainer.register_hook_from_cfg(trainer.cfg.train.hooks)
-        trainer.data_loader = train_dataloader
         trainer.train_dataloader = train_dataloader
+        trainer.data_loader = train_dataloader
         trainer.invoke_hook(TrainerStages.before_run)
         for i in range(trainer._epoch, trainer._max_epochs):
             trainer.invoke_hook(TrainerStages.before_train_epoch)
diff --git a/tests/trainers/test_dialog_intent_trainer.py b/tests/trainers/test_dialog_intent_trainer.py
new file mode 100644
index 00000000..207387ac
--- /dev/null
+++ b/tests/trainers/test_dialog_intent_trainer.py
@@ -0,0 +1,103 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+import shutil
+import tempfile
+import unittest
+
+import json
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.metainfo import Trainers
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers import build_trainer
+from modelscope.utils.config import Config
+from modelscope.utils.constant import DownloadMode, ModelFile, Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class TestDialogIntentTrainer(unittest.TestCase):
+
+    def setUp(self):
+        # mkdtemp() keeps the directory until tearDown removes it, whereas a
+        # discarded TemporaryDirectory deletes its directory when finalized.
+        self.save_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.save_dir)
+        super().tearDown()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer_with_model_and_args(self):
+        model_id = 'damo/nlp_space_pretrained-dialog-model'
+        data_banking = MsDataset.load('banking77')
+        self.data_dir = data_banking._hf_ds.config_kwargs['split_config'][
+            'train']
+        self.model_dir = snapshot_download(model_id)
+        self.debugging = True
+        kwargs = dict(
+            model_dir=self.model_dir,
+            cfg_name='intent_train_config.json',
+            cfg_modify_fn=self.cfg_modify_fn)
+        trainer = build_trainer(
+            name=Trainers.dialog_intent_trainer, default_args=kwargs)
+        trainer.train()
+
+    def cfg_modify_fn(self, cfg):
+        config = {
+            'num_intent': 77,
+            'BPETextField': {
+                # vocab_path is assigned on cfg below; keep it out of update().
+                'data_name': 'banking77',
+                'data_root': self.data_dir,
+                'understand': True,
+                'generation': False,
+                'max_len': 256
+            },
+            'Dataset': {
+                'data_dir': self.data_dir,
+                'with_contrastive': False,
+                'trigger_role': 'user',
+                'trigger_data': 'banking'
+            },
+            'Trainer': {
+                'can_norm': True,
+                'seed': 11,
+                'gpu': 1,
+                'save_dir': self.save_dir,
+                'batch_size_label': 128,
+                'batch_size_nolabel': 0,
+                'log_steps': 20
+            },
+            'Model': {
+                'init_checkpoint': self.model_dir,
+                'model': 'IntentUnifiedTransformer',
+                'example': False,
+                'num_intent': 77,
+                'with_rdrop': True,
+                'num_turn_embeddings': 21,
+                'dropout': 0.25,
+                'kl_ratio': 5.0,
+                'embed_dropout': 0.25,
+                'attn_dropout': 0.25,
+                'ff_dropout': 0.25,
+                'with_pool': False,
+                'warmup_steps': -1
+            }
+        }
+        cfg.BPETextField.vocab_path = os.path.join(self.model_dir,
+                                                   ModelFile.VOCAB_FILE)
+        cfg.num_intent = 77
+        cfg.Trainer.update(config['Trainer'])
+        cfg.BPETextField.update(config['BPETextField'])
+        cfg.Dataset.update(config['Dataset'])
+        cfg.Model.update(config['Model'])
+        if self.debugging:
+            cfg.Trainer.save_checkpoint = False
+            cfg.Trainer.num_epochs = 5
+            cfg.Trainer.batch_size_label = 64
+        return cfg
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/test_dialog_modeling_trainer.py b/tests/trainers/test_dialog_modeling_trainer.py
new file mode 100644
index 00000000..be03db30
--- /dev/null
+++ b/tests/trainers/test_dialog_modeling_trainer.py
@@ -0,0 +1,68 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import unittest
+
+import torch
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.metainfo import Preprocessors, Trainers
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers import build_trainer
+from modelscope.utils.constant import DownloadMode, ModelFile
+from modelscope.utils.test_utils import test_level
+
+
+class TestDialogModelingTrainer(unittest.TestCase):
+
+    model_id = 'damo/nlp_space_pretrained-dialog-model'
+    output_dir = './dialog_fintune_result'
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_trainer_with_model_and_args(self):
+        # download data set
+        data_multiwoz = MsDataset.load(
+            'MultiWoz2.0', download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)
+        data_dir = os.path.join(
+            data_multiwoz._hf_ds.config_kwargs['split_config']['train'],
+            'data')
+
+        # download model
+        model_dir = snapshot_download(self.model_id)
+
+        # dialog finetune config
+        def cfg_modify_fn(cfg):
+            config = {
+                'seed': 10,
+                'gpu': 4,
+                'use_data_distributed': False,
+                'valid_metric_name': '-loss',
+                'num_epochs': 60,
+                'save_dir': self.output_dir,
+                'token_loss': True,
+                'batch_size': 32,
+                'log_steps': 10,
+                'valid_steps': 0,
+                'save_checkpoint': True,
+                'save_summary': False,
+                'shuffle': True,
+                'sort_pool_size': 0
+            }
+
+            cfg.Trainer = config
+            cfg.use_gpu = torch.cuda.is_available() and config['gpu'] >= 1
+            return cfg
+
+        # trainer config
+        kwargs = dict(
+            model_dir=model_dir,
+            cfg_name='gen_train_config.json',
+            data_dir=data_dir,
+            cfg_modify_fn=cfg_modify_fn)
+
+        trainer = build_trainer(
+            name=Trainers.dialog_modeling_trainer, default_args=kwargs)
+        trainer.train()
+        checkpoint_path = os.path.join(self.output_dir,
+                                       ModelFile.TORCH_MODEL_BIN_FILE)
+        self.assertTrue(os.path.exists(checkpoint_path))
+        trainer.evaluate(checkpoint_path=checkpoint_path)
diff --git a/tests/trainers/test_finetune_mplug.py b/tests/trainers/test_finetune_mplug.py
new file mode 100644
index 00000000..72196fba
--- /dev/null
+++ b/tests/trainers/test_finetune_mplug.py
@@ -0,0 +1,163 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.metainfo import Trainers
+from modelscope.models.multi_modal import MPlugForAllTasks
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers import EpochBasedTrainer, build_trainer
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.test_utils import test_level
+
+
+class TestFinetuneMPlug(unittest.TestCase):
+
+    def setUp(self):
+        """Create a scratch work directory and build the train/test datasets."""
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        # mkdtemp() creates the directory itself; tearDown removes it.
+        self.tmp_dir = tempfile.mkdtemp()
+        from modelscope.utils.constant import DownloadMode
+        datadict = MsDataset.load(
+            'coco_captions_small_slice',
+            download_mode=DownloadMode.FORCE_REDOWNLOAD)
+        self.train_dataset = MsDataset(datadict['train'].to_hf_dataset().map(
+            lambda _: {
+                'question': 'what the picture describes?'
+            }).rename_column('image:FILE',
+                             'image').rename_column('answer:Value', 'answer'))
+        self.test_dataset = MsDataset(datadict['test'].to_hf_dataset().map(
+            lambda _: {
+                'question': 'what the picture describes?'
+            }).rename_column('image:FILE',
+                             'image').rename_column('answer:Value', 'answer'))
+
+        self.max_epochs = 2
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir)
+        super().tearDown()
+
+    def _cfg_modify_fn(self, cfg):
+        cfg.train.hooks = [{
+            'type': 'CheckpointHook',
+            'interval': self.max_epochs
+        }, {
+            'type': 'TextLoggerHook',
+            'interval': 1
+        }, {
+            'type': 'IterTimerHook'
+        }]
+        return cfg
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer_with_caption(self):
+        kwargs = dict(
+            model='damo/mplug_image-captioning_coco_base_en',
+            train_dataset=self.train_dataset,
+            eval_dataset=self.test_dataset,
+            max_epochs=self.max_epochs,
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=self._cfg_modify_fn)
+
+        trainer: EpochBasedTrainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+        trainer.train()
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_trainer_with_caption_with_model_and_args(self):
+        cache_path = snapshot_download(
+            'damo/mplug_image-captioning_coco_base_en')
+        model = MPlugForAllTasks.from_pretrained(cache_path)
+        kwargs = dict(
+            cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
+            model=model,
+            train_dataset=self.train_dataset,
+            eval_dataset=self.test_dataset,
+            max_epochs=self.max_epochs,
+            work_dir=self.tmp_dir)
+
+        trainer: EpochBasedTrainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(self.max_epochs):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer_with_vqa(self):
+        kwargs = dict(
+            model='damo/mplug_visual-question-answering_coco_large_en',
+            train_dataset=self.train_dataset,
+            eval_dataset=self.test_dataset,
+            max_epochs=self.max_epochs,
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=self._cfg_modify_fn)
+
+        trainer: EpochBasedTrainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+        trainer.train()
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_trainer_with_vqa_with_model_and_args(self):
+        cache_path = snapshot_download(
+            'damo/mplug_visual-question-answering_coco_large_en')
+        model = MPlugForAllTasks.from_pretrained(cache_path)
+        kwargs = dict(
+            cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
+            model=model,
+            train_dataset=self.train_dataset,
+            eval_dataset=self.test_dataset,
+            max_epochs=self.max_epochs,
+            work_dir=self.tmp_dir)
+
+        trainer: EpochBasedTrainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(self.max_epochs):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer_with_retrieval(self):
+        kwargs = dict(
+            model='damo/mplug_image-text-retrieval_flickr30k_large_en',
+            train_dataset=self.train_dataset,
+            eval_dataset=self.test_dataset,
+            max_epochs=self.max_epochs,
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=self._cfg_modify_fn)
+
+        trainer: EpochBasedTrainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+        trainer.train()
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_trainer_with_retrieval_with_model_and_args(self):
+        cache_path = snapshot_download(
+            'damo/mplug_image-text-retrieval_flickr30k_large_en')
+        model = MPlugForAllTasks.from_pretrained(cache_path)
+        kwargs = dict(
+            cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
+            model=model,
+            train_dataset=self.train_dataset,
+            eval_dataset=self.test_dataset,
+            max_epochs=self.max_epochs,
+            work_dir=self.tmp_dir)
+
+        trainer: EpochBasedTrainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(self.max_epochs):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/test_finetune_passage_ranking.py b/tests/trainers/test_finetune_passage_ranking.py
new file mode 100644
index 00000000..f833f981
--- /dev/null
+++ b/tests/trainers/test_finetune_passage_ranking.py
@@ -0,0 +1,133 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
+
+import torch
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+
+from modelscope.metainfo import Trainers
+from modelscope.models import Model
+from modelscope.msdatasets import MsDataset
+from modelscope.pipelines import pipeline
+from modelscope.trainers import build_trainer
+from modelscope.utils.constant import ModelFile, Tasks
+
+
+class TestFinetuneSequenceClassification(unittest.TestCase):
+    inputs = {
+        'source_sentence': ["how long it take to get a master's degree"],
+        'sentences_to_compare': [
+            "On average, students take about 18 to 24 months to complete a master's degree.",
+            'On the other hand, some students prefer to go at a slower pace and choose to take '
+            'several years to complete their studies.',
+            'It can take anywhere from two semesters'
+        ]
+    }
+
+    def setUp(self):
+        """Print the test name and create a scratch work directory."""
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        # mkdtemp() creates the directory itself; tearDown removes it.
+        self.tmp_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir)
+        super().tearDown()
+
+    def finetune(self,
+                 model_id,
+                 train_dataset,
+                 eval_dataset,
+                 name=Trainers.nlp_passage_ranking_trainer,
+                 cfg_modify_fn=None,
+                 **kwargs):
+        kwargs = dict(
+            model=model_id,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=cfg_modify_fn,
+            **kwargs)
+
+        os.environ['LOCAL_RANK'] = '0'
+        trainer = build_trainer(name=name, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+
+    def test_finetune_msmarco(self):
+
+        def cfg_modify_fn(cfg):
+            cfg.task = 'passage-ranking'
+            cfg['preprocessor'] = {'type': 'passage-ranking'}
+            cfg.train.optimizer.lr = 2e-5
+            cfg['dataset'] = {
+                'train': {
+                    'type': 'bert',
+                    'query_sequence': 'query',
+                    'pos_sequence': 'positive_passages',
+                    'neg_sequence': 'negative_passages',
+                    'passage_text_fileds': ['title', 'text'],
+                    'qid_field': 'query_id'
+                },
+                'val': {
+                    'type': 'bert',
+                    'query_sequence': 'query',
+                    'pos_sequence': 'positive_passages',
+                    'neg_sequence': 'negative_passages',
+                    'passage_text_fileds': ['title', 'text'],
+                    'qid_field': 'query_id'
+                },
+            }
+            cfg['train']['neg_samples'] = 4
+            cfg['evaluation']['dataloader']['batch_size_per_gpu'] = 30
+            cfg.train.max_epochs = 1
+            cfg.train.train_batch_size = 4
+            cfg.train.lr_scheduler = {
+                'type': 'LinearLR',
+                'start_factor': 1.0,
+                'end_factor': 0.0,
+                'options': {
+                    'by_epoch': False
+                }
+            }
+            cfg.train.hooks = [{
+                'type': 'CheckpointHook',
+                'interval': 1
+            }, {
+                'type': 'TextLoggerHook',
+                'interval': 1
+            }, {
+                'type': 'IterTimerHook'
+            }, {
+                'type': 'EvaluationHook',
+                'by_epoch': False,
+                'interval': 3000
+            }]
+            return cfg
+
+        # load dataset
+        ds = MsDataset.load('passage-ranking-demo', 'zyznull')
+        train_ds = ds['train'].to_hf_dataset()
+        dev_ds = ds['train'].to_hf_dataset()
+
+        self.finetune(
+            model_id='damo/nlp_corom_passage-ranking_english-base',
+            train_dataset=train_ds,
+            eval_dataset=dev_ds,
+            cfg_modify_fn=cfg_modify_fn)
+
+        output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR)
+        self.pipeline_passage_ranking(output_dir)
+
+    def pipeline_passage_ranking(self, model_dir):
+        model = Model.from_pretrained(model_dir)
+        pipeline_ins = pipeline(task=Tasks.passage_ranking, model=model)
+        print(pipeline_ins(input=self.inputs))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py
index 12c7da77..f2adfa22 100644
--- a/tests/trainers/test_finetune_sequence_classification.py
+++ b/tests/trainers/test_finetune_sequence_classification.py
@@ -4,61 +4,60 @@ import shutil
 import tempfile
 import unittest
 
-from modelscope.metainfo import Trainers
+from modelscope.metainfo import Preprocessors, Trainers
+from modelscope.models import Model
+from modelscope.msdatasets import MsDataset
+from modelscope.pipelines import pipeline
 from modelscope.trainers import build_trainer
+from modelscope.trainers.hooks import Hook
+from modelscope.trainers.nlp_trainer import (EpochBasedTrainer,
+                                             NlpEpochBasedTrainer)
+from modelscope.trainers.optimizer.child_tuning_adamw_optimizer import \
+    calculate_fisher
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.data_utils import to_device
+from modelscope.utils.regress_test_utils import MsRegressTool
+from modelscope.utils.test_utils import test_level
 
 
 class TestFinetuneSequenceClassification(unittest.TestCase):
+    epoch_num = 1
+
+    sentence1 = '今天气温比昨天高么？'
+    sentence2 = '今天湿度比昨天高么？'
 
     def setUp(self):
         print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
         self.tmp_dir = tempfile.TemporaryDirectory().name
         if not os.path.exists(self.tmp_dir):
             os.makedirs(self.tmp_dir)
+        self.regress_tool = MsRegressTool(baseline=False)
 
     def tearDown(self):
         shutil.rmtree(self.tmp_dir)
         super().tearDown()
 
-    def finetune(self,
-                 model_id,
-                 train_dataset,
-                 eval_dataset,
-                 name=Trainers.nlp_base_trainer,
-                 cfg_modify_fn=None,
-                 **kwargs):
-        kwargs = dict(
-            model=model_id,
-            train_dataset=train_dataset,
-            eval_dataset=eval_dataset,
-            work_dir=self.tmp_dir,
-            cfg_modify_fn=cfg_modify_fn,
-            **kwargs)
-
-        os.environ['LOCAL_RANK'] = '0'
-        trainer = build_trainer(name=name, default_args=kwargs)
-        trainer.train()
-        results_files = os.listdir(self.tmp_dir)
-        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
-        for i in range(10):
-            self.assertIn(f'epoch_{i+1}.pth', results_files)
-
-    @unittest.skip
-    def test_finetune_afqmc(self):
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_trainer_repeatable(self):
+        import torch  # noqa
 
         def cfg_modify_fn(cfg):
-            cfg.task = 'sentence-similarity'
-            cfg['preprocessor'] = {'type': 'sen-sim-tokenizer'}
+            cfg.task = 'nli'
+            cfg['preprocessor'] = {'type': 'nli-tokenizer'}
             cfg.train.optimizer.lr = 2e-5
             cfg['dataset'] = {
                 'train': {
-                    'labels': ['0', '1'],
-                    'first_sequence': 'sentence1',
-                    'second_sequence': 'sentence2',
-                    'label': 'label',
+                    'labels': [
+                        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
+                        '11', '12', '13', '14'
+                    ],
+                    'first_sequence':
+                    'sentence',
+                    'label':
+                    'label',
                 }
             }
-            cfg.train.max_epochs = 10
+            cfg.train.max_epochs = 5
             cfg.train.lr_scheduler = {
                 'type': 'LinearLR',
                 'start_factor': 1.0,
@@ -84,19 +83,125 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
             }]
             return cfg
 
-        from datasets import load_dataset
-        from datasets import DownloadConfig
-        dc = DownloadConfig()
-        dc.local_files_only = True
-        dataset = load_dataset('clue', 'afqmc', download_config=dc)
+        dataset = MsDataset.load('clue', subset_name='tnews')
+
+        kwargs = dict(
+            model='damo/nlp_structbert_backbone_base_std',
+            train_dataset=dataset['train'],
+            eval_dataset=dataset['validation'],
+            work_dir=self.tmp_dir,
+            seed=42,
+            cfg_modify_fn=cfg_modify_fn)
+
+        os.environ['LOCAL_RANK'] = '0'
+        trainer: EpochBasedTrainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+
+        with self.regress_tool.monitor_ms_train(
+                trainer, 'sbert-base-tnews', level='strict'):
+            trainer.train()
+
+    def finetune(self,
+                 model_id,
+                 train_dataset,
+                 eval_dataset,
+                 name=Trainers.nlp_base_trainer,
+                 cfg_modify_fn=None,
+                 **kwargs):
+        kwargs = dict(
+            model=model_id,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=cfg_modify_fn,
+            **kwargs)
+
+        os.environ['LOCAL_RANK'] = '0'
+        trainer = build_trainer(name=name, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(self.epoch_num):
+            self.assertIn(f'epoch_{i + 1}.pth', results_files)
+
+        output_files = os.listdir(
+            os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR))
+        self.assertIn(ModelFile.CONFIGURATION, output_files)
+        self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, output_files)
+        copy_src_files = os.listdir(trainer.model_dir)
+
+        print(f'copy_src_files are {copy_src_files}')
+        print(f'output_files are {output_files}')
+        for item in copy_src_files:
+            if not item.startswith('.'):
+                self.assertIn(item, output_files)
+
+    def pipeline_sentence_similarity(self, model_dir):
+        model = Model.from_pretrained(model_dir)
+        pipeline_ins = pipeline(task=Tasks.sentence_similarity, model=model)
+        print(pipeline_ins(input=(self.sentence1, self.sentence2)))
+
+    @unittest.skip
+    def test_finetune_afqmc(self):
+        """This unittest is used to reproduce the clue:afqmc dataset + structbert model training results.
+
+        Users can train on a custom dataset by modifying this code and commenting out the @unittest.skip decorator.
+        """
+
+        def cfg_modify_fn(cfg):
+            cfg.task = Tasks.sentence_similarity
+            cfg['preprocessor'] = {'type': Preprocessors.sen_sim_tokenizer}
+            cfg.train.optimizer.lr = 2e-5
+            cfg['dataset'] = {
+                'train': {
+                    'labels': ['0', '1'],
+                    'first_sequence': 'sentence1',
+                    'second_sequence': 'sentence2',
+                    'label': 'label',
+                }
+            }
+            cfg.train.max_epochs = self.epoch_num
+            cfg.train.lr_scheduler = {
+                'type': 'LinearLR',
+                'start_factor': 1.0,
+                'end_factor': 0.0,
+                'total_iters':
+                int(len(dataset['train']) / 32) * cfg.train.max_epochs,
+                'options': {
+                    'by_epoch': False
+                }
+            }
+            cfg.train.hooks = [{
+                'type': 'CheckpointHook',
+                'interval': 1
+            }, {
+                'type': 'TextLoggerHook',
+                'interval': 1
+            }, {
+                'type': 'IterTimerHook'
+            }, {
+                'type': 'EvaluationHook',
+                'by_epoch': False,
+                'interval': 100
+            }]
+            return cfg
+
+        dataset = MsDataset.load('clue', subset_name='afqmc')
         self.finetune(
-            model_id='damo/nlp_structbert_backbone_tiny_std',
+            model_id='damo/nlp_structbert_backbone_base_std',
             train_dataset=dataset['train'],
             eval_dataset=dataset['validation'],
             cfg_modify_fn=cfg_modify_fn)
 
+        output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR)
+        self.pipeline_sentence_similarity(output_dir)
+
     @unittest.skip
     def test_finetune_tnews(self):
+        """This unittest is used to reproduce the clue:tnews dataset + structbert model training results.
+
+        Users can train on a custom dataset by modifying this code and commenting out the @unittest.skip decorator.
+        """
 
         def cfg_modify_fn(cfg):
             # TODO no proper task for tnews
@@ -141,20 +246,24 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
             }]
             return cfg
 
-        from datasets import load_dataset
-        from datasets import DownloadConfig
-        dc = DownloadConfig()
-        dc.local_files_only = True
-        dataset = load_dataset('clue', 'tnews', download_config=dc)
+        dataset = MsDataset.load('clue', subset_name='tnews')
 
         self.finetune(
-            model_id='damo/nlp_structbert_backbone_tiny_std',
+            model_id='damo/nlp_structbert_backbone_base_std',
             train_dataset=dataset['train'],
             eval_dataset=dataset['validation'],
             cfg_modify_fn=cfg_modify_fn)
 
     @unittest.skip
     def test_veco_xnli(self):
+        """This unittest is used to reproduce the xnli dataset + veco model training results.
+
+        Here we follow the training scenario listed in the Alicemind open source project:
+        https://github.com/alibaba/AliceMind/tree/main/VECO
+        by training the english language subset.
+        Users can train on a custom dataset by modifying this code and commenting out the @unittest.skip decorator.
+        """
+
         from datasets import load_dataset
         langs = ['en']
         langs_eval = ['en']
@@ -240,6 +349,112 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
             name=Trainers.nlp_veco_trainer,
             cfg_modify_fn=cfg_modify_fn)
 
+    @unittest.skip
+    def test_finetune_cluewsc(self):
+        """This unittest is used to reproduce the clue:wsc dataset + structbert model training results.
+
+        A runnable sample of child-tuning is also shown here.
+
+        Users can train on a custom dataset by modifying this code and commenting out the @unittest.skip decorator.
+        """
+
+        child_tuning_type = 'ChildTuning-F'
+        mode = {}
+        if child_tuning_type is not None:
+            mode = {'mode': child_tuning_type, 'reserve_p': 0.2}
+
+        def cfg_modify_fn(cfg):
+            cfg.task = 'nli'
+            cfg['preprocessor'] = {'type': 'nli-tokenizer'}
+            cfg['dataset'] = {
+                'train': {
+                    'labels': ['0', '1'],
+                    'first_sequence': 'text',
+                    'second_sequence': 'text2',
+                    'label': 'label',
+                }
+            }
+            cfg.train.dataloader.batch_size_per_gpu = 16
+            cfg.train.max_epochs = 30
+            cfg.train.optimizer = {
+                'type':
+                'AdamW' if child_tuning_type is None else 'ChildTuningAdamW',
+                'lr': 1e-5,
+                'options': {},
+                **mode,
+            }
+            cfg.train.lr_scheduler = {
+                'type':
+                'LinearLR',
+                'start_factor':
+                1.0,
+                'end_factor':
+                0.0,
+                'total_iters':
+                int(
+                    len(dataset['train'])
+                    / cfg.train.dataloader.batch_size_per_gpu)
+                * cfg.train.max_epochs,
+                'options': {
+                    'by_epoch': False
+                }
+            }
+            cfg.train.hooks = [{
+                'type': 'CheckpointHook',
+                'interval': 1
+            }, {
+                'type': 'TextLoggerHook',
+                'interval': 1
+            }, {
+                'type': 'IterTimerHook'
+            }, {
+                'type': 'EvaluationHook',
+                'by_epoch': False,
+                'interval': 30
+            }]
+            return cfg
+
+        def add_sentence2(features):
+            return {
+                'text2':
+                features['target']['span2_text'] + '指代'
+                + features['target']['span1_text']
+            }
+
+        dataset = MsDataset.load('clue', subset_name='cluewsc2020')
+        dataset = {
+            k: v.to_hf_dataset().map(add_sentence2)
+            for k, v in dataset.items()
+        }
+
+        kwargs = dict(
+            model='damo/nlp_structbert_backbone_base_std',
+            train_dataset=dataset['train'],
+            eval_dataset=dataset['validation'],
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=cfg_modify_fn)
+
+        os.environ['LOCAL_RANK'] = '0'
+        trainer: NlpEpochBasedTrainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+
+        class CalculateFisherHook(Hook):
+
+            @staticmethod
+            def forward_step(model, inputs):
+                inputs = to_device(inputs, trainer.device)
+                trainer.train_step(model, inputs)
+                return trainer.train_outputs['loss']
+
+            def before_run(self, trainer: NlpEpochBasedTrainer):
+                v = calculate_fisher(trainer.model, trainer.train_dataloader,
+                                     self.forward_step, 0.2)
+                trainer.optimizer.set_gradient_mask(v)
+
+        if child_tuning_type == 'ChildTuning-D':
+            trainer.register_hook(CalculateFisherHook())
+        trainer.train()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/trainers/test_finetune_text_generation.py b/tests/trainers/test_finetune_text_generation.py
new file mode 100644
index 00000000..6aefa969
--- /dev/null
+++ b/tests/trainers/test_finetune_text_generation.py
@@ -0,0 +1,167 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.metainfo import Trainers
+from modelscope.models.nlp import GPT3ForTextGeneration, PalmForTextGeneration
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers import build_trainer
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.test_utils import test_level
+
+
+class TestFinetuneTextGeneration(unittest.TestCase):
+
+    def setUp(self):
+        """Create a scratch work directory and two tiny in-memory datasets."""
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        # mkdtemp() creates the directory itself; tearDown removes it.
+        self.tmp_dir = tempfile.mkdtemp()
+
+        from datasets import Dataset
+
+        src_dataset_dict = {
+            'src_txt': [
+                'This is test sentence1-1', 'This is test sentence2-1',
+                'This is test sentence3-1'
+            ]
+        }
+        src_tgt_dataset_dict = {
+            'src_txt':
+            src_dataset_dict['src_txt'],
+            'tgt_txt': [
+                'This is test sentence1-2', 'This is test sentence2-2',
+                'This is test sentence3-2'
+            ]
+        }
+
+        self.src_dataset = MsDataset(Dataset.from_dict(src_dataset_dict))
+        self.src_tgt_dataset = MsDataset(
+            Dataset.from_dict(src_tgt_dataset_dict))
+
+        self.max_epochs = 3
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir)
+        super().tearDown()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer_with_palm(self):
+
+        kwargs = dict(
+            model='damo/nlp_palm2.0_text-generation_english-base',
+            train_dataset=self.src_tgt_dataset,
+            eval_dataset=self.src_tgt_dataset,
+            max_epochs=self.max_epochs,
+            work_dir=self.tmp_dir)
+
+        trainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(self.max_epochs):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_trainer_with_palm_with_model_and_args(self):
+
+        cache_path = snapshot_download(
+            'damo/nlp_palm2.0_text-generation_english-base')
+        model = PalmForTextGeneration.from_pretrained(cache_path)
+        kwargs = dict(
+            cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
+            model=model,
+            train_dataset=self.src_tgt_dataset,
+            eval_dataset=self.src_tgt_dataset,
+            max_epochs=self.max_epochs,
+            work_dir=self.tmp_dir)
+
+        trainer = build_trainer(default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(self.max_epochs):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer_with_gpt3(self):
+
+        kwargs = dict(
+            model='damo/nlp_gpt3_text-generation_chinese-base',
+            train_dataset=self.src_dataset,
+            eval_dataset=self.src_dataset,
+            max_epochs=self.max_epochs,
+            work_dir=self.tmp_dir)
+
+        trainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(self.max_epochs):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_trainer_with_gpt3_with_model_and_args(self):
+
+        cache_path = snapshot_download(
+            'damo/nlp_gpt3_text-generation_chinese-base')
+        model = GPT3ForTextGeneration.from_pretrained(cache_path)
+        kwargs = dict(
+            cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
+            model=model,
+            train_dataset=self.src_dataset,
+            eval_dataset=self.src_dataset,
+            max_epochs=self.max_epochs,
+            work_dir=self.tmp_dir)
+
+        trainer = build_trainer(default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(self.max_epochs):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+    @unittest.skip
+    def test_finetune_cnndm(self):
+        from modelscope.msdatasets import MsDataset
+        dataset_dict = MsDataset.load('DuReader_robust-QG')
+        train_dataset = dataset_dict['train'].to_hf_dataset() \
+            .rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'})
+        eval_dataset = dataset_dict['validation'].to_hf_dataset() \
+            .rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'})
+        num_warmup_steps = 200
+        os.environ['LOCAL_RANK'] = '0'
+
+        def noam_lambda(current_step: int):
+            current_step += 1
+            return min(current_step**(-0.5),
+                       current_step * num_warmup_steps**(-1.5))
+
+        def cfg_modify_fn(cfg):
+            cfg.train.lr_scheduler = {
+                'type': 'LambdaLR',
+                'lr_lambda': noam_lambda,
+                'options': {
+                    'by_epoch': False
+                }
+            }
+            return cfg
+
+        kwargs = dict(
+            model='damo/nlp_palm2.0_text-generation_chinese-base',
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=cfg_modify_fn)
+        trainer = build_trainer(
+            name=Trainers.nlp_base_trainer, default_args=kwargs)
+        trainer.train()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/test_finetune_token_classificatin.py b/tests/trainers/test_finetune_token_classificatin.py
index 520d1a3c..9bdab9b7 100644
--- a/tests/trainers/test_finetune_token_classificatin.py
+++ b/tests/trainers/test_finetune_token_classificatin.py
@@ -47,6 +47,11 @@ class TestFinetuneTokenClassification(unittest.TestCase):
 
     @unittest.skip
     def test_word_segmentation(self):
+        """This unittest is used to reproduce the icwb2:pku dataset + structbert model training results.
+
+        Users can train on a custom dataset by modifying this code and commenting out the @unittest.skip decorator.
+        """
+
         os.system(
             f'curl http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip > {self.tmp_dir}/icwb2-data.zip'
         )
@@ -87,7 +92,7 @@ class TestFinetuneTokenClassification(unittest.TestCase):
                 }
             }
             cfg['preprocessor'] = {'type': 'token-cls-tokenizer'}
-            cfg.train.max_epochs = 3
+            cfg.train.max_epochs = 2
             cfg.train.lr_scheduler = {
                 'type': 'LinearLR',
                 'start_factor': 1.0,
@@ -114,7 +119,7 @@ class TestFinetuneTokenClassification(unittest.TestCase):
             return cfg
 
         self.finetune(
-            'damo/nlp_structbert_backbone_tiny_std',
+            'damo/nlp_structbert_backbone_base_std',
             train_dataset,
             dev_dataset,
             cfg_modify_fn=cfg_modify_fn)
diff --git a/tests/trainers/test_image_color_enhance_trainer.py b/tests/trainers/test_image_color_enhance_trainer.py
index f1dcbe51..34d84cd2 100644
--- a/tests/trainers/test_image_color_enhance_trainer.py
+++ b/tests/trainers/test_image_color_enhance_trainer.py
@@ -17,6 +17,41 @@ from modelscope.utils.constant import ModelFile
 from modelscope.utils.test_utils import test_level
 
 
+class PairedImageDataset(data.Dataset):
+    """Paired low-quality ('lq') / ground-truth ('gt') images under `root`."""
+    def __init__(self, root):
+        super().__init__()
+        self.gt_filelist = self._sorted_paths(osp.join(root, 'gt'))
+        self.lq_filelist = self._sorted_paths(osp.join(root, 'lq'))
+
+    @staticmethod
+    def _sorted_paths(base):
+        # Sort by numeric stem; splitext handles any extension length.
+        names = sorted(os.listdir(base), key=lambda n: int(osp.splitext(n)[0]))
+        return [osp.join(base, n) for n in names]
+
+    def _img_to_tensor(self, img):
+        # cv2 loads BGR/HWC: flip to RGB, move channels first, scale by 1/255.
+        rgb = torch.from_numpy(img[:, :, [2, 1, 0]]).permute(2, 0, 1)
+        return rgb.type(torch.float32) / 255.
+
+    def __getitem__(self, index):
+        lq = cv2.imread(self.lq_filelist[index])
+        gt = cv2.imread(self.gt_filelist[index])
+        lq = cv2.resize(lq, (256, 256), interpolation=cv2.INTER_CUBIC)
+        gt = cv2.resize(gt, (256, 256), interpolation=cv2.INTER_CUBIC)
+        return \
+            {'src': self._img_to_tensor(lq), 'target': self._img_to_tensor(gt)}
+
+    def __len__(self):
+        return len(self.gt_filelist)
+
+    def to_torch_dataset(self, columns: Union[str, List[str]] = None,
+                         preprocessors: Union[Callable, List[Callable]] = None,
+                         **format_kwargs):
+        return self
+
+
 class TestImageColorEnhanceTrainer(unittest.TestCase):
 
     def setUp(self):
@@ -27,47 +62,6 @@ class TestImageColorEnhanceTrainer(unittest.TestCase):
 
         self.model_id = 'damo/cv_csrnet_image-color-enhance-models'
 
-        class PairedImageDataset(data.Dataset):
-
-            def __init__(self, root):
-                super(PairedImageDataset, self).__init__()
-                gt_dir = osp.join(root, 'gt')
-                lq_dir = osp.join(root, 'lq')
-                self.gt_filelist = os.listdir(gt_dir)
-                self.gt_filelist = sorted(
-                    self.gt_filelist, key=lambda x: int(x[:-4]))
-                self.gt_filelist = [
-                    osp.join(gt_dir, f) for f in self.gt_filelist
-                ]
-                self.lq_filelist = os.listdir(lq_dir)
-                self.lq_filelist = sorted(
-                    self.lq_filelist, key=lambda x: int(x[:-4]))
-                self.lq_filelist = [
-                    osp.join(lq_dir, f) for f in self.lq_filelist
-                ]
-
-            def _img_to_tensor(self, img):
-                return torch.from_numpy(img[:, :, [2, 1, 0]]).permute(
-                    2, 0, 1).type(torch.float32) / 255.
-
-            def __getitem__(self, index):
-                lq = cv2.imread(self.lq_filelist[index])
-                gt = cv2.imread(self.gt_filelist[index])
-                lq = cv2.resize(lq, (256, 256), interpolation=cv2.INTER_CUBIC)
-                gt = cv2.resize(gt, (256, 256), interpolation=cv2.INTER_CUBIC)
-                return \
-                    {'src': self._img_to_tensor(lq), 'target': self._img_to_tensor(gt)}
-
-            def __len__(self):
-                return len(self.gt_filelist)
-
-            def to_torch_dataset(self,
-                                 columns: Union[str, List[str]] = None,
-                                 preprocessors: Union[Callable,
-                                                      List[Callable]] = None,
-                                 **format_kwargs):
-                return self
-
         self.dataset = PairedImageDataset(
             './data/test/images/image_color_enhance/')
 
diff --git a/tests/trainers/test_image_instance_segmentation_trainer.py b/tests/trainers/test_image_instance_segmentation_trainer.py
index c8557ff5..03f7eea3 100644
--- a/tests/trainers/test_image_instance_segmentation_trainer.py
+++ b/tests/trainers/test_image_instance_segmentation_trainer.py
@@ -15,7 +15,7 @@ from modelscope.msdatasets.task_datasets import \
     ImageInstanceSegmentationCocoDataset
 from modelscope.trainers import build_trainer
 from modelscope.utils.config import Config, ConfigDict
-from modelscope.utils.constant import ModelFile
+from modelscope.utils.constant import DownloadMode, ModelFile
 from modelscope.utils.test_utils import test_level
 
 
@@ -41,34 +41,26 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase):
         if train_data_cfg is None:
             # use default toy data
             train_data_cfg = ConfigDict(
-                name='pets_small',
-                split='train',
-                classes=('Cat', 'Dog'),
-                test_mode=False)
+                name='pets_small', split='train', test_mode=False)
         if val_data_cfg is None:
             val_data_cfg = ConfigDict(
-                name='pets_small',
-                split='validation',
-                classes=('Cat', 'Dog'),
-                test_mode=True)
+                name='pets_small', split='validation', test_mode=True)
 
         self.train_dataset = MsDataset.load(
             dataset_name=train_data_cfg.name,
             split=train_data_cfg.split,
-            classes=train_data_cfg.classes,
-            test_mode=train_data_cfg.test_mode)
-        assert self.train_dataset.config_kwargs[
-            'classes'] == train_data_cfg.classes
+            test_mode=train_data_cfg.test_mode,
+            download_mode=DownloadMode.FORCE_REDOWNLOAD)
+        assert self.train_dataset.config_kwargs['classes']
         assert next(
             iter(self.train_dataset.config_kwargs['split_config'].values()))
 
         self.eval_dataset = MsDataset.load(
             dataset_name=val_data_cfg.name,
             split=val_data_cfg.split,
-            classes=val_data_cfg.classes,
-            test_mode=val_data_cfg.test_mode)
-        assert self.eval_dataset.config_kwargs[
-            'classes'] == val_data_cfg.classes
+            test_mode=val_data_cfg.test_mode,
+            download_mode=DownloadMode.FORCE_REDOWNLOAD)
+        assert self.eval_dataset.config_kwargs['classes']
         assert next(
             iter(self.eval_dataset.config_kwargs['split_config'].values()))
 
diff --git a/tests/trainers/test_image_portrait_enhancement_trainer.py b/tests/trainers/test_image_portrait_enhancement_trainer.py
index dc450ff0..049adf7e 100644
--- a/tests/trainers/test_image_portrait_enhancement_trainer.py
+++ b/tests/trainers/test_image_portrait_enhancement_trainer.py
@@ -19,6 +19,47 @@ from modelscope.utils.constant import ModelFile
 from modelscope.utils.test_utils import test_level
 
 
+class PairedImageDataset(data.Dataset):
+    """Paired 'lq' / 'gt' images under `root`, resized to `size` x `size`."""
+
+    def __init__(self, root, size=512):
+        super().__init__()
+        self.size = size
+        self.gt_filelist = self._sorted_paths(osp.join(root, 'gt'))
+        self.lq_filelist = self._sorted_paths(osp.join(root, 'lq'))
+
+    @staticmethod
+    def _sorted_paths(base):
+        # Sort by numeric stem; splitext handles any extension length.
+        names = sorted(os.listdir(base), key=lambda n: int(osp.splitext(n)[0]))
+        return [osp.join(base, n) for n in names]
+
+    def _img_to_tensor(self, img):
+        # cv2 loads BGR/HWC: flip to RGB, move channels first, scale by 1/255,
+        # then shift and scale with mean 0.5 / std 0.5.
+        rgb = torch.from_numpy(img[:, :, [2, 1, 0]]).permute(2, 0, 1)
+        return (rgb.type(torch.float32) / 255. - 0.5) / 0.5
+
+    def __getitem__(self, index):
+        target_size = (self.size, self.size)
+        lq = cv2.imread(self.lq_filelist[index])
+        gt = cv2.imread(self.gt_filelist[index])
+        lq = cv2.resize(lq, target_size, interpolation=cv2.INTER_CUBIC)
+        gt = cv2.resize(gt, target_size, interpolation=cv2.INTER_CUBIC)
+        return \
+            {'src': self._img_to_tensor(lq), 'target': self._img_to_tensor(gt)}
+
+    def __len__(self):
+        return len(self.gt_filelist)
+
+    def to_torch_dataset(self,
+                         columns: Union[str, List[str]] = None,
+                         preprocessors: Union[Callable, List[Callable]] = None,
+                         **format_kwargs):
+        # Arguments are ignored; the dataset itself is returned unchanged.
+        return self
+
+
 class TestImagePortraitEnhancementTrainer(unittest.TestCase):
 
     def setUp(self):
@@ -29,53 +70,6 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase):
 
         self.model_id = 'damo/cv_gpen_image-portrait-enhancement'
 
-        class PairedImageDataset(data.Dataset):
-
-            def __init__(self, root, size=512):
-                super(PairedImageDataset, self).__init__()
-                self.size = size
-                gt_dir = osp.join(root, 'gt')
-                lq_dir = osp.join(root, 'lq')
-                self.gt_filelist = os.listdir(gt_dir)
-                self.gt_filelist = sorted(
-                    self.gt_filelist, key=lambda x: int(x[:-4]))
-                self.gt_filelist = [
-                    osp.join(gt_dir, f) for f in self.gt_filelist
-                ]
-                self.lq_filelist = os.listdir(lq_dir)
-                self.lq_filelist = sorted(
-                    self.lq_filelist, key=lambda x: int(x[:-4]))
-                self.lq_filelist = [
-                    osp.join(lq_dir, f) for f in self.lq_filelist
-                ]
-
-            def _img_to_tensor(self, img):
-                img = torch.from_numpy(img[:, :, [2, 1, 0]]).permute(
-                    2, 0, 1).type(torch.float32) / 255.
-                return (img - 0.5) / 0.5
-
-            def __getitem__(self, index):
-                lq = cv2.imread(self.lq_filelist[index])
-                gt = cv2.imread(self.gt_filelist[index])
-                lq = cv2.resize(
-                    lq, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
-                gt = cv2.resize(
-                    gt, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
-
-                return \
-                    {'src': self._img_to_tensor(lq), 'target': self._img_to_tensor(gt)}
-
-            def __len__(self):
-                return len(self.gt_filelist)
-
-            def to_torch_dataset(self,
-                                 columns: Union[str, List[str]] = None,
-                                 preprocessors: Union[Callable,
-                                                      List[Callable]] = None,
-                                 **format_kwargs):
-                # self.preprocessor = preprocessors
-                return self
-
         self.dataset = PairedImageDataset(
             './data/test/images/face_enhancement/')
 
diff --git a/tests/trainers/test_movie_scene_segmentation_trainer.py b/tests/trainers/test_movie_scene_segmentation_trainer.py
new file mode 100644
index 00000000..f25dc92a
--- /dev/null
+++ b/tests/trainers/test_movie_scene_segmentation_trainer.py
@@ -0,0 +1,109 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+import zipfile
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.metainfo import Trainers
+from modelscope.models.cv.movie_scene_segmentation import \
+    MovieSceneSegmentationModel
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers import build_trainer
+from modelscope.utils.config import Config, ConfigDict
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.test_utils import test_level
+
+
+class TestImageInstanceSegmentationTrainer(unittest.TestCase):
+    """Trainer tests for the movie scene segmentation model.
+
+    NOTE(review): the class name looks copied from the image instance
+    segmentation tests; it is kept so existing test selectors still match.
+    """
+
+    model_id = 'damo/cv_resnet50-bert_video-scene-segmentation_movienet'
+    dataset_name = 'movie_scene_seg_toydata'
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+
+        cache_path = snapshot_download(self.model_id)
+        config_path = os.path.join(cache_path, ModelFile.CONFIGURATION)
+        cfg = Config.from_file(config_path)
+        self.max_epochs = cfg.train.max_epochs
+
+        self.train_dataset = self._load_split(
+            'train', cfg.preprocessor, test_mode=False)
+        self.test_dataset = self._load_split(
+            'test', cfg.preprocessor, test_mode=True)
+
+        # mkdtemp creates the directory itself; tearDown removes it.
+        self.tmp_dir = tempfile.mkdtemp()
+
+    def _load_split(self, split, preprocessor_cfg, test_mode):
+        """Load one toy-dataset split, assert its split config, return it.
+
+        Args:
+            split: Split name handed to MsDataset.load ('train' or 'test').
+            preprocessor_cfg: Preprocessor config, forwarded as `cfg`.
+            test_mode: Forwarded to MsDataset.load unchanged.
+        """
+        data_cfg = ConfigDict(
+            name=self.dataset_name,
+            split=split,
+            cfg=preprocessor_cfg,
+            test_mode=test_mode)
+        # namespace is deliberately not forwarded: this config defines no
+        # such key, so MsDataset.load falls back to its own default.
+        dataset = MsDataset.load(
+            dataset_name=data_cfg.name,
+            split=data_cfg.split,
+            cfg=data_cfg.cfg,
+            test_mode=data_cfg.test_mode)
+        assert next(iter(dataset.config_kwargs['split_config'].values()))
+        return dataset
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir)
+        super().tearDown()
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_trainer(self):
+        """Train from the model id and expect a JSON log in the work dir."""
+        kwargs = dict(
+            model=self.model_id,
+            train_dataset=self.train_dataset,
+            eval_dataset=self.test_dataset,
+            work_dir=self.tmp_dir)
+
+        trainer = build_trainer(
+            name=Trainers.movie_scene_segmentation, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(trainer.work_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_trainer_with_model_and_args(self):
+        """Train from an instantiated model plus an explicit cfg_file."""
+        cache_path = snapshot_download(self.model_id)
+        model = MovieSceneSegmentationModel.from_pretrained(cache_path)
+        # Work in self.tmp_dir so tearDown removes the output; a directory
+        # created only inside this test would be left behind.
+        kwargs = dict(
+            cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
+            model=model,
+            train_dataset=self.train_dataset,
+            eval_dataset=self.test_dataset,
+            work_dir=self.tmp_dir)
+
+        trainer = build_trainer(
+            name=Trainers.movie_scene_segmentation, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(trainer.work_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/test_text_generation_trainer.py b/tests/trainers/test_text_generation_trainer.py
deleted file mode 100644
index a60bc903..00000000
--- a/tests/trainers/test_text_generation_trainer.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import os
-import shutil
-import tempfile
-import unittest
-
-from modelscope.hub.snapshot_download import snapshot_download
-from modelscope.metainfo import Trainers
-from modelscope.models.nlp.palm_v2 import PalmForTextGeneration
-from modelscope.msdatasets import MsDataset
-from modelscope.trainers import build_trainer
-from modelscope.utils.constant import ModelFile
-from modelscope.utils.test_utils import test_level
-
-
-class TestTextGenerationTrainer(unittest.TestCase):
-
-    def setUp(self):
-        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
-        self.tmp_dir = tempfile.TemporaryDirectory().name
-        if not os.path.exists(self.tmp_dir):
-            os.makedirs(self.tmp_dir)
-
-        self.model_id = 'damo/nlp_palm2.0_text-generation_english-base'
-
-        # todo: Replace below scripts with MsDataset.load when the formal dataset service is ready
-        from datasets import Dataset
-        dataset_dict = {
-            'src_txt': [
-                'This is test sentence1-1', 'This is test sentence2-1',
-                'This is test sentence3-1'
-            ],
-            'tgt_txt': [
-                'This is test sentence1-2', 'This is test sentence2-2',
-                'This is test sentence3-2'
-            ]
-        }
-        dataset = Dataset.from_dict(dataset_dict)
-
-        class MsDatasetDummy(MsDataset):
-
-            def __len__(self):
-                return len(self._hf_ds)
-
-        self.dataset = MsDatasetDummy(dataset)
-
-    def tearDown(self):
-        shutil.rmtree(self.tmp_dir)
-        super().tearDown()
-
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_trainer(self):
-
-        kwargs = dict(
-            model=self.model_id,
-            train_dataset=self.dataset,
-            eval_dataset=self.dataset,
-            work_dir=self.tmp_dir)
-
-        trainer = build_trainer(
-            name=Trainers.nlp_base_trainer, default_args=kwargs)
-        trainer.train()
-        results_files = os.listdir(self.tmp_dir)
-        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
-        for i in range(3):
-            self.assertIn(f'epoch_{i+1}.pth', results_files)
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_trainer_with_model_and_args(self):
-        tmp_dir = tempfile.TemporaryDirectory().name
-        if not os.path.exists(tmp_dir):
-            os.makedirs(tmp_dir)
-
-        cache_path = snapshot_download(self.model_id)
-        model = PalmForTextGeneration.from_pretrained(cache_path)
-        kwargs = dict(
-            cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
-            model=model,
-            train_dataset=self.dataset,
-            eval_dataset=self.dataset,
-            max_epochs=2,
-            work_dir=self.tmp_dir)
-
-        trainer = build_trainer(default_args=kwargs)
-        trainer.train()
-        results_files = os.listdir(self.tmp_dir)
-        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
-        for i in range(2):
-            self.assertIn(f'epoch_{i+1}.pth', results_files)
-
-    @unittest.skip
-    def test_finetune_cnndm(self):
-        from datasets import load_dataset
-        dataset_dict = load_dataset('ccdv/cnn_dailymail', '3.0.0')
-        train_dataset = dataset_dict['train'] \
-            .rename_columns({'article': 'src_txt', 'highlights': 'tgt_txt'}) \
-            .remove_columns('id')
-        eval_dataset = dataset_dict['validation'] \
-            .rename_columns({'article': 'src_txt', 'highlights': 'tgt_txt'}) \
-            .remove_columns('id')
-        num_warmup_steps = 2000
-
-        def noam_lambda(current_step: int):
-            current_step += 1
-            return min(current_step**(-0.5),
-                       current_step * num_warmup_steps**(-1.5))
-
-        def cfg_modify_fn(cfg):
-            cfg.train.lr_scheduler = {
-                'type': 'LambdaLR',
-                'lr_lambda': noam_lambda,
-                'options': {
-                    'by_epoch': False
-                }
-            }
-            return cfg
-
-        kwargs = dict(
-            model=self.model_id,
-            train_dataset=train_dataset,
-            eval_dataset=eval_dataset,
-            work_dir=self.tmp_dir,
-            cfg_modify_fn=cfg_modify_fn,
-            model_revision='beta')
-        trainer = build_trainer(
-            name=Trainers.nlp_base_trainer, default_args=kwargs)
-        trainer.train()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py
index 0259f804..c73a56a3 100644
--- a/tests/trainers/test_trainer.py
+++ b/tests/trainers/test_trainer.py
@@ -14,8 +14,10 @@ from torch.utils.data import IterableDataset
 
 from modelscope.metainfo import Metrics, Trainers
 from modelscope.metrics.builder import MetricKeys
+from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
-from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile
+from modelscope.trainers.base import DummyTrainer
+from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile, Tasks
 from modelscope.utils.test_utils import create_dummy_test_dataset, test_level
 
 
@@ -35,7 +37,7 @@ dummy_dataset_big = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 40)
 
 
-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):
 
     def __init__(self):
         super().__init__()
@@ -62,9 +64,10 @@ class TrainerTest(unittest.TestCase):
         super().tearDown()
         shutil.rmtree(self.tmp_dir)
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_train_0(self):
         json_cfg = {
+            'task': Tasks.image_classification,
             'train': {
                 'work_dir':
                 self.tmp_dir,
@@ -136,9 +139,10 @@ class TrainerTest(unittest.TestCase):
         self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_train_1(self):
         json_cfg = {
+            'task': Tasks.image_classification,
             'train': {
                 'work_dir':
                 self.tmp_dir,
@@ -196,9 +200,10 @@ class TrainerTest(unittest.TestCase):
         self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_train_with_default_config(self):
         json_cfg = {
+            'task': Tasks.image_classification,
             'train': {
                 'work_dir': self.tmp_dir,
                 'dataloader': {
@@ -263,7 +268,7 @@ class TrainerTest(unittest.TestCase):
             {
                 LogKeys.MODE: ModeKeys.EVAL,
                 LogKeys.EPOCH: 1,
-                LogKeys.ITER: 20
+                LogKeys.ITER: 10
             }, json.loads(lines[2]))
         self.assertDictContainsSubset(
             {
@@ -283,7 +288,7 @@ class TrainerTest(unittest.TestCase):
             {
                 LogKeys.MODE: ModeKeys.EVAL,
                 LogKeys.EPOCH: 2,
-                LogKeys.ITER: 20
+                LogKeys.ITER: 10
             }, json.loads(lines[5]))
         self.assertDictContainsSubset(
             {
@@ -303,7 +308,7 @@ class TrainerTest(unittest.TestCase):
             {
                 LogKeys.MODE: ModeKeys.EVAL,
                 LogKeys.EPOCH: 3,
-                LogKeys.ITER: 20
+                LogKeys.ITER: 10
             }, json.loads(lines[8]))
         self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
@@ -314,9 +319,10 @@ class TrainerTest(unittest.TestCase):
         for i in [2, 5, 8]:
             self.assertIn(MetricKeys.ACCURACY, lines[i])
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_train_with_iters_per_epoch(self):
         json_cfg = {
+            'task': Tasks.image_classification,
             'train': {
                 'work_dir': self.tmp_dir,
                 'dataloader': {
@@ -435,7 +441,7 @@ class TrainerTest(unittest.TestCase):
 
 class DummyTrainerTest(unittest.TestCase):
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_dummy(self):
         default_args = dict(cfg_file='configs/examples/train.json')
         trainer = build_trainer('dummy', default_args)
diff --git a/tests/trainers/test_trainer_gpu.py b/tests/trainers/test_trainer_gpu.py
index 9781816d..0176704a 100644
--- a/tests/trainers/test_trainer_gpu.py
+++ b/tests/trainers/test_trainer_gpu.py
@@ -15,8 +15,9 @@ from torch.utils.data import IterableDataset
 
 from modelscope.metainfo import Metrics, Trainers
 from modelscope.metrics.builder import MetricKeys
+from modelscope.models.base import Model
 from modelscope.trainers import EpochBasedTrainer, build_trainer
-from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile
+from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile, Tasks
 from modelscope.utils.test_utils import (DistributedTestCase,
                                          create_dummy_test_dataset, test_level)
 
@@ -37,7 +38,7 @@ dummy_dataset_big = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 40)
 
 
-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):
 
     def __init__(self):
         super().__init__()
@@ -52,8 +53,20 @@ class DummyModel(nn.Module):
         return dict(logits=x, loss=loss)
 
 
-def train_func(work_dir, dist=False, iterable_dataset=False, **kwargs):
+class DummyModelForwardInputs(DummyModel):
+    """DummyModel whose forward unpacks a dict with 'feat' and 'labels'."""
+
+    def forward(self, inputs):
+        return super().forward(inputs['feat'], inputs['labels'])
+
+
+def train_func(work_dir,
+               dist=False,
+               iterable_dataset=False,
+               forward_inputs=False,
+               **kwargs):
     json_cfg = {
+        'task': Tasks.image_classification,
         'train': {
             'work_dir': work_dir,
             'dataloader': {
@@ -79,7 +92,10 @@ def train_func(work_dir, dist=False, iterable_dataset=False, **kwargs):
     with open(config_path, 'w') as f:
         json.dump(json_cfg, f)
 
-    model = DummyModel()
+    if forward_inputs:
+        model = DummyModelForwardInputs()
+    else:
+        model = DummyModel()
     optimmizer = SGD(model.parameters(), lr=0.01)
     lr_scheduler = StepLR(optimmizer, 2)
     trainer_name = Trainers.default
@@ -118,7 +134,7 @@ class TrainerTestSingleGpu(unittest.TestCase):
         super().tearDown()
         shutil.rmtree(self.tmp_dir)
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_single_gpu(self):
         train_func(self.tmp_dir)
 
@@ -271,6 +287,22 @@ class TrainerTestMultiGpus(DistributedTestCase):
         for i in [1, 3, 5]:
             self.assertIn(MetricKeys.ACCURACY, lines[i])
 
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_multi_gpus_forward_inputs(self):
+        """Launch train_func with num_gpus=2 and forward_inputs=True."""
+        launch_kwargs = dict(
+            num_gpus=2, work_dir=self.tmp_dir, dist=True, forward_inputs=True)
+        self.start(train_func, **launch_kwargs)
+
+        produced = os.listdir(self.tmp_dir)
+        # Exactly one JSON log file is expected in the work dir.
+        log_pattern = os.path.join(self.tmp_dir, '*.log.json')
+        self.assertEqual(len(glob.glob(log_pattern)), 1)
+
+        # One checkpoint must exist for each of epochs 1 to 3.
+        for epoch in (1, 2, 3):
+            self.assertIn(f'{LogKeys.EPOCH}_{epoch}.pth', produced)
+
     # TODO: support iters_per_epoch for dist mode
     @unittest.skipIf(True, 'need to adapt to DistributedSampler')
     def test_multi_gpus_with_iters_per_epoch(self):
diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py
index 213b6b4f..6030ada9 100644
--- a/tests/trainers/test_trainer_with_nlp.py
+++ b/tests/trainers/test_trainer_with_nlp.py
@@ -6,16 +6,21 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Metrics
+from modelscope.models.base import Model
 from modelscope.models.nlp.sequence_classification import \
     SbertForSequenceClassification
 from modelscope.msdatasets import MsDataset
-from modelscope.trainers import build_trainer
-from modelscope.utils.constant import ModelFile
+from modelscope.pipelines import pipeline
+from modelscope.trainers import EpochBasedTrainer, build_trainer
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.hub import read_config
 from modelscope.utils.test_utils import test_level
 
 
 class TestTrainerWithNlp(unittest.TestCase):
+    sentence1 = '今天气温比昨天高么？'
+    sentence2 = '今天湿度比昨天高么？'
 
     def setUp(self):
         print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
@@ -30,7 +35,7 @@ class TestTrainerWithNlp(unittest.TestCase):
         shutil.rmtree(self.tmp_dir)
         super().tearDown()
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_trainer(self):
         model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
         kwargs = dict(
@@ -47,6 +52,27 @@ class TestTrainerWithNlp(unittest.TestCase):
         for i in range(10):
             self.assertIn(f'epoch_{i+1}.pth', results_files)
 
+        output_files = os.listdir(
+            os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR))
+        self.assertIn(ModelFile.CONFIGURATION, output_files)
+        self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, output_files)
+        copy_src_files = os.listdir(trainer.model_dir)
+
+        print(f'copy_src_files are {copy_src_files}')
+        print(f'output_files are {output_files}')
+        for item in copy_src_files:
+            if not item.startswith('.'):
+                self.assertIn(item, output_files)
+
+        def pipeline_sentence_similarity(model_dir):
+            model = Model.from_pretrained(model_dir)
+            pipeline_ins = pipeline(
+                task=Tasks.sentence_similarity, model=model)
+            print(pipeline_ins(input=(self.sentence1, self.sentence2)))
+
+        output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR)
+        pipeline_sentence_similarity(output_dir)
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_trainer_with_backbone_head(self):
         model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base'
@@ -94,6 +120,90 @@ class TestTrainerWithNlp(unittest.TestCase):
             checkpoint_path=os.path.join(self.tmp_dir, 'epoch_10.pth'))
         self.assertTrue(Metrics.accuracy in eval_results)
 
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_trainer_with_configured_datasets(self):
+        model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
+        cfg: Config = read_config(model_id)
+        cfg.train.max_epochs = 20
+        cfg.train.work_dir = self.tmp_dir
+        cfg.dataset = {
+            'train': {
+                'name': 'afqmc_small',
+                'split': 'train',
+                'namespace': 'userxiaoming'
+            },
+            'val': {
+                'name': 'afqmc_small',
+                'split': 'train',
+                'namespace': 'userxiaoming'
+            },
+        }
+        cfg_file = os.path.join(self.tmp_dir, 'config.json')
+        cfg.dump(cfg_file)
+        kwargs = dict(model=model_id, cfg_file=cfg_file)
+
+        trainer = build_trainer(default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(cfg.train.max_epochs):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+        eval_results = trainer.evaluate(
+            checkpoint_path=os.path.join(self.tmp_dir, 'epoch_10.pth'))
+        self.assertTrue(Metrics.accuracy in eval_results)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_trainer_with_continue_train(self):
+        from modelscope.utils.regress_test_utils import MsRegressTool
+        model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
+        cfg: Config = read_config(model_id)
+        cfg.train.max_epochs = 3
+        cfg.train.work_dir = self.tmp_dir
+        cfg_file = os.path.join(self.tmp_dir, 'config.json')
+        cfg.dump(cfg_file)
+        dataset = MsDataset.load('clue', subset_name='afqmc', split='train')
+        dataset = dataset.to_hf_dataset().select(range(128))
+        kwargs = dict(
+            model=model_id,
+            train_dataset=dataset,
+            eval_dataset=dataset,
+            cfg_file=cfg_file)
+
+        regress_tool = MsRegressTool(baseline=True)
+        trainer: EpochBasedTrainer = build_trainer(default_args=kwargs)
+
+        def lazy_stop_callback():
+            from modelscope.trainers.hooks.hook import Hook, Priority
+
+            class EarlyStopHook(Hook):
+                PRIORITY = Priority.VERY_LOW
+
+                def after_iter(self, trainer):
+                    if trainer.iter == 12:
+                        raise MsRegressTool.EarlyStopError('Test finished.')
+
+            if 'EarlyStopHook' not in [
+                    hook.__class__.__name__ for hook in trainer.hooks
+            ]:
+                trainer.register_hook(EarlyStopHook())
+
+        with regress_tool.monitor_ms_train(
+                trainer,
+                'trainer_continue_train',
+                level='strict',
+                lazy_stop_callback=lazy_stop_callback):
+            trainer.train()
+
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+
+        trainer = build_trainer(default_args=kwargs)
+        regress_tool = MsRegressTool(baseline=False)
+        with regress_tool.monitor_ms_train(
+                trainer, 'trainer_continue_train', level='strict'):
+            trainer.train(os.path.join(self.tmp_dir, 'iter_12.pth'))
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_trainer_with_model_and_args(self):
         tmp_dir = tempfile.TemporaryDirectory().name
diff --git a/tests/trainers/utils/test_inference.py b/tests/trainers/utils/test_inference.py
index 87e5320e..23561734 100644
--- a/tests/trainers/utils/test_inference.py
+++ b/tests/trainers/utils/test_inference.py
@@ -11,6 +11,7 @@ from torch.utils.data import DataLoader
 from modelscope.metrics.builder import MetricKeys
 from modelscope.metrics.sequence_classification_metric import \
     SequenceClassificationMetric
+from modelscope.models.base import Model
 from modelscope.trainers.utils.inference import multi_gpu_test, single_gpu_test
 from modelscope.utils.test_utils import (DistributedTestCase,
                                          create_dummy_test_dataset, test_level)
@@ -20,7 +21,7 @@ dummy_dataset = create_dummy_test_dataset(
     torch.rand((5, )), torch.randint(0, 4, (1, )), 20)
 
 
-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):
 
     def __init__(self):
         super().__init__()
diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py
index 9166292f..f1a50035 100644
--- a/tests/utils/__init__.py
+++ b/tests/utils/__init__.py
@@ -1 +1,3 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 from .profiler import *  # noqa F403
diff --git a/tests/utils/profiler.py b/tests/utils/profiler.py
index 92708ad3..f5a522ef 100644
--- a/tests/utils/profiler.py
+++ b/tests/utils/profiler.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
 import importlib
 import sys
 from functools import wraps
diff --git a/tests/utils/test_ast.py b/tests/utils/test_ast.py
index de99a7b8..9a8ab828 100644
--- a/tests/utils/test_ast.py
+++ b/tests/utils/test_ast.py
@@ -30,7 +30,7 @@ class AstScaningTest(unittest.TestCase):
     def test_ast_scaning_class(self):
         astScaner = AstScaning()
         pipeline_file = os.path.join(MODELSCOPE_PATH, 'pipelines', 'nlp',
-                                     'sequence_classification_pipeline.py')
+                                     'text_generation_pipeline.py')
         output = astScaner.generate_ast(pipeline_file)
         self.assertTrue(output['imports'] is not None)
         self.assertTrue(output['from_imports'] is not None)
@@ -40,14 +40,12 @@ class AstScaningTest(unittest.TestCase):
         self.assertIsInstance(imports, dict)
         self.assertIsInstance(from_imports, dict)
         self.assertIsInstance(decorators, list)
-        self.assertListEqual(
-            list(set(imports.keys()) - set(['typing', 'numpy'])), [])
-        self.assertEqual(len(from_imports.keys()), 9)
+        self.assertListEqual(list(set(imports.keys()) - set(['torch'])), [])
+        self.assertEqual(len(from_imports.keys()), 7)
         self.assertTrue(from_imports['modelscope.metainfo'] is not None)
         self.assertEqual(from_imports['modelscope.metainfo'], ['Pipelines'])
-        self.assertEqual(
-            decorators,
-            [('PIPELINES', 'text-classification', 'sentiment-analysis')])
+        self.assertEqual(decorators,
+                         [('PIPELINES', 'text-generation', 'text-generation')])
 
     def test_files_scaning_method(self):
         fileScaner = FilesAstScaning()
diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py
index d934a86c..8b89fa68 100644
--- a/tests/utils/test_config.py
+++ b/tests/utils/test_config.py
@@ -4,6 +4,8 @@ import copy
 import tempfile
 import unittest
 
+import json
+
 from modelscope.utils.config import Config, check_config
 
 obj = {'a': 1, 'b': {'c': [1, 2, 3], 'd': 'dd'}}
@@ -43,7 +45,8 @@ class ConfigTest(unittest.TestCase):
             self.assertEqual(pretty_text, cfg.dump())
             cfg.dump(ofile.name)
             with open(ofile.name, 'r') as infile:
-                self.assertEqual(json_str, infile.read())
+                self.assertDictEqual(
+                    json.loads(json_str), json.loads(infile.read()))
 
         with tempfile.NamedTemporaryFile(suffix='.yaml') as ofile:
             cfg.dump(ofile.name)
diff --git a/tests/utils/test_device.py b/tests/utils/test_device.py
index 3135b214..0d334fda 100644
--- a/tests/utils/test_device.py
+++ b/tests/utils/test_device.py
@@ -50,6 +50,12 @@ class DeviceTest(unittest.TestCase):
         with self.assertRaises(AssertionError):
             verify_device('xgu')
 
+        with self.assertRaises(AssertionError):
+            verify_device('')
+
+        with self.assertRaises(AssertionError):
+            verify_device(None)
+
     def test_create_device_torch(self):
         if torch.cuda.is_available():
             target_device_type = 'cuda'
@@ -81,6 +87,7 @@ class DeviceTest(unittest.TestCase):
         with device_placement(Frameworks.torch, 'cpu'):
             pass
 
+    @unittest.skip('skip this test to avoid debug logging.')
     def test_device_placement_tf_gpu(self):
         tf.debugging.set_log_device_placement(True)
         with device_placement(Frameworks.tf, 'gpu:0'):