diff --git a/.dev_scripts/build_image.sh b/.dev_scripts/build_image.sh
index e6403aed..81bce38b 100644
--- a/.dev_scripts/build_image.sh
+++ b/.dev_scripts/build_image.sh
@@ -96,9 +96,9 @@ else
 fi
 if [[ $python_version == 3.7* ]]; then
     base_tag=$base_tag-py37
-elif [[ $python_version == z* ]]; then
+elif [[ $python_version == 3.8* ]]; then
     base_tag=$base_tag-py38
-elif [[ $python_version == z* ]]; then
+elif [[ $python_version == 3.9* ]]; then
     base_tag=$base_tag-py39
 else
     echo "Unsupport python version: $python_version"
@@ -129,8 +129,15 @@ else
     echo "Building dsw image well need set ModelScope lib cache location."
     docker_file_content="${docker_file_content} \nENV MODELSCOPE_CACHE=/mnt/workspace/.cache/modelscope"
 fi
+if [ "$is_ci_test" == "True" ]; then
+    echo "Building CI image, uninstall modelscope"
+    docker_file_content="${docker_file_content} \nRUN pip uninstall modelscope -y"
+fi
 printf "$docker_file_content" > Dockerfile
-docker build -t $IMAGE_TO_BUILD  \
+
+while true
+do
+  docker build -t $IMAGE_TO_BUILD  \
              --build-arg USE_GPU \
              --build-arg BASE_IMAGE \
              --build-arg PYTHON_VERSION \
@@ -138,11 +145,14 @@ docker build -t $IMAGE_TO_BUILD  \
              --build-arg CUDATOOLKIT_VERSION \
              --build-arg TENSORFLOW_VERSION \
              -f Dockerfile .
+  if [ $? -eq 0 ]; then
+    echo "Image build done"
+    break
+  else
+    echo "Running docker build command error, we will retry"
+  fi
+done
 
-if [ $? -ne 0 ]; then
-  echo "Running docker build command error, please check the log!"
-  exit -1
-fi
 if [ "$run_ci_test" == "True" ]; then
     echo "Running ci case."
     export MODELSCOPE_CACHE=/home/mulin.lyh/model_scope_cache
diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh
index 35b43535..8d1c9a0c 100644
--- a/.dev_scripts/ci_container_test.sh
+++ b/.dev_scripts/ci_container_test.sh
@@ -20,15 +20,15 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
         fi
     fi
 
-    awk -F: '/^[^#]/ { print $1 }' requirements/framework.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-    awk -F: '/^[^#]/ { print $1 }' requirements/audio.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-    awk -F: '/^[^#]/ { print $1 }' requirements/cv.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-    awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-    awk -F: '/^[^#]/ { print $1 }' requirements/nlp.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-    awk -F: '/^[^#]/ { print $1 }' requirements/science.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+    pip install -r  requirements/framework.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+    pip install -r requirements/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+    pip install -r  requirements/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+    pip install -r  requirements/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+    pip install -r  requirements/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+    pip install -r  requirements/science.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
 
     # test with install
-    python setup.py install
+    pip install .
 else
     echo "Running case in release image, run case directly!"
 fi
diff --git a/.gitignore b/.gitignore
index de086eea..cf36a205 100644
--- a/.gitignore
+++ b/.gitignore
@@ -122,6 +122,7 @@ tensorboard.sh
 .DS_Store
 replace.sh
 result.png
+result.jpg
 
 # Pytorch
 *.pth
diff --git a/data/test/audios/speaker1_a_en_16k.wav b/data/test/audios/speaker1_a_en_16k.wav
new file mode 100644
index 00000000..d973e659
--- /dev/null
+++ b/data/test/audios/speaker1_a_en_16k.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb35bff3dac9aec36e259461fecae1e1bc2ec029615f30713111cd598993676c
+size 249646
diff --git a/data/test/audios/speaker1_b_en_16k.wav b/data/test/audios/speaker1_b_en_16k.wav
new file mode 100644
index 00000000..51ff9dc1
--- /dev/null
+++ b/data/test/audios/speaker1_b_en_16k.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d7daff767e13d9a2187b676d958065121cd5e26da046d65cd9604e91a87525a2
+size 201006
diff --git a/data/test/audios/speaker2_a_en_16k.wav b/data/test/audios/speaker2_a_en_16k.wav
new file mode 100644
index 00000000..46ba2ff5
--- /dev/null
+++ b/data/test/audios/speaker2_a_en_16k.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a723c134978a17fe12ca2374d0281a8003a56fa44ff9d2249a08791714983362
+size 249646
diff --git a/data/test/images/GOPR0384_11_00-000001.png b/data/test/images/GOPR0384_11_00-000001.png
new file mode 100644
index 00000000..803b21d9
--- /dev/null
+++ b/data/test/images/GOPR0384_11_00-000001.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f516e38eea7a16fd48fddc34953cb227d86d22fbcd31de0c1334bb14b96dba8
+size 932252
diff --git a/data/test/images/butterfly_lrx2_y.png b/data/test/images/butterfly_lrx2_y.png
new file mode 100644
index 00000000..1598e075
--- /dev/null
+++ b/data/test/images/butterfly_lrx2_y.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:430575a8cb668113d6b0e91e403be0c0e36a95bbb96c484603a625b52f71edd9
+size 11858
diff --git a/data/test/images/content_check.jpg b/data/test/images/content_check.jpg
new file mode 100644
index 00000000..8cae525b
--- /dev/null
+++ b/data/test/images/content_check.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d486900ecca027d70453322d0f22de4b36f9534a324b8b1cda3ea86bb72bac6
+size 353096
diff --git a/data/test/images/face_liveness_xc.png b/data/test/images/face_liveness_xc.png
new file mode 100644
index 00000000..54777f9e
--- /dev/null
+++ b/data/test/images/face_liveness_xc.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0abad2347748bf312ab0dbce48fdc643a703d94970e1b181cf19b9be6312db8c
+size 3145728
diff --git a/data/test/images/face_reconstruction.jpg b/data/test/images/face_reconstruction.jpg
new file mode 100644
index 00000000..0ad6cbca
--- /dev/null
+++ b/data/test/images/face_reconstruction.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3a4f864cee22265fdbb8008719e0e2e36235bd4bb2fdfbc9278b0b964e86eff
+size 1921140
diff --git a/data/test/images/image_debanding.png b/data/test/images/image_debanding.png
new file mode 100644
index 00000000..5b28266f
--- /dev/null
+++ b/data/test/images/image_debanding.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f4bc4dd40c69ecc54bc9517f52fbf3df9a5f682cd9f4d4f3f1376bf33ede22d
+size 2820304
diff --git a/data/test/images/image_driving_perception.jpg b/data/test/images/image_driving_perception.jpg
new file mode 100644
index 00000000..ef016c01
--- /dev/null
+++ b/data/test/images/image_driving_perception.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f6b6b4abfcc2fc9042c4e51c2e5f530ff84b345cd3176b11e8317143c5a7e0f
+size 91130
diff --git a/data/test/images/image_ffhq34_00041527.png b/data/test/images/image_ffhq34_00041527.png
new file mode 100644
index 00000000..3fcc17ba
--- /dev/null
+++ b/data/test/images/image_ffhq34_00041527.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e8a71df766b615e20a5e1cacd47796a5668747e039e7f6f6e1b029b40818cc2
+size 196993
diff --git a/data/test/images/image_inpainting/image_inpainting_1.png b/data/test/images/image_inpainting/image_inpainting_1.png
new file mode 100644
index 00000000..b4c64d61
--- /dev/null
+++ b/data/test/images/image_inpainting/image_inpainting_1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6099183bbc513371c3bded04dbff688958a9c7ab569370c0fb4809fc64850e47
+size 704685
diff --git a/data/test/images/image_inpainting/image_inpainting_mask_1.png b/data/test/images/image_inpainting/image_inpainting_mask_1.png
new file mode 100644
index 00000000..0b9b4b50
--- /dev/null
+++ b/data/test/images/image_inpainting/image_inpainting_mask_1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5bebb94d42fa4b8dd462fecfa7b248402a30cbc637344ce26143071ca2c470d7
+size 1636
diff --git a/data/test/images/image_moire.jpg b/data/test/images/image_moire.jpg
new file mode 100644
index 00000000..acd6ce8a
--- /dev/null
+++ b/data/test/images/image_moire.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:680d76723fc28bc6ce729a1cd6f11a7d5fc26b5bfe3b486d885417935c20f493
+size 869811
diff --git a/data/test/images/image_multiple_human_parsing.jpg b/data/test/images/image_multiple_human_parsing.jpg
new file mode 100644
index 00000000..c95881fe
--- /dev/null
+++ b/data/test/images/image_multiple_human_parsing.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9
+size 87228
diff --git a/data/test/images/image_open_vocabulary_detection.jpg b/data/test/images/image_open_vocabulary_detection.jpg
new file mode 100644
index 00000000..16a45bae
--- /dev/null
+++ b/data/test/images/image_open_vocabulary_detection.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b5861ca8955f8ff906abe78f2b32bc49deee2832f4518ffe4bb584653f3c9e9
+size 187443
diff --git a/data/test/images/image_paint_by_example/image/example_1.png b/data/test/images/image_paint_by_example/image/example_1.png
new file mode 100644
index 00000000..02ced320
--- /dev/null
+++ b/data/test/images/image_paint_by_example/image/example_1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40f535f4411fc9b3ea9d2d8c7a352f6f9a33465e797332bd1a4162b40aaffe5f
+size 338334
diff --git a/data/test/images/image_paint_by_example/image/example_2.png b/data/test/images/image_paint_by_example/image/example_2.png
new file mode 100644
index 00000000..3659223b
--- /dev/null
+++ b/data/test/images/image_paint_by_example/image/example_2.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd3415c9bf1cd099a379f0b3c8049d0f602ec900c9d335b75058355d8db2b077
+size 358916
diff --git a/data/test/images/image_paint_by_example/image/example_3.png b/data/test/images/image_paint_by_example/image/example_3.png
new file mode 100644
index 00000000..62b29109
--- /dev/null
+++ b/data/test/images/image_paint_by_example/image/example_3.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:63c6cd0f0f3b4201a9450dcf3db4b5b4a2b9ad2f48885854868d0c2b6406aac7
+size 471097
diff --git a/data/test/images/image_paint_by_example/mask/example_1.png b/data/test/images/image_paint_by_example/mask/example_1.png
new file mode 100644
index 00000000..8cabdfcc
--- /dev/null
+++ b/data/test/images/image_paint_by_example/mask/example_1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c934ced1221d27153a15c14351c575a91f3ff5a6650c3dc9e0778a4245b2804
+size 1192
diff --git a/data/test/images/image_paint_by_example/mask/example_2.png b/data/test/images/image_paint_by_example/mask/example_2.png
new file mode 100644
index 00000000..0e42dbba
--- /dev/null
+++ b/data/test/images/image_paint_by_example/mask/example_2.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2ab6add1c8a215ca6199baa68d56bca99dbdae7391937493067a6f363b059de
+size 1453
diff --git a/data/test/images/image_paint_by_example/mask/example_3.png b/data/test/images/image_paint_by_example/mask/example_3.png
new file mode 100644
index 00000000..f5523904
--- /dev/null
+++ b/data/test/images/image_paint_by_example/mask/example_3.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d87bd9fa4dca7c7dbb3253e733517303d9b85c9c6600a58c9e9b7150468036da
+size 1410
diff --git a/data/test/images/image_paint_by_example/reference/example_1.jpg b/data/test/images/image_paint_by_example/reference/example_1.jpg
new file mode 100644
index 00000000..035a4e53
--- /dev/null
+++ b/data/test/images/image_paint_by_example/reference/example_1.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b63bc83b6f5dfeb66f3c79db6fa28b0683690b5dad80b414a03ed723b351edc
+size 467695
diff --git a/data/test/images/image_paint_by_example/reference/example_2.jpg b/data/test/images/image_paint_by_example/reference/example_2.jpg
new file mode 100644
index 00000000..105c0f7a
--- /dev/null
+++ b/data/test/images/image_paint_by_example/reference/example_2.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9de64a9f9e1903f2a72bbddccfbffd16f6ea9e7a855e673792d66e7ad74c8ff4
+size 240669
diff --git a/data/test/images/image_paint_by_example/reference/example_3.jpg b/data/test/images/image_paint_by_example/reference/example_3.jpg
new file mode 100644
index 00000000..0d2aec15
--- /dev/null
+++ b/data/test/images/image_paint_by_example/reference/example_3.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5965f3f3293fb7616e439ef4821d586de1f129bcf08279bbd10a5f42463d542f
+size 240953
diff --git a/data/test/images/image_phone.jpg b/data/test/images/image_phone.jpg
new file mode 100644
index 00000000..7c116b69
--- /dev/null
+++ b/data/test/images/image_phone.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10b494cbc1a29b228745bcb26897e2524569b467b88cc9839be38504d268ca30
+size 55485
diff --git a/data/test/images/image_single_human_parsing.jpg b/data/test/images/image_single_human_parsing.jpg
new file mode 100644
index 00000000..981efe4e
--- /dev/null
+++ b/data/test/images/image_single_human_parsing.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a1976ea249b4ad5409cdae403dcd154fac3c628909b6b1874cc968960e2c62d
+size 8259
diff --git a/data/test/images/image_structured_model_probing_test_image.jpg b/data/test/images/image_structured_model_probing_test_image.jpg
new file mode 100644
index 00000000..cd3a6e07
--- /dev/null
+++ b/data/test/images/image_structured_model_probing_test_image.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f832af4703878076e42fb41544b82147fd31b6be06713975872f16294d1a613
+size 28297
diff --git a/data/test/images/image_traffic_sign.jpg b/data/test/images/image_traffic_sign.jpg
new file mode 100644
index 00000000..c0e4276f
--- /dev/null
+++ b/data/test/images/image_traffic_sign.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6ab556a1d69010cfe6dd136ff3fbd17ed122c6d0c3509667ef40a656bc18464
+size 87334
diff --git a/data/test/images/images.zip b/data/test/images/images.zip
new file mode 100644
index 00000000..9d796229
Binary files /dev/null and b/data/test/images/images.zip differ
diff --git a/data/test/images/ir_face_recognition_1.png b/data/test/images/ir_face_recognition_1.png
new file mode 100644
index 00000000..0b577e7b
--- /dev/null
+++ b/data/test/images/ir_face_recognition_1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:602b46c6ba1d18fd3b91fd3b47112d37ca9d8e1ed72f0c0ea93ad8d493f5182e
+size 20299
diff --git a/data/test/images/ir_face_recognition_2.png b/data/test/images/ir_face_recognition_2.png
new file mode 100644
index 00000000..b9204873
--- /dev/null
+++ b/data/test/images/ir_face_recognition_2.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0791f043b905f2e77ccf2f8c5b29182e1fc99cee16d9069e8bbc1704e917268
+size 20631
diff --git a/data/test/images/universal_matting.jpg b/data/test/images/universal_matting.jpg
new file mode 100644
index 00000000..d824eb21
--- /dev/null
+++ b/data/test/images/universal_matting.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78d7bf999d1a4186309693ff1b966edb3ccd40f7861a7589167cf9e33897a693
+size 369725
diff --git a/data/test/images/vision_efficient_tuning_test_1.png b/data/test/images/vision_efficient_tuning_test_1.png
new file mode 100644
index 00000000..fbadaa63
--- /dev/null
+++ b/data/test/images/vision_efficient_tuning_test_1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b28d9c33eff034a706534f195f4443f8c053a74d5553787a5cb9b20873c072f
+size 1962
diff --git a/data/test/images/vision_efficient_tuning_test_2.png b/data/test/images/vision_efficient_tuning_test_2.png
new file mode 100644
index 00000000..dff7620b
--- /dev/null
+++ b/data/test/images/vision_efficient_tuning_test_2.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bbd99f0253d6e0d10ec500cf781cc83b93809db58da54bd914b0b80b7fe8d8a4
+size 2409
diff --git a/data/test/videos/kitti-step_testing_image_02_0000.mp4 b/data/test/videos/kitti-step_testing_image_02_0000.mp4
new file mode 100644
index 00000000..36c557cf
--- /dev/null
+++ b/data/test/videos/kitti-step_testing_image_02_0000.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a834d1272253559cdf45a5f09642fb0b5209242dca854fce849efc15cebd4028
+size 4623264
diff --git a/data/test/videos/video_deinterlace_test.mp4 b/data/test/videos/video_deinterlace_test.mp4
new file mode 100644
index 00000000..45933740
--- /dev/null
+++ b/data/test/videos/video_deinterlace_test.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9941ac4a5dd0d9eea5d33ce0009da34d0c93c64ed062479e6c8efb4788e8ef7c
+size 522972
diff --git a/data/test/videos/video_nerf_recon_test.mp4 b/data/test/videos/video_nerf_recon_test.mp4
new file mode 100644
index 00000000..9e388ca2
--- /dev/null
+++ b/data/test/videos/video_nerf_recon_test.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:824cc8beaaa8747a3ec32f4c79308e468838c448853f40e882a7cc090c71bf96
+size 2151630
diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu
index aa28d26b..27c0f1f3 100644
--- a/docker/Dockerfile.ubuntu
+++ b/docker/Dockerfile.ubuntu
@@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --reinstall ca-certificates && \
     apt-get clean && \
     cp /tmp/resources/ubuntu20.04_sources.tuna /etc/apt/sources.list && \
     apt-get update && \
-    apt-get install -y locales wget git strace gdb  vim ffmpeg libsm6 tzdata language-pack-zh-hans ttf-wqy-microhei ttf-wqy-zenhei xfonts-wqy libxext6 build-essential ninja-build && \
+    apt-get install -y locales wget git strace gdb sox libopenmpi-dev curl strace vim ffmpeg libsm6 tzdata language-pack-zh-hans ttf-wqy-microhei ttf-wqy-zenhei xfonts-wqy libxext6 build-essential ninja-build && \
     wget https://packagecloud.io/github/git-lfs/packages/debian/bullseye/git-lfs_3.2.0_amd64.deb/download -O ./git-lfs_3.2.0_amd64.deb && \
     dpkg -i ./git-lfs_3.2.0_amd64.deb && \
     rm -f ./git-lfs_3.2.0_amd64.deb && \
@@ -58,12 +58,46 @@ RUN if [ "$USE_GPU" = "True" ] ; then \
         pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION; \
     fi
 
+# mmcv-full<=1.7.0 for mmdet3d compatible
 RUN if [ "$USE_GPU" = "True" ] ; then \
-        CUDA_HOME=/usr/local/cuda TORCH_CUDA_ARCH_LIST="5.0 5.2 6.0 6.1 7.0 7.5 8.0 8.6" MMCV_WITH_OPS=1 MAX_JOBS=8 FORCE_CUDA=1 pip install --no-cache-dir mmcv-full && pip cache purge; \
+        CUDA_HOME=/usr/local/cuda TORCH_CUDA_ARCH_LIST="5.0 5.2 6.0 6.1 7.0 7.5 8.0 8.6" MMCV_WITH_OPS=1 MAX_JOBS=8 FORCE_CUDA=1 pip install --no-cache-dir 'mmcv-full<=1.7.0' && pip cache purge; \
     else \
-        MMCV_WITH_OPS=1 MAX_JOBS=8 pip install --no-cache-dir mmcv-full && pip cache purge; \
+        MMCV_WITH_OPS=1 MAX_JOBS=8 pip install --no-cache-dir 'mmcv-full<=1.7.0' && pip cache purge; \
     fi
 
+# default shell bash
+ENV SHELL=/bin/bash
+# install special package
+RUN if [ "$USE_GPU" = "True" ] ; then \
+        pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \
+    else \
+        pip install --no-cache-dir dgl dglgo -f https://data.dgl.ai/wheels/repo.html; \
+    fi
+
+# copy install scripts
+COPY docker/scripts/install_unifold.sh docker/scripts/install_colmap.sh docker/scripts/install_pytorch3d_nvdiffrast.sh docker/scripts/install_tiny_cuda_nn.sh docker/scripts/install_apex.sh /tmp/
+
+# for uniford
+RUN if [ "$USE_GPU" = "True" ] ; then \
+        bash /tmp/install_unifold.sh; \
+    else \
+     echo 'cpu unsupport uniford'; \
+    fi
+
+RUN if [ "$USE_GPU" = "True" ] ; then \
+        pip install --no-cache-dir git+https://github.com/gxd1994/Pointnet2.PyTorch.git@master#subdirectory=pointnet2; \
+    else \
+     echo 'cpu unsupport Pointnet2'; \
+    fi
+
+RUN pip install --no-cache-dir detectron2==0.3 -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+
+# 3d supports
+RUN bash /tmp/install_colmap.sh
+RUN bash /tmp/install_tiny_cuda_nn.sh
+RUN bash /tmp/install_pytorch3d_nvdiffrast.sh
+# end of 3D
+
 # install modelscope
 COPY requirements /var/modelscope
 RUN pip install --no-cache-dir --upgrade pip && \
@@ -76,42 +110,17 @@ RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir -r /var/modelscope/tests.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip cache purge
 
-# default shell bash
-ENV SHELL=/bin/bash
-
-# install special package
-RUN if [ "$USE_GPU" = "True" ] ; then \
-        pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \
-    else \
-        pip install --no-cache-dir dgl dglgo -f https://data.dgl.ai/wheels/repo.html; \
-    fi
-
 # install  jupyter plugin
 RUN mkdir -p /root/.local/share/jupyter/labextensions/ && \
     cp -r  /tmp/resources/jupyter_plugins/*  /root/.local/share/jupyter/labextensions/
 
 COPY docker/scripts/modelscope_env_init.sh /usr/local/bin/ms_env_init.sh
-RUN pip install --no-cache-dir https://modelscope.oss-cn-beijing.aliyuncs.com/releases/dependencies/xtcocotools-1.12-cp37-cp37m-linux_x86_64.whl --force
+RUN pip install --no-cache-dir xtcocotools==1.12  detectron2==0.3 -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html --force
 
-# for uniford
-COPY docker/scripts/install_unifold.sh /tmp/install_unifold.sh
-RUN if [ "$USE_GPU" = "True" ] ; then \
-        bash /tmp/install_unifold.sh; \
-    else \
-     echo 'cpu unsupport uniford'; \
-    fi
-
-RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 numpy==1.18.5 https://pypi.tuna.tsinghua.edu.cn/packages/70/ad/06f8a06cef819606cb1a521bcc144288daee5c7e73c5d722492866cb1b92/wenetruntime-1.11.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl ipykernel fairseq fasttext deepspeed
-COPY docker/scripts/install_apex.sh /tmp/install_apex.sh
+# speechbrain==0.5.7 for audio compatible
+RUN pip install --no-cache-dir speechbrain==0.5.7 adaseq>=0.5.0 mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 numpy==1.18.5 wenetruntime==1.11.0 ipykernel fairseq fasttext deepspeed
 RUN if [ "$USE_GPU" = "True" ] ; then \
         bash /tmp/install_apex.sh; \
     else \
      echo 'cpu unsupport apex'; \
     fi
-RUN  apt-get update && apt-get install -y sox && \
-    apt-get clean
-RUN if [ "$USE_GPU" = "True" ] ; then \
-        pip install --no-cache-dir git+https://github.com/gxd1994/Pointnet2.PyTorch.git@master#subdirectory=pointnet2; \
-    else \
-     echo 'cpu unsupport Pointnet2'; \
-    fi
diff --git a/docker/scripts/install_apex.sh b/docker/scripts/install_apex.sh
index f78e849e..47f34da7 100644
--- a/docker/scripts/install_apex.sh
+++ b/docker/scripts/install_apex.sh
@@ -1,6 +1,6 @@
-export MAX_JOBS=16
-git clone https://github.com/NVIDIA/apex
-cd apex
-TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0;8.6" pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
-cd ..
-rm -rf apex
+export MAX_JOBS=16 \
+&& git clone https://github.com/NVIDIA/apex \
+&& cd apex \
+&& TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0;8.6" pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
+&& cd .. \
+&& rm -rf apex
diff --git a/docker/scripts/install_colmap.sh b/docker/scripts/install_colmap.sh
new file mode 100644
index 00000000..296b40c5
--- /dev/null
+++ b/docker/scripts/install_colmap.sh
@@ -0,0 +1,24 @@
+wget -q https://cmake.org/files/v3.25/cmake-3.25.2-linux-x86_64.sh \
+    && mkdir /opt/cmake \
+    && sh cmake-3.25.2-linux-x86_64.sh --prefix=/opt/cmake --skip-license \
+    && ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake \
+    && rm -f cmake-3.25.2-linux-x86_64.sh \
+    && apt-get update \
+    && apt-get install libboost-program-options-dev libboost-filesystem-dev libboost-graph-dev libboost-system-dev libboost-test-dev libeigen3-dev libflann-dev libsuitesparse-dev libfreeimage-dev libmetis-dev libgoogle-glog-dev libgflags-dev libsqlite3-dev libglew-dev qtbase5-dev libqt5opengl5-dev  libcgal-dev libceres-dev -y \
+    && export CMAKE_BUILD_PARALLEL_LEVEL=36 \
+    && export MAX_JOBS=16 \
+    && export COLMAP_VERSION=dev \
+    && export CUDA_ARCHITECTURES="all" \
+    && git clone https://github.com/colmap/colmap.git \
+    && cd colmap \
+    && git reset --hard ${COLMAP_VERSION} \
+    && mkdir build \
+    && cd build \
+    && cmake .. -GNinja -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHITECTURES} \
+    && ninja \
+    && ninja install \
+    && cd ../.. \
+    && rm -rf colmap \
+    && apt-get clean  \
+    && strip --remove-section=.note.ABI-tag /usr/lib/x86_64-linux-gnu/libQt5Core.so.5 \
+    && rm -rf /var/lib/apt/lists/*
diff --git a/docker/scripts/install_libs.sh b/docker/scripts/install_libs.sh
deleted file mode 100644
index dea0dc19..00000000
--- a/docker/scripts/install_libs.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-set -eo pipefail
-
-ModelScopeLib=/usr/local/modelscope/lib64
-
-if [ ! -d /usr/local/modelscope ]; then
-    mkdir -p $ModelScopeLib
-fi
-
-# audio libs
-wget "http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/release/maas/libs/audio/libmitaec_pyio.so" -O ${ModelScopeLib}/libmitaec_pyio.so
diff --git a/docker/scripts/install_pytorch3d_nvdiffrast.sh b/docker/scripts/install_pytorch3d_nvdiffrast.sh
new file mode 100644
index 00000000..67f552e2
--- /dev/null
+++ b/docker/scripts/install_pytorch3d_nvdiffrast.sh
@@ -0,0 +1,14 @@
+export CMAKE_BUILD_PARALLEL_LEVEL=36 && export MAX_JOBS=36 && export CMAKE_CUDA_ARCHITECTURES="50;52;60;61;70;75;80;86" \
+	&& pip install --no-cache-dir fvcore iopath \
+	&& curl -LO https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz \
+        && tar xzf 1.10.0.tar.gz \
+        && export CUB_HOME=$PWD/cub-1.10.0 \
+        && pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable" \
+	&& rm -fr 1.10.0.tar.gz cub-1.10.0 \
+        && apt-get update \
+	&& apt-get install -y --no-install-recommends pkg-config libglvnd0 libgl1 libglx0 libegl1  libgles2 libglvnd-dev libgl1-mesa-dev libegl1-mesa-dev  libgles2-mesa-dev -y \
+        && git clone https://github.com/NVlabs/nvdiffrast.git \
+	&& cd nvdiffrast \
+        && pip install --no-cache-dir . \
+        && cd .. \
+        && rm -rf nvdiffrast
diff --git a/docker/scripts/install_tiny_cuda_nn.sh b/docker/scripts/install_tiny_cuda_nn.sh
new file mode 100644
index 00000000..96ae5c72
--- /dev/null
+++ b/docker/scripts/install_tiny_cuda_nn.sh
@@ -0,0 +1,8 @@
+export CMAKE_BUILD_PARALLEL_LEVEL=36 && export MAX_JOBS=36 && export TCNN_CUDA_ARCHITECTURES="50;52;60;61;70;75;80;86" \
+        && git clone --recursive https://github.com/nvlabs/tiny-cuda-nn \
+        && cd tiny-cuda-nn \
+        && git checkout v1.6 \
+        && cd bindings/torch \
+        && python setup.py install \
+        && cd ../../.. \
+        && rm -rf tiny-cuda-nn
diff --git a/docs/README.md b/docs/README.md
index a051c6be..8ccd292e 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -27,9 +27,9 @@
                 Currently supported formats include "json", "yaml/yml".
 
         Examples:
-            >>> load('/path/of/your/file')  # file is storaged in disk
-            >>> load('https://path/of/your/file')  # file is storaged in Internet
-            >>> load('oss://path/of/your/file')  # file is storaged in petrel
+            >>> load('/path/of/your/file')  # file is stored in disk
+            >>> load('https://path/of/your/file')  # file is stored on internet
+            >>> load('oss://path/of/your/file')  # file is stored in petrel
 
         Returns:
             The content from the file.
diff --git a/docs/source/_templates/classtemplate.rst b/docs/source/_templates/classtemplate.rst
index c547bf79..d3ea0e59 100644
--- a/docs/source/_templates/classtemplate.rst
+++ b/docs/source/_templates/classtemplate.rst
@@ -5,7 +5,7 @@
 
 .. autoclass:: {{ name }}
     :members:
-
+    :special-members: __init__, __call__
 
 ..
   autogenerated from source/_templates/classtemplate.rst
diff --git a/docs/source/api/modelscope.models.cv.rst b/docs/source/api/modelscope.models.cv.rst
index c4704112..ac52fef1 100644
--- a/docs/source/api/modelscope.models.cv.rst
+++ b/docs/source/api/modelscope.models.cv.rst
@@ -12,3 +12,16 @@ modelscope.models.cv
     :template: classtemplate.rst
 
     easycv_base.EasyCVBaseModel
+    image_colorization.ddcolor.ddcolor_for_image_colorization.DDColorForImageColorization
+    image_deblur.nafnet_for_image_deblur.NAFNetForImageDeblur
+    image_defrcn_fewshot.defrcn_for_fewshot.DeFRCNForFewShot
+    image_denoise.nafnet_for_image_denoise.NAFNetForImageDenoise
+    image_face_fusion.image_face_fusion.ImageFaceFusion
+    image_matching.quadtree_attention_model.QuadTreeAttentionForImageMatching
+    image_skychange.skychange_model.ImageSkychange
+    language_guided_video_summarization.summarizer.ClipItVideoSummarization
+    panorama_depth_estimation.unifuse_model.PanoramaDepthEstimation
+    video_stabilization.DUTRAFTStabilizer.DUTRAFTStabilizer
+    video_summarization.summarizer.PGLVideoSummarization
+    video_super_resolution.real_basicvsr_for_video_super_resolution.RealBasicVSRNetForVideoSR
+    vision_middleware.model.VisionMiddlewareModel
diff --git a/docs/source/api/modelscope.models.multi_modal.rst b/docs/source/api/modelscope.models.multi_modal.rst
new file mode 100644
index 00000000..c03c078d
--- /dev/null
+++ b/docs/source/api/modelscope.models.multi_modal.rst
@@ -0,0 +1,24 @@
+modelscope.models.multi_modal
+====================
+
+.. automodule:: modelscope.models.multi_modal
+
+.. currentmodule:: modelscope.models.multi_modal
+
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: classtemplate.rst
+
+    clip.CLIPForMultiModalEmbedding
+    diffusion.DiffusionForTextToImageSynthesis
+    gemm.GEMMForMultiModalEmbedding
+    team.TEAMForMultiModalSimilarity
+    mmr.VideoCLIPForMultiModalEmbedding
+    mplug_for_all_tasks.MPlugForAllTasks
+    mplug_for_all_tasks.HiTeAForAllTasks
+    ofa_for_all_tasks.OfaForAllTasks
+    ofa_for_text_to_image_synthesis_model.OfaForTextToImageSynthesis
+    multi_stage_diffusion.MultiStageDiffusionForTextToImageSynthesis
+    vldoc.VLDocForDocVLEmbedding
diff --git a/docs/source/api/modelscope.models.nlp.rst b/docs/source/api/modelscope.models.nlp.rst
new file mode 100644
index 00000000..c3bf19f6
--- /dev/null
+++ b/docs/source/api/modelscope.models.nlp.rst
@@ -0,0 +1,60 @@
+modelscope.models.nlp
+====================
+
+.. automodule:: modelscope.models.nlp
+
+.. currentmodule:: modelscope.models.nlp
+
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: classtemplate.rst
+
+    bart.BartForTextErrorCorrection
+    bert.BertConfig
+    bert.BertModel
+    bert.BertForMaskedLM
+    bert.BertForTextRanking
+    bert.BertForSentenceEmbedding
+    bert.BertForSequenceClassification
+    bert.BertForTokenClassification
+    bert.BertForDocumentSegmentation
+    csanmt.CsanmtForTranslation
+    deberta_v2.DebertaV2Model
+    deberta_v2.DebertaV2ForMaskedLM
+    gpt_neo.GPTNeoModel
+    gpt2.GPT2Model
+    gpt3.GPT3ForTextGeneration
+    gpt3.DistributedGPT3
+    gpt_moe.GPTMoEForTextGeneration
+    gpt_moe.DistributedGPTMoE
+    megatron_bert.MegatronBertConfig
+    megatron_bert.MegatronBertModel
+    megatron_bert.MegatronBertForMaskedLM
+    palm_v2.PalmForTextGeneration
+    ponet.PoNetConfig
+    ponet.PoNetModel
+    ponet.PoNetForMaskedLM
+    space.SpaceForDialogIntent
+    space.SpaceForDialogModeling
+    space.SpaceForDST
+    space_T_cn.TableQuestionAnswering
+    space_T_en.StarForTextToSql
+    structbert.SbertModel
+    structbert.SbertForMaskedLM
+    structbert.SbertForSequenceClassification
+    structbert.SbertForTokenClassification
+    structbert.SbertForFaqQuestionAnswering
+    T5.T5ForConditionalGeneration
+    mglm.MGLMForTextSummarization
+    codegeex.CodeGeeXForCodeTranslation
+    codegeex.CodeGeeXForCodeGeneration
+    veco.VecoConfig
+    veco.VecoModel
+    veco.VecoForMaskedLM
+    veco.VecoForSequenceClassification
+    veco.VecoForTokenClassification
+    bloom.BloomModel
+    unite.UniTEModel
+    use.UserSatisfactionEstimation
diff --git a/docs/source/api/modelscope.models.rst b/docs/source/api/modelscope.models.rst
index 01bbae3b..cd0ec98c 100644
--- a/docs/source/api/modelscope.models.rst
+++ b/docs/source/api/modelscope.models.rst
@@ -12,3 +12,5 @@ modelscope.models
    bases <modelscope.models.base>
    builders <modelscope.models.builder>
    cv <modelscope.models.cv>
+   nlp <modelscope.models.nlp>
+   multi-modal <modelscope.models.multi_modal>
diff --git a/docs/source/api/modelscope.pipelines.audio.rst b/docs/source/api/modelscope.pipelines.audio.rst
new file mode 100644
index 00000000..71d7d13b
--- /dev/null
+++ b/docs/source/api/modelscope.pipelines.audio.rst
@@ -0,0 +1,20 @@
+modelscope.pipelines.audio
+=======================
+
+.. automodule:: modelscope.pipelines.audio
+
+.. currentmodule:: modelscope.pipelines.audio
+
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: classtemplate.rst
+
+    ANSPipeline
+    AutomaticSpeechRecognitionPipeline
+    InverseTextProcessingPipeline
+    KWSFarfieldPipeline
+    KeyWordSpottingKwsbpPipeline
+    LinearAECPipeline
+    TextToSpeechSambertHifiganPipeline
diff --git a/docs/source/api/modelscope.pipelines.cv.rst b/docs/source/api/modelscope.pipelines.cv.rst
index c695ea69..b2190ef1 100644
--- a/docs/source/api/modelscope.pipelines.cv.rst
+++ b/docs/source/api/modelscope.pipelines.cv.rst
@@ -11,4 +11,84 @@ modelscope.pipelines.cv
     :nosignatures:
     :template: classtemplate.rst
 
+    ActionDetectionPipeline
     ActionRecognitionPipeline
+    AnimalRecognitionPipeline
+    ArcFaceRecognitionPipeline
+    Body2DKeypointsPipeline
+    CardDetectionPipeline
+    CMDSSLVideoEmbeddingPipeline
+    CrowdCountingPipeline
+    DDColorImageColorizationPipeline
+    EasyCVDetectionPipeline
+    EasyCVSegmentationPipeline
+    Face2DKeypointsPipeline
+    FaceAttributeRecognitionPipeline
+    FaceDetectionPipeline
+    FaceImageGenerationPipeline
+    FaceLivenessIrPipeline
+    FaceProcessingBasePipeline
+    FaceRecognitionOnnxFmPipeline
+    FaceRecognitionOodPipeline
+    FaceRecognitionPipeline
+    FacialExpressionRecognitionPipeline
+    FacialLandmarkConfidencePipeline
+    GeneralImageClassificationPipeline
+    GeneralRecognitionPipeline
+    HICOSSLVideoEmbeddingPipeline
+    Hand2DKeypointsPipeline
+    HandStaticPipeline
+    HumanWholebodyKeypointsPipeline
+    Image2ImageGenerationPipeline
+    Image2ImageTranslationPipeline
+    ImageCartoonPipeline
+    ImageClassificationPipeline
+    ImageColorEnhancePipeline
+    ImageColorizationPipeline
+    ImageDeblurPipeline
+    ImageDefrcnDetectionPipeline
+    ImageDenoisePipeline
+    ImageDetectionPipeline
+    ImageInpaintingPipeline
+    ImageInstanceSegmentationPipeline
+    ImageMatchingPipeline
+    ImageMattingPipeline
+    ImageMultiViewDepthEstimationPipeline
+    ImagePanopticSegmentationEasyCVPipeline
+    ImagePanopticSegmentationPipeline
+    ImagePortraitEnhancementPipeline
+    ImageReidPersonPipeline
+    ImageSalientDetectionPipeline
+    ImageSemanticSegmentationPipeline
+    ImageSkychangePipeline
+    ImageStyleTransferPipeline
+    ImageSuperResolutionPipeline
+    LanguageGuidedVideoSummarizationPipeline
+    LicensePlateDetectionPipeline
+    LiveCategoryPipeline
+    MaskDINOInstanceSegmentationPipeline
+    MaskFaceRecognitionPipeline
+    MogFaceDetectionPipeline
+    MovieSceneSegmentationPipeline
+    MtcnnFaceDetectionPipeline
+    OCRDetectionPipeline
+    OCRRecognitionPipeline
+    PointCloudSceneFlowEstimationPipeline
+    ProductRetrievalEmbeddingPipeline
+    RealtimeObjectDetectionPipeline
+    ReferringVideoObjectSegmentationPipeline
+    RetinaFaceDetectionPipeline
+    ShopSegmentationPipeline
+    SkinRetouchingPipeline
+    TableRecognitionPipeline
+    TextDrivenSegmentationPipeline
+    TinynasClassificationPipeline
+    UlfdFaceDetectionPipeline
+    VideoCategoryPipeline
+    VideoFrameInterpolationPipeline
+    VideoObjectSegmentationPipeline
+    VideoStabilizationPipeline
+    VideoSuperResolutionPipeline
+    VirtualTryonPipeline
+    VisionMiddlewarePipeline
+    VopRetrievalPipeline
diff --git a/docs/source/api/modelscope.pipelines.multi_modal.rst b/docs/source/api/modelscope.pipelines.multi_modal.rst
new file mode 100644
index 00000000..b62878e3
--- /dev/null
+++ b/docs/source/api/modelscope.pipelines.multi_modal.rst
@@ -0,0 +1,28 @@
+modelscope.pipelines.multi_modal
+=======================
+
+.. automodule:: modelscope.pipelines.multi_modal
+
+.. currentmodule:: modelscope.pipelines.multi_modal
+
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: classtemplate.rst
+
+    AutomaticSpeechRecognitionPipeline
+    ChineseStableDiffusionPipeline
+    DocumentVLEmbeddingPipeline
+    GEMMMultiModalEmbeddingPipeline
+    ImageCaptioningPipeline
+    MGeoRankingPipeline
+    MultiModalEmbeddingPipeline
+    StableDiffusionWrapperPipeline
+    TextToImageSynthesisPipeline
+    VideoCaptioningPipeline
+    VideoMultiModalEmbeddingPipeline
+    VideoQuestionAnsweringPipeline
+    VisualEntailmentPipeline
+    VisualGroundingPipeline
+    VisualQuestionAnsweringPipeline
diff --git a/docs/source/api/modelscope.pipelines.nlp.rst b/docs/source/api/modelscope.pipelines.nlp.rst
new file mode 100644
index 00000000..ef783db1
--- /dev/null
+++ b/docs/source/api/modelscope.pipelines.nlp.rst
@@ -0,0 +1,45 @@
+modelscope.pipelines.nlp
+=======================
+
+.. automodule:: modelscope.pipelines.nlp
+
+.. currentmodule:: modelscope.pipelines.nlp
+
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: classtemplate.rst
+
+    AutomaticPostEditingPipeline
+    CodeGeeXCodeGenerationPipeline
+    CodeGeeXCodeTranslationPipeline
+    ConversationalTextToSqlPipeline
+    DialogIntentPredictionPipeline
+    DialogModelingPipeline
+    DialogStateTrackingPipeline
+    DocumentSegmentationPipeline
+    ExtractiveSummarizationPipeline
+    FaqQuestionAnsweringPipeline
+    FasttextSequenceClassificationPipeline
+    FeatureExtractionPipeline
+    FillMaskPipeline
+    InformationExtractionPipeline
+    MGLMTextSummarizationPipeline
+    NamedEntityRecognitionPipeline
+    SentenceEmbeddingPipeline
+    SummarizationPipeline
+    TableQuestionAnsweringPipeline
+    TextClassificationPipeline
+    TextErrorCorrectionPipeline
+    TextGenerationPipeline
+    TextGenerationT5Pipeline
+    TextRankingPipeline
+    TokenClassificationPipeline
+    TranslationEvaluationPipeline
+    TranslationPipeline
+    TranslationQualityEstimationPipeline
+    UserSatisfactionEstimationPipeline
+    WordSegmentationPipeline
+    WordSegmentationThaiPipeline
+    ZeroShotClassificationPipeline
diff --git a/docs/source/api/modelscope.pipelines.rst b/docs/source/api/modelscope.pipelines.rst
index f0cb433d..b1fbd038 100644
--- a/docs/source/api/modelscope.pipelines.rst
+++ b/docs/source/api/modelscope.pipelines.rst
@@ -12,3 +12,7 @@ modelscope.pipelines
    base     <modelscope.pipelines.base>
    builder <modelscope.pipelines.builder>
    cv    <modelscope.pipelines.cv>
+   nlp    <modelscope.pipelines.nlp>
+   multi-modal    <modelscope.pipelines.multi-modal>
+   audio    <modelscope.pipelines.audio>
+   science <modelscope.pipelines.science>
diff --git a/docs/source/api/modelscope.pipelines.science.rst b/docs/source/api/modelscope.pipelines.science.rst
new file mode 100644
index 00000000..eabb12b6
--- /dev/null
+++ b/docs/source/api/modelscope.pipelines.science.rst
@@ -0,0 +1,14 @@
+modelscope.pipelines.science
+=======================
+
+.. automodule:: modelscope.pipelines.science
+
+.. currentmodule:: modelscope.pipelines.science
+
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: classtemplate.rst
+
+    ProteinStructurePipeline
diff --git a/docs/source/api/modelscope.preprocessors.nlp.rst b/docs/source/api/modelscope.preprocessors.nlp.rst
new file mode 100644
index 00000000..2fcee124
--- /dev/null
+++ b/docs/source/api/modelscope.preprocessors.nlp.rst
@@ -0,0 +1,44 @@
+modelscope.preprocessors.nlp
+====================
+
+.. automodule:: modelscope.preprocessors.nlp
+
+.. currentmodule:: modelscope.preprocessors.nlp
+
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: classtemplate.rst
+
+    TextErrorCorrectionPreprocessor
+    TextGenerationJiebaPreprocessor
+    DocumentSegmentationTransformersPreprocessor
+    FaqQuestionAnsweringTransformersPreprocessor
+    FillMaskPoNetPreprocessor
+    FillMaskTransformersPreprocessor
+    TextRankingTransformersPreprocessor
+    RelationExtractionTransformersPreprocessor
+    TextClassificationTransformersPreprocessor
+    SentenceEmbeddingTransformersPreprocessor
+    TextGenerationTransformersPreprocessor
+    TextGenerationT5Preprocessor
+    TextGenerationSentencePiecePreprocessor
+    SentencePiecePreprocessor
+    TokenClassificationTransformersPreprocessor
+    WordSegmentationBlankSetToLabelPreprocessor
+    WordSegmentationPreprocessorThai
+    NERPreprocessorThai
+    NERPreprocessorViet
+    ZeroShotClassificationTransformersPreprocessor
+    DialogIntentPredictionPreprocessor
+    DialogModelingPreprocessor
+    DialogStateTrackingPreprocessor
+    InputFeatures
+    MultiWOZBPETextField
+    IntentBPETextField
+    ConversationalTextToSqlPreprocessor
+    TableQuestionAnsweringPreprocessor
+    MGLMSummarizationPreprocessor
+    TranslationEvaluationPreprocessor
+    DialogueClassificationUsePreprocessor
diff --git a/docs/source/api/modelscope.preprocessors.rst b/docs/source/api/modelscope.preprocessors.rst
index ae5cf0ce..46ca6bee 100644
--- a/docs/source/api/modelscope.preprocessors.rst
+++ b/docs/source/api/modelscope.preprocessors.rst
@@ -12,3 +12,4 @@ modelscope.preprocessors
    base     <modelscope.preprocessors.base>
    builders <modelscope.preprocessors.builder>
    video    <modelscope.preprocessors.video>
+   nlp      <modelscope.preprocessors.nlp>
diff --git a/docs/source/api/modelscope.trainers.hooks.rst b/docs/source/api/modelscope.trainers.hooks.rst
new file mode 100644
index 00000000..5fd90338
--- /dev/null
+++ b/docs/source/api/modelscope.trainers.hooks.rst
@@ -0,0 +1,29 @@
+modelscope.trainers.hooks
+=======================
+
+.. automodule:: modelscope.trainers.
+
+.. currentmodule:: modelscope.trainers.hooks
+
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: classtemplate.rst
+
+    builder.build_hook
+    hook.Hook
+    priority.Priority
+    checkpoint_hook.CheckpointHook
+    checkpoint_hook.BestCkptSaverHook
+    compression.SparsityHook
+    evaluation_hook.EvaluationHook
+    iter_timer_hook.IterTimerHook
+    logger.TensorboardHook
+    logger.TextLoggerHook
+    lr_scheduler_hook.LrSchedulerHook
+    lr_scheduler_hook.NoneLrSchedulerHook
+    optimizer.OptimizerHook
+    optimizer.NoneOptimizerHook
+    optimizer.ApexAMPOptimizerHook
+    optimizer.TorchAMPOptimizerHook
diff --git a/docs/source/api/modelscope.trainers.multi_modal.rst b/docs/source/api/modelscope.trainers.multi_modal.rst
new file mode 100644
index 00000000..57e354b0
--- /dev/null
+++ b/docs/source/api/modelscope.trainers.multi_modal.rst
@@ -0,0 +1,18 @@
+modelscope.trainers.multi_modal
+=======================
+
+.. automodule:: modelscope.trainers.multi_modal
+
+.. currentmodule:: modelscope.trainers.multi_modal
+
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: classtemplate.rst
+
+    clip.CLIPTrainer
+    team.TEAMImgClsTrainer
+    ofa.OFATrainer
+    mplug.MPlugTrainer
+    mgeo_ranking_trainer.MGeoRankingTrainer
diff --git a/docs/source/api/modelscope.trainers.nlp.rst b/docs/source/api/modelscope.trainers.nlp.rst
new file mode 100644
index 00000000..8120c48b
--- /dev/null
+++ b/docs/source/api/modelscope.trainers.nlp.rst
@@ -0,0 +1,17 @@
+modelscope.trainers.nlp
+=======================
+
+.. automodule:: modelscope.trainers.nlp
+
+.. currentmodule:: modelscope.trainers.nlp
+
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: classtemplate.rst
+
+    sequence_classification_trainer.SequenceClassificationTrainer
+    csanmt_translation_trainer.CsanmtTranslationTrainer
+    text_ranking_trainer.TextRankingTrainer
+    text_generation_trainer.TextGenerationTrainer
diff --git a/docs/source/api/modelscope.trainers.rst b/docs/source/api/modelscope.trainers.rst
index 32f11c6c..926764c3 100644
--- a/docs/source/api/modelscope.trainers.rst
+++ b/docs/source/api/modelscope.trainers.rst
@@ -12,4 +12,7 @@ modelscope.trainers
    base     <modelscope.trainers.base>
    builder <modelscope.trainers.builder>
    EpochBasedTrainer <modelscope.trainers.trainer>
+   Hooks <modelscope.trainers.hooks>
    cv    <modelscope.trainers.cv>
+   nlp    <modelscope.trainers.nlp>
+   multi-modal <modelscope.trainers.multi_modal>
diff --git a/docs/source/conf.py b/docs/source/conf.py
index eb9e9955..3e4e2044 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -13,7 +13,7 @@
 import os
 import sys
 
-import sphinx_book_theme
+# import sphinx_book_theme
 
 sys.path.insert(0, os.path.abspath('../../'))
 # -- Project information -----------------------------------------------------
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 065ea469..c711c87d 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -22,11 +22,6 @@ ModelScope DOCUMENTATION
    Trainer <api/modelscope.trainers>
    MsDataset <api/modelscope.msdatasets>
 
-.. toctree::
-   :maxdepth: 2
-   :caption: Changelog
-
-   change_log.md
 
 Indices and tables
 ==================
diff --git a/examples/pytorch/finetune_image_classification.py b/examples/pytorch/finetune_image_classification.py
index b5c2f651..4e96c2cd 100644
--- a/examples/pytorch/finetune_image_classification.py
+++ b/examples/pytorch/finetune_image_classification.py
@@ -1,51 +1,43 @@
 import os
+from dataclasses import dataclass, field
 
 from modelscope.metainfo import Trainers
 from modelscope.msdatasets.ms_dataset import MsDataset
 from modelscope.trainers.builder import build_trainer
-from modelscope.trainers.training_args import (ArgAttr, CliArgumentParser,
-                                               training_args)
+from modelscope.trainers.training_args import TrainingArgs
 
 
-def define_parser():
-    training_args.num_classes = ArgAttr(
-        cfg_node_name=[
-            'model.mm_model.head.num_classes',
-            'model.mm_model.train_cfg.augments.0.num_classes',
-            'model.mm_model.train_cfg.augments.1.num_classes'
-        ],
-        type=int,
-        help='number of classes')
+@dataclass
+class ImageClassificationTrainingArgs(TrainingArgs):
+    num_classes: int = field(
+        default=None,
+        metadata={
+            'cfg_node': [
+                'model.mm_model.head.num_classes',
+                'model.mm_model.train_cfg.augments.0.num_classes',
+                'model.mm_model.train_cfg.augments.1.num_classes'
+            ],
+            'help':
+            'number of classes',
+        })
 
-    training_args.train_batch_size.default = 16
-    training_args.train_data_worker.default = 1
-    training_args.max_epochs.default = 1
-    training_args.optimizer.default = 'AdamW'
-    training_args.lr.default = 1e-4
-    training_args.warmup_iters = ArgAttr(
-        'train.lr_config.warmup_iters',
-        type=int,
-        default=1,
-        help='number of warmup epochs')
-    training_args.topk = ArgAttr(
-        cfg_node_name=[
-            'train.evaluation.metric_options.topk',
-            'evaluation.metric_options.topk'
-        ],
-        default=(1, ),
-        help='evaluation using topk, tuple format, eg (1,), (1,5)')
+    topk: tuple = field(
+        default=None,
+        metadata={
+            'cfg_node': [
+                'train.evaluation.metric_options.topk',
+                'evaluation.metric_options.topk'
+            ],
+            'help':
+            'evaluation using topk, tuple format, eg (1,), (1,5)',
+        })
 
-    training_args.train_data = ArgAttr(
-        type=str, default='tany0699/cats_and_dogs', help='train dataset')
-    training_args.validation_data = ArgAttr(
-        type=str, default='tany0699/cats_and_dogs', help='validation dataset')
-    training_args.model_id = ArgAttr(
-        type=str,
-        default='damo/cv_vit-base_image-classification_ImageNet-labels',
-        help='model name')
-
-    parser = CliArgumentParser(training_args)
-    return parser
+    warmup_iters: str = field(
+        default=None,
+        metadata={
+            'cfg_node': 'train.lr_config.warmup_iters',
+            'help': 'The warmup iters',
+        })
 
 
 def create_dataset(name, split):
@@ -54,21 +46,26 @@ def create_dataset(name, split):
         dataset_name, namespace=namespace, subset_name='default', split=split)
 
 
-def train(parser):
-    cfg_dict = parser.get_cfg_dict()
-    args = parser.args
-    train_dataset = create_dataset(args.train_data, split='train')
-    val_dataset = create_dataset(args.validation_data, split='validation')
-
-    def cfg_modify_fn(cfg):
-        cfg.merge_from_dict(cfg_dict)
-        return cfg
+def train():
+    args = ImageClassificationTrainingArgs.from_cli(
+        model='damo/cv_vit-base_image-classification_ImageNet-labels',
+        max_epochs=1,
+        lr=1e-4,
+        optimizer='AdamW',
+        warmup_iters=1,
+        topk=(1, ))
+    if args.dataset_name is not None:
+        train_dataset = create_dataset(args.dataset_name, split='train')
+        val_dataset = create_dataset(args.dataset_name, split='validation')
+    else:
+        train_dataset = create_dataset(args.train_dataset_name, split='train')
+        val_dataset = create_dataset(args.val_dataset_name, split='validation')
 
     kwargs = dict(
-        model=args.model_id,  # model id
+        model=args.model,  # model id
         train_dataset=train_dataset,  # training dataset
         eval_dataset=val_dataset,  # validation dataset
-        cfg_modify_fn=cfg_modify_fn  # callback to modify configuration
+        cfg_modify_fn=args  # callback to modify configuration
     )
 
     # in distributed training, specify pytorch launcher
@@ -82,5 +79,4 @@ def train(parser):
 
 
 if __name__ == '__main__':
-    parser = define_parser()
-    train(parser)
+    train()
diff --git a/examples/pytorch/run_train.sh b/examples/pytorch/run_train.sh
index 2093fa09..a555d0c3 100644
--- a/examples/pytorch/run_train.sh
+++ b/examples/pytorch/run_train.sh
@@ -1,5 +1,5 @@
 PYTHONPATH=. python -m torch.distributed.launch --nproc_per_node=2 \
     examples/pytorch/finetune_image_classification.py \
     --num_classes 2 \
-    --train_data 'tany0699/cats_and_dogs' \
-    --validation_data 'tany0699/cats_and_dogs'
+    --train_dataset_name 'tany0699/cats_and_dogs' \
+    --val_dataset_name 'tany0699/cats_and_dogs'
diff --git a/examples/pytorch/text_classification/finetune_text_classification.py b/examples/pytorch/text_classification/finetune_text_classification.py
new file mode 100644
index 00000000..7747bc25
--- /dev/null
+++ b/examples/pytorch/text_classification/finetune_text_classification.py
@@ -0,0 +1,90 @@
+import os
+from dataclasses import dataclass, field
+
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers import EpochBasedTrainer, build_trainer
+from modelscope.trainers.training_args import TrainingArgs
+
+
+def get_labels(cfg, metadata):
+    label2id = cfg.safe_get(metadata['cfg_node'])
+    if label2id is not None:
+        return ','.join(label2id.keys())
+
+
+def set_labels(cfg, labels, metadata):
+    if isinstance(labels, str):
+        labels = labels.split(',')
+    cfg.merge_from_dict(
+        {metadata['cfg_node']: {label: id
+                                for id, label in enumerate(labels)}})
+
+
+@dataclass
+class TextClassificationArguments(TrainingArgs):
+
+    first_sequence: str = field(
+        default=None,
+        metadata={
+            'help': 'The first sequence key of preprocessor',
+            'cfg_node': 'preprocessor.first_sequence'
+        })
+
+    second_sequence: str = field(
+        default=None,
+        metadata={
+            'help': 'The second sequence key of preprocessor',
+            'cfg_node': 'preprocessor.second_sequence'
+        })
+
+    label: str = field(
+        default=None,
+        metadata={
+            'help': 'The label key of preprocessor',
+            'cfg_node': 'preprocessor.label'
+        })
+
+    labels: str = field(
+        default=None,
+        metadata={
+            'help': 'The labels of the dataset',
+            'cfg_node': 'preprocessor.label2id',
+            'cfg_getter': get_labels,
+            'cfg_setter': set_labels,
+        })
+
+    preprocessor: str = field(
+        default=None,
+        metadata={
+            'help': 'The preprocessor type',
+            'cfg_node': 'preprocessor.type'
+        })
+
+    def __call__(self, config):
+        config = super().__call__(config)
+        config.model['num_labels'] = len(self.labels)
+        if config.train.lr_scheduler.type == 'LinearLR':
+            config.train.lr_scheduler['total_iters'] = \
+                int(len(train_dataset) / self.per_device_train_batch_size) * self.max_epochs
+        return config
+
+
+args = TextClassificationArguments.from_cli(
+    task='text-classification', eval_metrics='seq-cls-metric')
+
+print(args)
+
+dataset = MsDataset.load(args.dataset_name, subset_name=args.subset_name)
+train_dataset = dataset['train']
+validation_dataset = dataset['validation']
+
+kwargs = dict(
+    model=args.model,
+    train_dataset=train_dataset,
+    eval_dataset=validation_dataset,
+    seed=args.seed,
+    cfg_modify_fn=args)
+
+os.environ['LOCAL_RANK'] = str(args.local_rank)
+trainer: EpochBasedTrainer = build_trainer(name='trainer', default_args=kwargs)
+trainer.train()
diff --git a/examples/pytorch/text_classification/run_train.sh b/examples/pytorch/text_classification/run_train.sh
new file mode 100644
index 00000000..93c23d0d
--- /dev/null
+++ b/examples/pytorch/text_classification/run_train.sh
@@ -0,0 +1,12 @@
+PYTHONPATH=. python examples/pytorch/text_classification/finetune_text_classification.py \
+    --model 'damo/nlp_structbert_backbone_base_std' \
+    --dataset_name 'clue' \
+    --subset_name 'tnews' \
+    --first_sequence 'sentence' \
+    --preprocessor.label label \
+    --model.num_labels 15 \
+    --labels '0,1,2,3,4,5,6,7,8,9,10,11,12,13,14' \
+    --preprocessor 'sen-cls-tokenizer' \
+    --train.dataloader.workers_per_gpu 0 \
+    --evaluation.dataloader.workers_per_gpu 0 \
+    --train.optimizer.lr 1e-5 \
diff --git a/examples/pytorch/transformers/configuration.json b/examples/pytorch/transformers/configuration.json
new file mode 100644
index 00000000..df6a73c8
--- /dev/null
+++ b/examples/pytorch/transformers/configuration.json
@@ -0,0 +1 @@
+{"framework":"pytorch","train":{"work_dir":"/tmp","max_epochs":10,"dataloader":{"batch_size_per_gpu":16,"workers_per_gpu":0},"optimizer":{"type":"SGD","lr":0.001},"lr_scheduler":{"type":"StepLR","step_size":2},"hooks":[{"type":"CheckpointHook","interval":1}]},"evaluation":{"dataloader":{"batch_size_per_gpu":16,"workers_per_gpu":0,"shuffle":false}}}
diff --git a/examples/pytorch/transformers/finetune_transformers_model.py b/examples/pytorch/transformers/finetune_transformers_model.py
new file mode 100644
index 00000000..bbfb807a
--- /dev/null
+++ b/examples/pytorch/transformers/finetune_transformers_model.py
@@ -0,0 +1,57 @@
+import os
+from dataclasses import dataclass, field
+
+from datasets import load_dataset
+from transformers import (BertForSequenceClassification, BertTokenizerFast,
+                          default_data_collator)
+
+from modelscope.trainers import EpochBasedTrainer, build_trainer
+from modelscope.trainers.default_config import DEFAULT_CONFIG, TrainingArgs
+
+
+@dataclass
+class TransformersArguments(TrainingArgs):
+
+    num_labels: int = field(
+        default=None, metadata={
+            'help': 'The number of labels',
+        })
+
+
+args = TransformersArguments.from_cli(
+    task='text-classification', eval_metrics='seq-cls-metric')
+
+print(args)
+
+dataset = load_dataset(args.dataset_name, args.subset_name)
+
+model = BertForSequenceClassification.from_pretrained(
+    args.model, num_labels=args.num_labels)
+tokenizer = BertTokenizerFast.from_pretrained(args.model)
+
+
+def tokenize_sentence(row):
+    return tokenizer(row['sentence'], padding='max_length', max_length=128)
+
+
+# Extra columns, Rename columns
+dataset = dataset.map(tokenize_sentence).remove_columns(['sentence',
+                                                         'idx']).rename_column(
+                                                             'label', 'labels')
+
+cfg_file = os.path.join(args.work_dir or './', 'configuration.json')
+DEFAULT_CONFIG.dump(cfg_file)
+
+kwargs = dict(
+    model=model,
+    cfg_file=cfg_file,
+    # data_collator
+    data_collator=default_data_collator,
+    train_dataset=dataset['train'],
+    eval_dataset=dataset['validation'],
+    seed=args.seed,
+    cfg_modify_fn=args)
+
+os.environ['LOCAL_RANK'] = str(args.local_rank)
+trainer: EpochBasedTrainer = build_trainer(name='trainer', default_args=kwargs)
+trainer.train()
diff --git a/examples/pytorch/transformers/run_train.sh b/examples/pytorch/transformers/run_train.sh
new file mode 100644
index 00000000..c76c4636
--- /dev/null
+++ b/examples/pytorch/transformers/run_train.sh
@@ -0,0 +1,5 @@
+PYTHONPATH=. python examples/pytorch/transformers/finetune_transformers_model.py \
+    --model bert-base-uncased \
+    --num_labels 15 \
+    --dataset_name clue \
+    --subset_name tnews
diff --git a/modelscope/exporters/__init__.py b/modelscope/exporters/__init__.py
index 48bab33a..0c773dca 100644
--- a/modelscope/exporters/__init__.py
+++ b/modelscope/exporters/__init__.py
@@ -1,5 +1,12 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from modelscope.utils.import_utils import is_tf_available, is_torch_available
 from .base import Exporter
 from .builder import build_exporter
-from .nlp import SbertForSequenceClassificationExporter
-from .tf_model_exporter import TfModelExporter
-from .torch_model_exporter import TorchModelExporter
+
+if is_tf_available():
+    from .nlp import CsanmtForTranslationExporter
+    from .tf_model_exporter import TfModelExporter
+if is_torch_available():
+    from .nlp import SbertForSequenceClassificationExporter, SbertForZeroShotClassificationExporter
+    from .torch_model_exporter import TorchModelExporter
diff --git a/modelscope/exporters/base.py b/modelscope/exporters/base.py
index bf190660..d105afd2 100644
--- a/modelscope/exporters/base.py
+++ b/modelscope/exporters/base.py
@@ -6,9 +6,11 @@ from typing import Dict, Union
 from modelscope.models import Model
 from modelscope.utils.config import Config, ConfigDict
 from modelscope.utils.constant import ModelFile
-from modelscope.utils.hub import snapshot_download
+from modelscope.utils.logger import get_logger
 from .builder import build_exporter
 
+logger = get_logger(__name__)
+
 
 class Exporter(ABC):
     """Exporter base class to output model to onnx, torch_script, graphdef, etc.
@@ -46,7 +48,12 @@ class Exporter(ABC):
         if hasattr(cfg, 'export'):
             export_cfg.update(cfg.export)
         export_cfg['model'] = model
-        exporter = build_exporter(export_cfg, task_name, kwargs)
+        try:
+            exporter = build_exporter(export_cfg, task_name, kwargs)
+        except KeyError as e:
+            raise KeyError(
+                f'The exporting of model \'{model_cfg.type}\' with task: \'{task_name}\' '
+                f'is not supported currently.') from e
         return exporter
 
     @abstractmethod
diff --git a/modelscope/exporters/nlp/__init__.py b/modelscope/exporters/nlp/__init__.py
index fdfd2711..731e4bb7 100644
--- a/modelscope/exporters/nlp/__init__.py
+++ b/modelscope/exporters/nlp/__init__.py
@@ -1,2 +1,11 @@
-from .sbert_for_sequence_classification_exporter import \
-    SbertForSequenceClassificationExporter
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from modelscope.utils.import_utils import is_tf_available, is_torch_available
+
+if is_tf_available():
+    from .csanmt_for_translation_exporter import CsanmtForTranslationExporter
+if is_torch_available():
+    from .sbert_for_sequence_classification_exporter import \
+        SbertForSequenceClassificationExporter
+    from .sbert_for_zero_shot_classification_exporter import \
+        SbertForZeroShotClassificationExporter
diff --git a/modelscope/exporters/nlp/csanmt_for_translation_exporter.py b/modelscope/exporters/nlp/csanmt_for_translation_exporter.py
new file mode 100644
index 00000000..95cb5bc9
--- /dev/null
+++ b/modelscope/exporters/nlp/csanmt_for_translation_exporter.py
@@ -0,0 +1,185 @@
+import os
+from typing import Any, Dict
+
+import tensorflow as tf
+from tensorflow.python.framework import ops
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.tools import freeze_graph
+
+from modelscope.exporters.builder import EXPORTERS
+from modelscope.exporters.tf_model_exporter import TfModelExporter
+from modelscope.metainfo import Models
+from modelscope.pipelines.nlp.translation_pipeline import TranslationPipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import compare_arguments_nested
+
+logger = get_logger(__name__)
+
+if tf.__version__ >= '2.0':
+    tf = tf.compat.v1
+    tf.disable_eager_execution()
+
+tf.logging.set_verbosity(tf.logging.INFO)
+
+
+@EXPORTERS.register_module(Tasks.translation, module_name=Models.translation)
+class CsanmtForTranslationExporter(TfModelExporter):
+
+    def __init__(self, model=None):
+        super().__init__(model)
+        self.pipeline = TranslationPipeline(self.model)
+
+    def generate_dummy_inputs(self, **kwargs) -> Dict[str, Any]:
+        return_dict = self.pipeline.preprocess(
+            "Alibaba Group's mission is to let the world have no difficult business"
+        )
+        return {'input_wids': return_dict['input_ids']}
+
+    def export_saved_model(self, output_dir, rtol=None, atol=None, **kwargs):
+
+        def _generate_signature():
+            receiver_tensors = {
+                'input_wids':
+                tf.saved_model.utils.build_tensor_info(
+                    self.pipeline.input_wids)
+            }
+            export_outputs = {
+                'output_seqs':
+                tf.saved_model.utils.build_tensor_info(
+                    self.pipeline.output['output_seqs'])
+            }
+
+            signature_def = tf.saved_model.signature_def_utils.build_signature_def(
+                receiver_tensors, export_outputs,
+                tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
+
+            return {'translation_signature': signature_def}
+
+        with self.pipeline._session.as_default() as sess:
+            builder = tf.saved_model.builder.SavedModelBuilder(output_dir)
+            builder.add_meta_graph_and_variables(
+                sess, [tag_constants.SERVING],
+                signature_def_map=_generate_signature(),
+                assets_collection=ops.get_collection(
+                    ops.GraphKeys.ASSET_FILEPATHS),
+                clear_devices=True)
+            builder.save()
+
+        dummy_inputs = self.generate_dummy_inputs()
+        with tf.Session(graph=tf.Graph()) as sess:
+            # Restore model from the saved_modle file, that is exported by TensorFlow estimator.
+            MetaGraphDef = tf.saved_model.loader.load(sess, ['serve'],
+                                                      output_dir)
+
+            # SignatureDef protobuf
+            SignatureDef_map = MetaGraphDef.signature_def
+            SignatureDef = SignatureDef_map['translation_signature']
+            # TensorInfo protobuf
+            X_TensorInfo = SignatureDef.inputs['input_wids']
+            y_TensorInfo = SignatureDef.outputs['output_seqs']
+            X = tf.saved_model.utils.get_tensor_from_tensor_info(
+                X_TensorInfo, sess.graph)
+            y = tf.saved_model.utils.get_tensor_from_tensor_info(
+                y_TensorInfo, sess.graph)
+            outputs = sess.run(y, feed_dict={X: dummy_inputs['input_wids']})
+            trans_result = self.pipeline.postprocess({'output_seqs': outputs})
+            logger.info(trans_result)
+
+        outputs_origin = self.pipeline.forward(
+            {'input_ids': dummy_inputs['input_wids']})
+
+        tols = {}
+        if rtol is not None:
+            tols['rtol'] = rtol
+        if atol is not None:
+            tols['atol'] = atol
+        if not compare_arguments_nested('Output match failed', outputs,
+                                        outputs_origin['output_seqs'], **tols):
+            raise RuntimeError(
+                'Export saved model failed because of validation error.')
+
+        return {'model': output_dir}
+
+    def export_frozen_graph_def(self,
+                                output_dir: str,
+                                rtol=None,
+                                atol=None,
+                                **kwargs):
+        input_saver_def = self.pipeline.model_loader.as_saver_def()
+        inference_graph_def = tf.get_default_graph().as_graph_def()
+        for node in inference_graph_def.node:
+            node.device = ''
+
+        frozen_dir = os.path.join(output_dir, 'frozen')
+        tf.gfile.MkDir(frozen_dir)
+        frozen_graph_path = os.path.join(frozen_dir,
+                                         'frozen_inference_graph.pb')
+
+        outputs = {
+            'output_trans_result':
+            tf.identity(
+                self.pipeline.output['output_seqs'],
+                name='NmtModel/output_trans_result')
+        }
+
+        for output_key in outputs:
+            tf.add_to_collection('inference_op', outputs[output_key])
+
+        output_node_names = ','.join([
+            '%s/%s' % ('NmtModel', output_key)
+            for output_key in outputs.keys()
+        ])
+        print(output_node_names)
+        _ = freeze_graph.freeze_graph_with_def_protos(
+            input_graph_def=tf.get_default_graph().as_graph_def(),
+            input_saver_def=input_saver_def,
+            input_checkpoint=self.pipeline.model_path,
+            output_node_names=output_node_names,
+            restore_op_name='save/restore_all',
+            filename_tensor_name='save/Const:0',
+            output_graph=frozen_graph_path,
+            clear_devices=True,
+            initializer_nodes='')
+
+        # 5. test frozen.pb
+        dummy_inputs = self.generate_dummy_inputs()
+        with self.pipeline._session.as_default() as sess:
+            sess.run(tf.tables_initializer())
+
+            graph = tf.Graph()
+            with tf.gfile.GFile(frozen_graph_path, 'rb') as f:
+                graph_def = tf.GraphDef()
+                graph_def.ParseFromString(f.read())
+
+            with graph.as_default():
+                tf.import_graph_def(graph_def, name='')
+            graph.finalize()
+
+            with tf.Session(graph=graph) as trans_sess:
+                outputs = trans_sess.run(
+                    'NmtModel/strided_slice_9:0',
+                    feed_dict={'input_wids:0': dummy_inputs['input_wids']})
+                trans_result = self.pipeline.postprocess(
+                    {'output_seqs': outputs})
+                logger.info(trans_result)
+
+        outputs_origin = self.pipeline.forward(
+            {'input_ids': dummy_inputs['input_wids']})
+
+        tols = {}
+        if rtol is not None:
+            tols['rtol'] = rtol
+        if atol is not None:
+            tols['atol'] = atol
+        if not compare_arguments_nested('Output match failed', outputs,
+                                        outputs_origin['output_seqs'], **tols):
+            raise RuntimeError(
+                'Export frozen graphdef failed because of validation error.')
+
+        return {'model': frozen_graph_path}
+
+    def export_onnx(self, output_dir: str, opset=13, **kwargs):
+        raise NotImplementedError(
+            'csanmt model does not support onnx format, consider using savedmodel instead.'
+        )
diff --git a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py
index 802e92a2..1e238769 100644
--- a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py
+++ b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py
@@ -1,4 +1,3 @@
-import os
 from collections import OrderedDict
 from typing import Any, Dict, Mapping, Tuple
 
@@ -7,9 +6,7 @@ from torch.utils.data.dataloader import default_collate
 from modelscope.exporters.builder import EXPORTERS
 from modelscope.exporters.torch_model_exporter import TorchModelExporter
 from modelscope.metainfo import Models
-from modelscope.preprocessors import (
-    Preprocessor, TextClassificationTransformersPreprocessor,
-    build_preprocessor)
+from modelscope.preprocessors import Preprocessor
 from modelscope.utils.constant import ModeKeys, Tasks
 
 
@@ -17,8 +14,6 @@ from modelscope.utils.constant import ModeKeys, Tasks
 @EXPORTERS.register_module(
     Tasks.text_classification, module_name=Models.structbert)
 @EXPORTERS.register_module(Tasks.sentence_similarity, module_name=Models.bert)
-@EXPORTERS.register_module(
-    Tasks.zero_shot_classification, module_name=Models.bert)
 @EXPORTERS.register_module(
     Tasks.sentiment_classification, module_name=Models.bert)
 @EXPORTERS.register_module(Tasks.nli, module_name=Models.bert)
@@ -27,8 +22,6 @@ from modelscope.utils.constant import ModeKeys, Tasks
 @EXPORTERS.register_module(
     Tasks.sentiment_classification, module_name=Models.structbert)
 @EXPORTERS.register_module(Tasks.nli, module_name=Models.structbert)
-@EXPORTERS.register_module(
-    Tasks.zero_shot_classification, module_name=Models.structbert)
 class SbertForSequenceClassificationExporter(TorchModelExporter):
 
     def generate_dummy_inputs(self,
diff --git a/modelscope/exporters/nlp/sbert_for_zero_shot_classification_exporter.py b/modelscope/exporters/nlp/sbert_for_zero_shot_classification_exporter.py
new file mode 100644
index 00000000..461a772f
--- /dev/null
+++ b/modelscope/exporters/nlp/sbert_for_zero_shot_classification_exporter.py
@@ -0,0 +1,58 @@
+from collections import OrderedDict
+from typing import Any, Dict, Mapping
+
+from modelscope.exporters.builder import EXPORTERS
+from modelscope.exporters.torch_model_exporter import TorchModelExporter
+from modelscope.metainfo import Models
+from modelscope.preprocessors import Preprocessor
+from modelscope.utils.constant import Tasks
+
+
+@EXPORTERS.register_module(
+    Tasks.zero_shot_classification, module_name=Models.bert)
+@EXPORTERS.register_module(
+    Tasks.zero_shot_classification, module_name=Models.structbert)
+class SbertForZeroShotClassificationExporter(TorchModelExporter):
+
+    def generate_dummy_inputs(self,
+                              candidate_labels,
+                              hypothesis_template,
+                              max_length=128,
+                              pair: bool = False,
+                              **kwargs) -> Dict[str, Any]:
+        """Generate dummy inputs for model exportation to onnx or other formats by tracing.
+
+        Args:
+
+            max_length(int): The max length of sentence, default 128.
+            hypothesis_template(str): The template of prompt, like '这篇文章的标题是{}'
+            candidate_labels(List): The labels of prompt,
+            like ['文化', '体育', '娱乐', '财经', '家居', '汽车', '教育', '科技', '军事']
+            pair(bool, `optional`): Whether to generate sentence pairs or single sentences.
+
+        Returns:
+            Dummy inputs.
+        """
+
+        assert hasattr(
+            self.model, 'model_dir'
+        ), 'model_dir attribute is required to build the preprocessor'
+        preprocessor = Preprocessor.from_pretrained(
+            self.model.model_dir, max_length=max_length)
+        return preprocessor(
+            preprocessor.nlp_tokenizer.tokenizer.unk_token,
+            candidate_labels=candidate_labels,
+            hypothesis_template=hypothesis_template)
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        dynamic_axis = {0: 'batch', 1: 'sequence'}
+        return OrderedDict([
+            ('input_ids', dynamic_axis),
+            ('attention_mask', dynamic_axis),
+            ('token_type_ids', dynamic_axis),
+        ])
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict({'logits': {0: 'batch'}})
diff --git a/modelscope/exporters/tf_model_exporter.py b/modelscope/exporters/tf_model_exporter.py
index 3035b4ce..96ae32f7 100644
--- a/modelscope/exporters/tf_model_exporter.py
+++ b/modelscope/exporters/tf_model_exporter.py
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
+from abc import abstractmethod
 from typing import Any, Callable, Dict, Mapping
 
 import tensorflow as tf
@@ -7,7 +8,7 @@ import tensorflow as tf
 from modelscope.outputs import ModelOutputBase
 from modelscope.utils.constant import ModelFile
 from modelscope.utils.logger import get_logger
-from modelscope.utils.regress_test_utils import compare_arguments_nested
+from modelscope.utils.test_utils import compare_arguments_nested
 from .base import Exporter
 
 logger = get_logger()
@@ -29,6 +30,14 @@ class TfModelExporter(Exporter):
         self._tf2_export_onnx(model, onnx_file, opset=opset, **kwargs)
         return {'model': onnx_file}
 
+    @abstractmethod
+    def export_saved_model(self, output_dir: str, **kwargs):
+        pass
+
+    @abstractmethod
+    def export_frozen_graph_def(self, output_dir: str, **kwargs):
+        pass
+
     def _tf2_export_onnx(self,
                          model,
                          output: str,
@@ -59,56 +68,67 @@ class TfModelExporter(Exporter):
         onnx.save(onnx_model, output)
 
         if validation:
-            try:
-                import onnx
-                import onnxruntime as ort
-            except ImportError:
-                logger.warn(
-                    'Cannot validate the exported onnx file, because '
-                    'the installation of onnx or onnxruntime cannot be found')
-                return
+            self._validate_model(dummy_inputs, model, output, rtol, atol,
+                                 call_func)
 
-            def tensor_nested_numpify(tensors):
-                if isinstance(tensors, (list, tuple)):
-                    return type(tensors)(
-                        tensor_nested_numpify(t) for t in tensors)
-                if isinstance(tensors, Mapping):
-                    # return dict
-                    return {
-                        k: tensor_nested_numpify(t)
-                        for k, t in tensors.items()
-                    }
-                if isinstance(tensors, tf.Tensor):
-                    t = tensors.cpu()
-                    return t.numpy()
-                return tensors
+    def _validate_model(
+        self,
+        dummy_inputs,
+        model,
+        output,
+        rtol: float = None,
+        atol: float = None,
+        call_func: Callable = None,
+    ):
+        try:
+            import onnx
+            import onnxruntime as ort
+        except ImportError:
+            logger.warn(
+                'Cannot validate the exported onnx file, because '
+                'the installation of onnx or onnxruntime cannot be found')
+            return
 
-            onnx_model = onnx.load(output)
-            onnx.checker.check_model(onnx_model)
-            ort_session = ort.InferenceSession(output)
-            outputs_origin = call_func(
-                dummy_inputs) if call_func is not None else model(dummy_inputs)
-            if isinstance(outputs_origin, (Mapping, ModelOutputBase)):
-                outputs_origin = list(
-                    tensor_nested_numpify(outputs_origin).values())
-            elif isinstance(outputs_origin, (tuple, list)):
-                outputs_origin = list(tensor_nested_numpify(outputs_origin))
-            outputs = ort_session.run(
-                None,
-                tensor_nested_numpify(dummy_inputs),
-            )
-            outputs = tensor_nested_numpify(outputs)
-            if isinstance(outputs, dict):
-                outputs = list(outputs.values())
-            elif isinstance(outputs, tuple):
-                outputs = list(outputs)
+        def tensor_nested_numpify(tensors):
+            if isinstance(tensors, (list, tuple)):
+                return type(tensors)(tensor_nested_numpify(t) for t in tensors)
+            if isinstance(tensors, Mapping):
+                # return dict
+                return {
+                    k: tensor_nested_numpify(t)
+                    for k, t in tensors.items()
+                }
+            if isinstance(tensors, tf.Tensor):
+                t = tensors.cpu()
+                return t.numpy()
+            return tensors
 
-            tols = {}
-            if rtol is not None:
-                tols['rtol'] = rtol
-            if atol is not None:
-                tols['atol'] = atol
-            if not compare_arguments_nested('Onnx model output match failed',
-                                            outputs, outputs_origin, **tols):
-                raise RuntimeError(
-                    'export onnx failed because of validation error.')
+        onnx_model = onnx.load(output)
+        onnx.checker.check_model(onnx_model, full_check=True)
+        ort_session = ort.InferenceSession(output)
+        outputs_origin = call_func(
+            dummy_inputs) if call_func is not None else model(dummy_inputs)
+        if isinstance(outputs_origin, (Mapping, ModelOutputBase)):
+            outputs_origin = list(
+                tensor_nested_numpify(outputs_origin).values())
+        elif isinstance(outputs_origin, (tuple, list)):
+            outputs_origin = list(tensor_nested_numpify(outputs_origin))
+        outputs = ort_session.run(
+            None,
+            tensor_nested_numpify(dummy_inputs),
+        )
+        outputs = tensor_nested_numpify(outputs)
+        if isinstance(outputs, dict):
+            outputs = list(outputs.values())
+        elif isinstance(outputs, tuple):
+            outputs = list(outputs)
+
+        tols = {}
+        if rtol is not None:
+            tols['rtol'] = rtol
+        if atol is not None:
+            tols['atol'] = atol
+        if not compare_arguments_nested('Onnx model output match failed',
+                                        outputs, outputs_origin, **tols):
+            raise RuntimeError(
+                'export onnx failed because of validation error.')
diff --git a/modelscope/fileio/io.py b/modelscope/fileio/io.py
index 1b23997a..5dc4a833 100644
--- a/modelscope/fileio/io.py
+++ b/modelscope/fileio/io.py
@@ -27,9 +27,9 @@ def load(file, file_format=None, **kwargs):
             Currently supported formats include "json", "yaml/yml".
 
     Examples:
-        >>> load('/path/of/your/file')  # file is storaged in disk
-        >>> load('https://path/of/your/file')  # file is storaged in Internet
-        >>> load('oss://path/of/your/file')  # file is storaged in petrel
+        >>> load('/path/of/your/file')  # file is stored in disk
+        >>> load('https://path/of/your/file')  # file is stored on internet
+        >>> load('oss://path/of/your/file')  # file is stored in petrel
 
     Returns:
         The content from the file.
diff --git a/modelscope/hub/check_model.py b/modelscope/hub/check_model.py
index f2d4a98f..7fd9f324 100644
--- a/modelscope/hub/check_model.py
+++ b/modelscope/hub/check_model.py
@@ -1,15 +1,13 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import os
-import pickle
 from typing import Dict, Optional, Union
 from urllib.parse import urlparse
 
 from modelscope.hub.api import HubApi, ModelScopeConfig
-from modelscope.hub.constants import (FILE_HASH, MODEL_META_FILE_NAME,
-                                      MODEL_META_MODEL_ID)
+from modelscope.hub.constants import FILE_HASH
 from modelscope.hub.git import GitCommandWrapper
-from modelscope.hub.utils.caching import FileSystemCache, ModelFileSystemCache
+from modelscope.hub.utils.caching import ModelFileSystemCache
 from modelscope.hub.utils.utils import compute_hash
 from modelscope.utils.logger import get_logger
 
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index f05f23df..63ddd6d9 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from modelscope.utils.constant import Fields, Tasks
 
 
 class Models(object):
@@ -7,35 +8,44 @@ class Models(object):
         Holds the standard model name to use for identifying different model.
     This should be used to register models.
 
-        Model name should only contain model info but not task info.
+        Model name should only contain model information but not task information.
     """
     # tinynas models
     tinynas_detection = 'tinynas-detection'
     tinynas_damoyolo = 'tinynas-damoyolo'
-
     # vision models
     detection = 'detection'
+    mask_scoring = 'MaskScoring'
+    image_restoration = 'image-restoration'
     realtime_object_detection = 'realtime-object-detection'
     realtime_video_object_detection = 'realtime-video-object-detection'
     scrfd = 'scrfd'
+    depe = 'depe'
     classification_model = 'ClassificationModel'
     easyrobust_model = 'EasyRobustModel'
     bnext = 'bnext'
+    yolopv2 = 'yolopv2'
     nafnet = 'nafnet'
     csrnet = 'csrnet'
+    adaint = 'adaint'
+    deeplpfnet = 'deeplpfnet'
+    rrdb = 'rrdb'
     cascade_mask_rcnn_swin = 'cascade_mask_rcnn_swin'
     maskdino_swin = 'maskdino_swin'
     gpen = 'gpen'
     product_retrieval_embedding = 'product-retrieval-embedding'
     body_2d_keypoints = 'body-2d-keypoints'
     body_3d_keypoints = 'body-3d-keypoints'
+    body_3d_keypoints_hdformer = 'hdformer'
     crowd_counting = 'HRNetCrowdCounting'
     face_2d_keypoints = 'face-2d-keypoints'
     panoptic_segmentation = 'swinL-panoptic-segmentation'
     r50_panoptic_segmentation = 'r50-panoptic-segmentation'
     image_reid_person = 'passvitb'
     image_inpainting = 'FFTInpainting'
+    image_paintbyexample = 'Stablediffusion-Paintbyexample'
     video_summarization = 'pgl-video-summarization'
+    video_panoptic_segmentation = 'swinb-video-panoptic-segmentation'
     language_guided_video_summarization = 'clip-it-language-guided-video-summarization'
     swinL_semantic_segmentation = 'swinL-semantic-segmentation'
     vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
@@ -70,6 +80,7 @@ class Models(object):
     video_human_matting = 'video-human-matting'
     video_frame_interpolation = 'video-frame-interpolation'
     video_object_segmentation = 'video-object-segmentation'
+    video_deinterlace = 'video-deinterlace'
     quadtree_attention_image_matching = 'quadtree-attention-image-matching'
     vision_middleware = 'vision-middleware'
     video_stabilization = 'video-stabilization'
@@ -78,14 +89,31 @@ class Models(object):
     image_casmvs_depth_estimation = 'image-casmvs-depth-estimation'
     vop_retrieval_model = 'vop-retrieval-model'
     ddcolor = 'ddcolor'
+    image_probing_model = 'image-probing-model'
     defrcn = 'defrcn'
     image_face_fusion = 'image-face-fusion'
+    content_check = 'content-check'
+    open_vocabulary_detection_vild = 'open-vocabulary-detection-vild'
+    ecbsr = 'ecbsr'
+    msrresnet_lite = 'msrresnet-lite'
+    object_detection_3d = 'object_detection_3d'
+    ddpm = 'ddpm'
+    ocr_recognition = 'OCRRecognition'
+    image_quality_assessment_mos = 'image-quality-assessment-mos'
+    image_quality_assessment_degradation = 'image-quality-assessment-degradation'
+    m2fp = 'm2fp'
+    nerf_recon_acc = 'nerf-recon-acc'
+    bts_depth_estimation = 'bts-depth-estimation'
+    vision_efficient_tuning = 'vision-efficient-tuning'
+
+    bad_image_detecting = 'bad-image-detecting'
 
     # EasyCV models
     yolox = 'YOLOX'
     segformer = 'Segformer'
     hand_2d_keypoints = 'HRNet-Hand2D-Keypoints'
     image_object_detection_auto = 'image-object-detection-auto'
+    dino = 'DINO'
 
     # nlp models
     bert = 'bert'
@@ -122,6 +150,12 @@ class Models(object):
     unite = 'unite'
     megatron_bert = 'megatron-bert'
     use = 'user-satisfaction-estimation'
+    fid_plug = 'fid-plug'
+    lstm = 'lstm'
+    xlm_roberta = 'xlm-roberta'
+    transformers = 'transformers'
+    plug_mental = 'plug-mental'
+    doc2bot = 'doc2bot'
 
     # audio models
     sambert_hifigan = 'sambert-hifigan'
@@ -135,6 +169,8 @@ class Models(object):
     generic_itn = 'generic-itn'
     generic_punc = 'generic-punc'
     generic_sv = 'generic-sv'
+    ecapa_tdnn_sv = 'ecapa-tdnn-sv'
+    generic_lm = 'generic-lm'
 
     # multi-modal models
     ofa = 'ofa'
@@ -162,6 +198,7 @@ class TaskModels(object):
     fill_mask = 'fill-mask'
     feature_extraction = 'feature-extraction'
     text_generation = 'text-generation'
+    text_ranking = 'text-ranking'
 
 
 class Heads(object):
@@ -179,6 +216,11 @@ class Heads(object):
     information_extraction = 'information-extraction'
     # text gen
     text_generation = 'text-generation'
+    # text ranking
+    text_ranking = 'text-ranking'
+    # crf
+    lstm_crf = 'lstm-crf'
+    transformer_crf = 'transformer-crf'
 
 
 class Pipelines(object):
@@ -193,6 +235,7 @@ class Pipelines(object):
     """
     # vision tasks
     portrait_matting = 'unet-image-matting'
+    universal_matting = 'unet-universal-matting'
     image_denoise = 'nafnet-image-denoise'
     image_deblur = 'nafnet-image-deblur'
     person_image_cartoon = 'unet-person-image-cartoon'
@@ -209,16 +252,19 @@ class Pipelines(object):
     hand_2d_keypoints = 'hrnetv2w18_hand-2d-keypoints_image'
     human_detection = 'resnet18-human-detection'
     object_detection = 'vit-object-detection'
+    abnormal_object_detection = 'abnormal-object-detection'
     easycv_detection = 'easycv-detection'
     easycv_segmentation = 'easycv-segmentation'
     face_2d_keypoints = 'mobilenet_face-2d-keypoints_alignment'
     salient_detection = 'u2net-salient-detection'
     salient_boudary_detection = 'res2net-salient-detection'
     camouflaged_detection = 'res2net-camouflaged-detection'
+    image_demoire = 'uhdm-image-demoireing'
     image_classification = 'image-classification'
     face_detection = 'resnet-face-detection-scrfd10gkps'
     face_liveness_ir = 'manual-face-liveness-flir'
     face_liveness_rgb = 'manual-face-liveness-flir'
+    face_liveness_xc = 'manual-face-liveness-flxc'
     card_detection = 'resnet-card-detection-scrfd34gkps'
     ulfd_face_detection = 'manual-face-detection-ulfd'
     tinymog_face_detection = 'manual-face-detection-tinymog'
@@ -234,20 +280,28 @@ class Pipelines(object):
     nextvit_small_daily_image_classification = 'nextvit-small_image-classification_Dailylife-labels'
     convnext_base_image_classification_garbage = 'convnext-base_image-classification_garbage'
     bnext_small_image_classification = 'bnext-small_image-classification_ImageNet-labels'
+    yolopv2_image_driving_percetion_bdd100k = 'yolopv2_image-driving-percetion_bdd100k'
     common_image_classification = 'common-image-classification'
     image_color_enhance = 'csrnet-image-color-enhance'
+    adaint_image_color_enhance = 'adaint-image-color-enhance'
+    deeplpf_image_color_enhance = 'deeplpf-image-color-enhance'
     virtual_try_on = 'virtual-try-on'
     image_colorization = 'unet-image-colorization'
     image_style_transfer = 'AAMS-style-transfer'
     image_super_resolution = 'rrdb-image-super-resolution'
+    image_debanding = 'rrdb-image-debanding'
     face_image_generation = 'gan-face-image-generation'
     product_retrieval_embedding = 'resnet50-product-retrieval-embedding'
-    realtime_object_detection = 'cspnet_realtime-object-detection_yolox'
     realtime_video_object_detection = 'cspnet_realtime-video-object-detection_streamyolo'
     face_recognition = 'ir101-face-recognition-cfglint'
     face_recognition_ood = 'ir-face-recognition-ood-rts'
+    face_quality_assessment = 'manual-face-quality-assessment-fqa'
+    face_recognition_ood = 'ir-face-recognition-rts'
+    face_recognition_onnx_ir = 'manual-face-recognition-frir'
+    face_recognition_onnx_fm = 'manual-face-recognition-frfm'
     arc_face_recognition = 'ir50-face-recognition-arcface'
     mask_face_recognition = 'resnet-face-recognition-facemask'
+    content_check = 'resnet50-image-classification-cc'
     image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation'
     maskdino_instance_segmentation = 'maskdino-swin-image-instance-segmentation'
     image2image_translation = 'image-to-image-translation'
@@ -259,6 +313,7 @@ class Pipelines(object):
     image_object_detection_auto = 'yolox_image-object-detection-auto'
     hand_detection = 'yolox-pai_hand-detection'
     skin_retouching = 'unet-skin-retouching'
+    face_reconstruction = 'resnet50-face-reconstruction'
     tinynas_classification = 'tinynas-classification'
     easyrobust_classification = 'easyrobust-classification'
     tinynas_detection = 'tinynas-detection'
@@ -277,6 +332,8 @@ class Pipelines(object):
     panorama_depth_estimation = 'panorama-depth-estimation'
     image_reid_person = 'passvitb-image-reid-person'
     image_inpainting = 'fft-inpainting'
+    image_paintbyexample = 'stablediffusion-paintbyexample'
+    image_inpainting_sdv2 = 'image-inpainting-sdv2'
     text_driven_segmentation = 'text-driven-segmentation'
     movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation'
     shop_segmentation = 'shop-segmentation'
@@ -294,15 +351,31 @@ class Pipelines(object):
     vision_middleware_multi_task = 'vision-middleware-multi-task'
     video_frame_interpolation = 'video-frame-interpolation'
     video_object_segmentation = 'video-object-segmentation'
+    video_deinterlace = 'video-deinterlace'
     image_matching = 'image-matching'
     video_stabilization = 'video-stabilization'
     video_super_resolution = 'realbasicvsr-video-super-resolution'
     pointcloud_sceneflow_estimation = 'pointcloud-sceneflow-estimation'
     image_multi_view_depth_estimation = 'image-multi-view-depth-estimation'
+    video_panoptic_segmentation = 'video-panoptic-segmentation'
     vop_retrieval = 'vop-video-text-retrieval'
     ddcolor_image_colorization = 'ddcolor-image-colorization'
+    image_structured_model_probing = 'image-structured-model-probing'
     image_fewshot_detection = 'image-fewshot-detection'
     image_face_fusion = 'image-face-fusion'
+    open_vocabulary_detection_vild = 'open-vocabulary-detection-vild'
+    ddpm_image_semantic_segmentation = 'ddpm-image-semantic-segmentation'
+    video_colorization = 'video-colorization'
+    motion_generattion = 'mdm-motion-generation'
+    mobile_image_super_resolution = 'mobile-image-super-resolution'
+    image_human_parsing = 'm2fp-image-human-parsing'
+    object_detection_3d_depe = 'object-detection-3d-depe'
+    nerf_recon_acc = 'nerf-recon-acc'
+    bad_image_detecting = 'bad-image-detecting'
+
+    image_quality_assessment_mos = 'image-quality-assessment-mos'
+    image_quality_assessment_degradation = 'image-quality-assessment-degradation'
+    vision_efficient_tuning = 'vision-efficient-tuning'
 
     # nlp tasks
     automatic_post_editing = 'automatic-post-editing'
@@ -317,6 +390,7 @@ class Pipelines(object):
     named_entity_recognition_thai = 'named-entity-recognition-thai'
     named_entity_recognition_viet = 'named-entity-recognition-viet'
     text_generation = 'text-generation'
+    fid_dialogue = 'fid-dialogue'
     text2text_generation = 'text2text-generation'
     sentiment_analysis = 'sentiment-analysis'
     sentiment_classification = 'sentiment-classification'
@@ -324,6 +398,7 @@ class Pipelines(object):
     fill_mask = 'fill-mask'
     fill_mask_ponet = 'fill-mask-ponet'
     csanmt_translation = 'csanmt-translation'
+    interactive_translation = 'interactive-translation'
     nli = 'nli'
     dialog_intent_prediction = 'dialog-intent-prediction'
     dialog_modeling = 'dialog-modeling'
@@ -352,6 +427,10 @@ class Pipelines(object):
     token_classification = 'token-classification'
     translation_evaluation = 'translation-evaluation'
     user_satisfaction_estimation = 'user-satisfaction-estimation'
+    siamese_uie = 'siamese-uie'
+    document_grounded_dialog_retrieval = 'document-grounded-dialog-retrieval'
+    document_grounded_dialog_rerank = 'document-grounded-dialog-rerank'
+    document_grounded_dialog_generate = 'document-grounded-dialog-generate'
 
     # audio tasks
     sambert_hifigan_tts = 'sambert-hifigan-tts'
@@ -365,6 +444,9 @@ class Pipelines(object):
     itn_inference = 'itn-inference'
     punc_inference = 'punc-inference'
     sv_inference = 'sv-inference'
+    vad_inference = 'vad-inference'
+    speaker_verification = 'speaker-verification'
+    lm_inference = 'language-model'
 
     # multi-modal tasks
     image_captioning = 'image-captioning'
@@ -386,31 +468,322 @@ class Pipelines(object):
     diffusers_stable_diffusion = 'diffusers-stable-diffusion'
     document_vl_embedding = 'document-vl-embedding'
     chinese_stable_diffusion = 'chinese-stable-diffusion'
+    gridvlp_multi_modal_classification = 'gridvlp-multi-modal-classification'
+    gridvlp_multi_modal_embedding = 'gridvlp-multi-modal-embedding'
 
     # science tasks
     protein_structure = 'unifold-protein-structure'
 
 
-class Trainers(object):
-    """ Names for different trainer.
+DEFAULT_MODEL_FOR_PIPELINE = {
+    # TaskName: (pipeline_module_name, model_repo)
+    Tasks.sentence_embedding:
+    (Pipelines.sentence_embedding,
+     'damo/nlp_corom_sentence-embedding_english-base'),
+    Tasks.text_ranking: (Pipelines.mgeo_ranking,
+                         'damo/mgeo_address_ranking_chinese_base'),
+    Tasks.text_ranking: (Pipelines.text_ranking,
+                         'damo/nlp_corom_passage-ranking_english-base'),
+    Tasks.word_segmentation:
+    (Pipelines.word_segmentation,
+     'damo/nlp_structbert_word-segmentation_chinese-base'),
+    Tasks.part_of_speech: (Pipelines.part_of_speech,
+                           'damo/nlp_structbert_part-of-speech_chinese-base'),
+    Tasks.token_classification:
+    (Pipelines.part_of_speech,
+     'damo/nlp_structbert_part-of-speech_chinese-base'),
+    Tasks.named_entity_recognition:
+    (Pipelines.named_entity_recognition,
+     'damo/nlp_raner_named-entity-recognition_chinese-base-news'),
+    Tasks.relation_extraction:
+    (Pipelines.relation_extraction,
+     'damo/nlp_bert_relation-extraction_chinese-base'),
+    Tasks.information_extraction:
+    (Pipelines.relation_extraction,
+     'damo/nlp_bert_relation-extraction_chinese-base'),
+    Tasks.sentence_similarity:
+    (Pipelines.sentence_similarity,
+     'damo/nlp_structbert_sentence-similarity_chinese-base'),
+    Tasks.translation: (Pipelines.csanmt_translation,
+                        'damo/nlp_csanmt_translation_zh2en'),
+    Tasks.nli: (Pipelines.nli, 'damo/nlp_structbert_nli_chinese-base'),
+    Tasks.sentiment_classification:
+    (Pipelines.sentiment_classification,
+     'damo/nlp_structbert_sentiment-classification_chinese-base'
+     ),  # TODO: revise back after passing the pr
+    Tasks.portrait_matting: (Pipelines.portrait_matting,
+                             'damo/cv_unet_image-matting'),
+    Tasks.universal_matting: (Pipelines.universal_matting,
+                              'damo/cv_unet_universal-matting'),
+    Tasks.human_detection: (Pipelines.human_detection,
+                            'damo/cv_resnet18_human-detection'),
+    Tasks.image_object_detection: (Pipelines.object_detection,
+                                   'damo/cv_vit_object-detection_coco'),
+    Tasks.image_denoising: (Pipelines.image_denoise,
+                            'damo/cv_nafnet_image-denoise_sidd'),
+    Tasks.image_deblurring: (Pipelines.image_deblur,
+                             'damo/cv_nafnet_image-deblur_gopro'),
+    Tasks.video_stabilization: (Pipelines.video_stabilization,
+                                'damo/cv_dut-raft_video-stabilization_base'),
+    Tasks.video_super_resolution:
+    (Pipelines.video_super_resolution,
+     'damo/cv_realbasicvsr_video-super-resolution_videolq'),
+    Tasks.text_classification:
+    (Pipelines.sentiment_classification,
+     'damo/nlp_structbert_sentiment-classification_chinese-base'),
+    Tasks.text_generation: (Pipelines.text_generation,
+                            'damo/nlp_palm2.0_text-generation_chinese-base'),
+    Tasks.zero_shot_classification:
+    (Pipelines.zero_shot_classification,
+     'damo/nlp_structbert_zero-shot-classification_chinese-base'),
+    Tasks.task_oriented_conversation: (Pipelines.dialog_modeling,
+                                       'damo/nlp_space_dialog-modeling'),
+    Tasks.dialog_state_tracking: (Pipelines.dialog_state_tracking,
+                                  'damo/nlp_space_dialog-state-tracking'),
+    Tasks.table_question_answering:
+    (Pipelines.table_question_answering_pipeline,
+     'damo/nlp-convai-text2sql-pretrain-cn'),
+    Tasks.document_grounded_dialog_generate:
+    (Pipelines.document_grounded_dialog_generate,
+     'DAMO_ConvAI/nlp_convai_generation_pretrain'),
+    Tasks.document_grounded_dialog_rerank:
+    (Pipelines.document_grounded_dialog_rerank,
+     'damo/nlp_convai_rerank_pretrain'),
+    Tasks.document_grounded_dialog_retrieval:
+    (Pipelines.document_grounded_dialog_retrieval,
+     'DAMO_ConvAI/nlp_convai_retrieval_pretrain'),
+    Tasks.text_error_correction:
+    (Pipelines.text_error_correction,
+     'damo/nlp_bart_text-error-correction_chinese'),
+    Tasks.image_captioning: (Pipelines.image_captioning,
+                             'damo/ofa_image-caption_coco_large_en'),
+    Tasks.video_captioning:
+    (Pipelines.video_captioning,
+     'damo/multi-modal_hitea_video-captioning_base_en'),
+    Tasks.image_portrait_stylization:
+    (Pipelines.person_image_cartoon,
+     'damo/cv_unet_person-image-cartoon_compound-models'),
+    Tasks.ocr_detection: (Pipelines.ocr_detection,
+                          'damo/cv_resnet18_ocr-detection-line-level_damo'),
+    Tasks.table_recognition:
+    (Pipelines.table_recognition,
+     'damo/cv_dla34_table-structure-recognition_cycle-centernet'),
+    Tasks.document_vl_embedding:
+    (Pipelines.document_vl_embedding,
+     'damo/multi-modal_convnext-roberta-base_vldoc-embedding'),
+    Tasks.license_plate_detection:
+    (Pipelines.license_plate_detection,
+     'damo/cv_resnet18_license-plate-detection_damo'),
+    Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'),
+    Tasks.feature_extraction: (Pipelines.feature_extraction,
+                               'damo/pert_feature-extraction_base-test'),
+    Tasks.action_recognition: (Pipelines.action_recognition,
+                               'damo/cv_TAdaConv_action-recognition'),
+    Tasks.action_detection: (Pipelines.action_detection,
+                             'damo/cv_ResNetC3D_action-detection_detection2d'),
+    Tasks.live_category: (Pipelines.live_category,
+                          'damo/cv_resnet50_live-category'),
+    Tasks.video_category: (Pipelines.video_category,
+                           'damo/cv_resnet50_video-category'),
+    Tasks.multi_modal_embedding: (Pipelines.multi_modal_embedding,
+                                  'damo/multi-modal_clip-vit-base-patch16_zh'),
+    Tasks.generative_multi_modal_embedding:
+    (Pipelines.generative_multi_modal_embedding,
+     'damo/multi-modal_gemm-vit-large-patch14_generative-multi-modal-embedding'
+     ),
+    Tasks.multi_modal_similarity:
+    (Pipelines.multi_modal_similarity,
+     'damo/multi-modal_team-vit-large-patch14_multi-modal-similarity'),
+    Tasks.visual_question_answering:
+    (Pipelines.visual_question_answering,
+     'damo/mplug_visual-question-answering_coco_large_en'),
+    Tasks.video_question_answering:
+    (Pipelines.video_question_answering,
+     'damo/multi-modal_hitea_video-question-answering_base_en'),
+    Tasks.video_embedding: (Pipelines.cmdssl_video_embedding,
+                            'damo/cv_r2p1d_video_embedding'),
+    Tasks.text_to_image_synthesis:
+    (Pipelines.text_to_image_synthesis,
+     'damo/cv_diffusion_text-to-image-synthesis_tiny'),
+    Tasks.body_2d_keypoints: (Pipelines.body_2d_keypoints,
+                              'damo/cv_hrnetv2w32_body-2d-keypoints_image'),
+    Tasks.body_3d_keypoints: (Pipelines.body_3d_keypoints,
+                              'damo/cv_canonical_body-3d-keypoints_video'),
+    Tasks.hand_2d_keypoints:
+    (Pipelines.hand_2d_keypoints,
+     'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody'),
+    Tasks.card_detection: (Pipelines.card_detection,
+                           'damo/cv_resnet_carddetection_scrfd34gkps'),
+    Tasks.content_check: (Pipelines.content_check,
+                          'damo/cv_resnet50_content-check_cc'),
+    Tasks.face_detection:
+    (Pipelines.mog_face_detection,
+     'damo/cv_resnet101_face-detection_cvpr22papermogface'),
+    Tasks.face_liveness: (Pipelines.face_liveness_ir,
+                          'damo/cv_manual_face-liveness_flir'),
+    Tasks.face_recognition: (Pipelines.face_recognition,
+                             'damo/cv_ir101_facerecognition_cfglint'),
+    Tasks.facial_expression_recognition:
+    (Pipelines.facial_expression_recognition,
+     'damo/cv_vgg19_facial-expression-recognition_fer'),
+    Tasks.face_attribute_recognition:
+    (Pipelines.face_attribute_recognition,
+     'damo/cv_resnet34_face-attribute-recognition_fairface'),
+    Tasks.face_2d_keypoints: (Pipelines.face_2d_keypoints,
+                              'damo/cv_mobilenet_face-2d-keypoints_alignment'),
+    Tasks.face_quality_assessment:
+    (Pipelines.face_quality_assessment,
+     'damo/cv_manual_face-quality-assessment_fqa'),
+    Tasks.video_multi_modal_embedding:
+    (Pipelines.video_multi_modal_embedding,
+     'damo/multi_modal_clip_vtretrival_msrvtt_53'),
+    Tasks.image_color_enhancement:
+    (Pipelines.image_color_enhance,
+     'damo/cv_csrnet_image-color-enhance-models'),
+    Tasks.virtual_try_on: (Pipelines.virtual_try_on,
+                           'damo/cv_daflow_virtual-try-on_base'),
+    Tasks.image_colorization: (Pipelines.ddcolor_image_colorization,
+                               'damo/cv_ddcolor_image-colorization'),
+    Tasks.video_colorization: (Pipelines.video_colorization,
+                               'damo/cv_unet_video-colorization'),
+    Tasks.image_segmentation:
+    (Pipelines.image_instance_segmentation,
+     'damo/cv_swin-b_image-instance-segmentation_coco'),
+    Tasks.image_driving_perception:
+    (Pipelines.yolopv2_image_driving_percetion_bdd100k,
+     'damo/cv_yolopv2_image-driving-perception_bdd100k'),
+    Tasks.image_depth_estimation:
+    (Pipelines.image_depth_estimation,
+     'damo/cv_newcrfs_image-depth-estimation_indoor'),
+    Tasks.indoor_layout_estimation:
+    (Pipelines.indoor_layout_estimation,
+     'damo/cv_panovit_indoor-layout-estimation'),
+    Tasks.video_depth_estimation:
+    (Pipelines.video_depth_estimation,
+     'damo/cv_dro-resnet18_video-depth-estimation_indoor'),
+    Tasks.panorama_depth_estimation:
+    (Pipelines.panorama_depth_estimation,
+     'damo/cv_unifuse_panorama-depth-estimation'),
+    Tasks.image_style_transfer: (Pipelines.image_style_transfer,
+                                 'damo/cv_aams_style-transfer_damo'),
+    Tasks.face_image_generation: (Pipelines.face_image_generation,
+                                  'damo/cv_gan_face-image-generation'),
+    Tasks.image_super_resolution: (Pipelines.image_super_resolution,
+                                   'damo/cv_rrdb_image-super-resolution'),
+    Tasks.image_debanding: (Pipelines.image_debanding,
+                            'damo/cv_rrdb_image-debanding'),
+    Tasks.image_portrait_enhancement:
+    (Pipelines.image_portrait_enhancement,
+     'damo/cv_gpen_image-portrait-enhancement'),
+    Tasks.product_retrieval_embedding:
+    (Pipelines.product_retrieval_embedding,
+     'damo/cv_resnet50_product-bag-embedding-models'),
+    Tasks.image_to_image_generation:
+    (Pipelines.image_to_image_generation,
+     'damo/cv_latent_diffusion_image2image_generate'),
+    Tasks.image_classification:
+    (Pipelines.daily_image_classification,
+     'damo/cv_vit-base_image-classification_Dailylife-labels'),
+    Tasks.image_object_detection:
+    (Pipelines.image_object_detection_auto,
+     'damo/cv_yolox_image-object-detection-auto'),
+    Tasks.ocr_recognition:
+    (Pipelines.ocr_recognition,
+     'damo/cv_convnextTiny_ocr-recognition-general_damo'),
+    Tasks.skin_retouching: (Pipelines.skin_retouching,
+                            'damo/cv_unet_skin-retouching'),
+    Tasks.faq_question_answering:
+    (Pipelines.faq_question_answering,
+     'damo/nlp_structbert_faq-question-answering_chinese-base'),
+    Tasks.crowd_counting: (Pipelines.crowd_counting,
+                           'damo/cv_hrnet_crowd-counting_dcanet'),
+    Tasks.video_single_object_tracking:
+    (Pipelines.video_single_object_tracking,
+     'damo/cv_vitb_video-single-object-tracking_ostrack'),
+    Tasks.image_reid_person: (Pipelines.image_reid_person,
+                              'damo/cv_passvitb_image-reid-person_market'),
+    Tasks.text_driven_segmentation:
+    (Pipelines.text_driven_segmentation,
+     'damo/cv_vitl16_segmentation_text-driven-seg'),
+    Tasks.movie_scene_segmentation: (
+        Pipelines.movie_scene_segmentation,
+        'damo/cv_resnet50-bert_video-scene-segmentation_movienet'),
+    Tasks.shop_segmentation: (Pipelines.shop_segmentation,
+                              'damo/cv_vitb16_segmentation_shop-seg'),
+    Tasks.image_inpainting: (Pipelines.image_inpainting,
+                             'damo/cv_fft_inpainting_lama'),
+    Tasks.image_paintbyexample: (Pipelines.image_paintbyexample,
+                                 'damo/cv_stable-diffusion_paint-by-example'),
+    Tasks.video_inpainting: (Pipelines.video_inpainting,
+                             'damo/cv_video-inpainting'),
+    Tasks.video_human_matting: (Pipelines.video_human_matting,
+                                'damo/cv_effnetv2_video-human-matting'),
+    Tasks.video_frame_interpolation: (
+        Pipelines.video_frame_interpolation,
+        'damo/cv_raft_video-frame-interpolation'),
+    Tasks.video_deinterlace: (Pipelines.video_deinterlace,
+                              'damo/cv_unet_video-deinterlace'),
+    Tasks.human_wholebody_keypoint: (
+        Pipelines.human_wholebody_keypoint,
+        'damo/cv_hrnetw48_human-wholebody-keypoint_image'),
+    Tasks.hand_static: (Pipelines.hand_static,
+                        'damo/cv_mobileface_hand-static'),
+    Tasks.face_human_hand_detection: (
+        Pipelines.face_human_hand_detection,
+        'damo/cv_nanodet_face-human-hand-detection'),
+    Tasks.face_emotion: (Pipelines.face_emotion, 'damo/cv_face-emotion'),
+    Tasks.product_segmentation: (Pipelines.product_segmentation,
+                                 'damo/cv_F3Net_product-segmentation'),
+    Tasks.referring_video_object_segmentation: (
+        Pipelines.referring_video_object_segmentation,
+        'damo/cv_swin-t_referring_video-object-segmentation'),
+    Tasks.video_summarization: (Pipelines.video_summarization,
+                                'damo/cv_googlenet_pgl-video-summarization'),
+    Tasks.image_skychange: (Pipelines.image_skychange,
+                            'damo/cv_hrnetocr_skychange'),
+    Tasks.translation_evaluation: (
+        Pipelines.translation_evaluation,
+        'damo/nlp_unite_mup_translation_evaluation_multilingual_large'),
+    Tasks.video_object_segmentation: (
+        Pipelines.video_object_segmentation,
+        'damo/cv_rdevos_video-object-segmentation'),
+    Tasks.video_multi_object_tracking: (
+        Pipelines.video_multi_object_tracking,
+        'damo/cv_yolov5_video-multi-object-tracking_fairmot'),
+    Tasks.image_multi_view_depth_estimation: (
+        Pipelines.image_multi_view_depth_estimation,
+        'damo/cv_casmvs_multi-view-depth-estimation_general'),
+    Tasks.image_fewshot_detection: (
+        Pipelines.image_fewshot_detection,
+        'damo/cv_resnet101_detection_fewshot-defrcn'),
+    Tasks.image_body_reshaping: (Pipelines.image_body_reshaping,
+                                 'damo/cv_flow-based-body-reshaping_damo'),
+    Tasks.image_face_fusion: (Pipelines.image_face_fusion,
+                              'damo/cv_unet-image-face-fusion_damo'),
+    Tasks.image_matching: (
+        Pipelines.image_matching,
+        'damo/cv_quadtree_attention_image-matching_outdoor'),
+    Tasks.image_quality_assessment_mos: (
+        Pipelines.image_quality_assessment_mos,
+        'damo/cv_resnet_image-quality-assessment-mos_youtubeUGC'),
+    Tasks.image_quality_assessment_degradation: (
+        Pipelines.image_quality_assessment_degradation,
+        'damo/cv_resnet50_image-quality-assessment_degradation'),
+    Tasks.vision_efficient_tuning: (
+        Pipelines.vision_efficient_tuning,
+        'damo/cv_vitb16_classification_vision-efficient-tuning-adapter'),
+    Tasks.object_detection_3d: (Pipelines.object_detection_3d_depe,
+                                'damo/cv_object-detection-3d_depe'),
+    Tasks.bad_image_detecting: (Pipelines.bad_image_detecting,
+                                'damo/cv_mobilenet-v2_bad-image-detecting'),
+    Tasks.nerf_recon_acc: (Pipelines.nerf_recon_acc,
+                           'damo/cv_nerf-3d-reconstruction-accelerate_damo'),
+    Tasks.siamese_uie: (Pipelines.siamese_uie,
+                        'damo/nlp_structbert_siamese-uie_chinese-base'),
+}
 
-        Holds the standard trainer name to use for identifying different trainer.
-    This should be used to register trainers.
-
-        For a general Trainer, you can use EpochBasedTrainer.
-        For a model specific Trainer, you can use ${ModelName}-${Task}-trainer.
-    """
-
-    default = 'trainer'
-    easycv = 'easycv'
-    tinynas_damoyolo = 'tinynas-damoyolo'
-
-    # multi-modal trainers
-    clip_multi_modal_embedding = 'clip-multi-modal-embedding'
-    ofa = 'ofa'
-    mplug = 'mplug'
-    mgeo_ranking_trainer = 'mgeo-ranking-trainer'
 
+class CVTrainers(object):
     # cv trainers
     image_instance_segmentation = 'image-instance-segmentation'
     image_portrait_enhancement = 'image-portrait-enhancement'
@@ -424,6 +797,8 @@ class Trainers(object):
     image_classification = 'image-classification'
     image_fewshot_detection = 'image-fewshot-detection'
 
+
+class NLPTrainers(object):
     # nlp trainers
     bert_sentiment_analysis = 'bert-sentiment-analysis'
     dialog_modeling_trainer = 'dialog-modeling-trainer'
@@ -431,14 +806,26 @@ class Trainers(object):
     nlp_base_trainer = 'nlp-base-trainer'
     nlp_veco_trainer = 'nlp-veco-trainer'
     nlp_text_ranking_trainer = 'nlp-text-ranking-trainer'
+    nlp_sentence_embedding_trainer = 'nlp-sentence-embedding-trainer'
     text_generation_trainer = 'text-generation-trainer'
     nlp_plug_trainer = 'nlp-plug-trainer'
     gpt3_trainer = 'nlp-gpt3-trainer'
     faq_question_answering_trainer = 'faq-question-answering-trainer'
     gpt_moe_trainer = 'nlp-gpt-moe-trainer'
     table_question_answering_trainer = 'table-question-answering-trainer'
+    document_grounded_dialog_generate_trainer = 'document-grounded-dialog-generate-trainer'
+    document_grounded_dialog_rerank_trainer = 'document-grounded-dialog-rerank-trainer'
+    document_grounded_dialog_retrieval_trainer = 'document-grounded-dialog-retrieval-trainer'
 
-    # audio trainers
+
+class MultiModalTrainers(object):
+    clip_multi_modal_embedding = 'clip-multi-modal-embedding'
+    ofa = 'ofa'
+    mplug = 'mplug'
+    mgeo_ranking_trainer = 'mgeo-ranking-trainer'
+
+
+class AudioTrainers(object):
     speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
     speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
     speech_kws_fsmn_char_ctc_nearfield = 'speech_kws_fsmn_char_ctc_nearfield'
@@ -447,6 +834,45 @@ class Trainers(object):
     speech_separation = 'speech-separation'
 
 
+class Trainers(CVTrainers, NLPTrainers, MultiModalTrainers, AudioTrainers):
+    """ Names for different trainer.
+
+        Holds the standard trainer name to use for identifying different trainer.
+    This should be used to register trainers.
+
+        For a general Trainer, you can use EpochBasedTrainer.
+        For a model specific Trainer, you can use ${ModelName}-${Task}-trainer.
+    """
+
+    default = 'trainer'
+    easycv = 'easycv'
+    tinynas_damoyolo = 'tinynas-damoyolo'
+
+    @staticmethod
+    def get_trainer_domain(attribute_or_value):
+        if attribute_or_value in vars(
+                CVTrainers) or attribute_or_value in vars(CVTrainers).values():
+            return Fields.cv
+        elif attribute_or_value in vars(
+                NLPTrainers) or attribute_or_value in vars(
+                    NLPTrainers).values():
+            return Fields.nlp
+        elif attribute_or_value in vars(
+                AudioTrainers) or attribute_or_value in vars(
+                    AudioTrainers).values():
+            return Fields.audio
+        elif attribute_or_value in vars(
+                MultiModalTrainers) or attribute_or_value in vars(
+                    MultiModalTrainers).values():
+            return Fields.multi_modal
+        elif attribute_or_value == Trainers.default:
+            return Trainers.default
+        elif attribute_or_value == Trainers.easycv:
+            return Trainers.easycv
+        else:
+            return 'unknown'
+
+
 class Preprocessors(object):
     """ Names for different preprocessor.
 
@@ -466,12 +892,18 @@ class Preprocessors(object):
     image_classification_mmcv_preprocessor = 'image-classification-mmcv-preprocessor'
     image_color_enhance_preprocessor = 'image-color-enhance-preprocessor'
     image_instance_segmentation_preprocessor = 'image-instance-segmentation-preprocessor'
+    image_driving_perception_preprocessor = 'image-driving-perception-preprocessor'
     image_portrait_enhancement_preprocessor = 'image-portrait-enhancement-preprocessor'
+    image_quality_assessment_mos_preprocessor = 'image-quality_assessment-mos-preprocessor'
     video_summarization_preprocessor = 'video-summarization-preprocessor'
     movie_scene_segmentation_preprocessor = 'movie-scene-segmentation-preprocessor'
     image_classification_bypass_preprocessor = 'image-classification-bypass-preprocessor'
     object_detection_scrfd = 'object-detection-scrfd'
     image_sky_change_preprocessor = 'image-sky-change-preprocessor'
+    image_demoire_preprocessor = 'image-demoire-preprocessor'
+    ocr_recognition = 'ocr-recognition'
+    bad_image_detecting_preprocessor = 'bad-image-detecting-preprocessor'
+    nerf_recon_acc_preprocessor = 'nerf-recon-acc-preprocessor'
 
     # nlp preprocessor
     sen_sim_tokenizer = 'sen-sim-tokenizer'
@@ -510,6 +942,10 @@ class Preprocessors(object):
     sentence_piece = 'sentence-piece'
     translation_evaluation = 'translation-evaluation-preprocessor'
     dialog_use_preprocessor = 'dialog-use-preprocessor'
+    siamese_uie_preprocessor = 'siamese-uie-preprocessor'
+    document_grounded_dialog_retrieval = 'document-grounded-dialog-retrieval'
+    document_grounded_dialog_rerank = 'document-grounded-dialog-rerank'
+    document_grounded_dialog_generate = 'document-grounded-dialog-generate'
 
     # audio preprocessor
     linear_aec_fbank = 'linear-aec-fbank'
@@ -555,10 +991,14 @@ class Metrics(object):
     image_ins_seg_coco_metric = 'image-ins-seg-coco-metric'
     # metrics for sequence classification task
     seq_cls_metric = 'seq-cls-metric'
+    # loss metric
+    loss_metric = 'loss-metric'
     # metrics for token-classification task
     token_cls_metric = 'token-cls-metric'
     # metrics for text-generation task
     text_gen_metric = 'text-gen-metric'
+    # file saving wrapper
+    prediction_saving_wrapper = 'prediction-saving-wrapper'
     # metrics for image-color-enhance task
     image_color_enhance_metric = 'image-color-enhance-metric'
     # metrics for image-portrait-enhancement task
@@ -576,6 +1016,12 @@ class Metrics(object):
     referring_video_object_segmentation_metric = 'referring-video-object-segmentation-metric'
     # metric for video stabilization task
     video_stabilization_metric = 'video-stabilization-metric'
+    # metirc for image-quality-assessment-mos task
+    image_quality_assessment_mos_metric = 'image-quality-assessment-mos-metric'
+    # metirc for image-quality-assessment-degradation task
+    image_quality_assessment_degradation_metric = 'image-quality-assessment-degradation-metric'
+    # metric for text-ranking task
+    text_ranking_metric = 'text-ranking-metric'
 
 
 class Optimizers(object):
@@ -609,6 +1055,7 @@ class Hooks(object):
     # checkpoint
     CheckpointHook = 'CheckpointHook'
     BestCkptSaverHook = 'BestCkptSaverHook'
+    LoadCheckpointHook = 'LoadCheckpointHook'
 
     # logger
     TextLoggerHook = 'TextLoggerHook'
diff --git a/modelscope/metrics/__init__.py b/modelscope/metrics/__init__.py
index f814cf4d..e463ea63 100644
--- a/modelscope/metrics/__init__.py
+++ b/modelscope/metrics/__init__.py
@@ -25,7 +25,10 @@ if TYPE_CHECKING:
     from .video_stabilization_metric import VideoStabilizationMetric
     from .video_super_resolution_metric.video_super_resolution_metric import VideoSuperResolutionMetric
     from .ppl_metric import PplMetric
-
+    from .image_quality_assessment_degradation_metric import ImageQualityAssessmentDegradationMetric
+    from .image_quality_assessment_mos_metric import ImageQualityAssessmentMosMetric
+    from .text_ranking_metric import TextRankingMetric
+    from .loss_metric import LossMetric
 else:
     _import_structure = {
         'audio_noise_metric': ['AudioNoiseMetric'],
@@ -50,6 +53,12 @@ else:
         'video_frame_interpolation_metric': ['VideoFrameInterpolationMetric'],
         'video_stabilization_metric': ['VideoStabilizationMetric'],
         'ppl_metric': ['PplMetric'],
+        'image_quality_assessment_degradation_metric':
+        ['ImageQualityAssessmentDegradationMetric'],
+        'image_quality_assessment_mos_metric':
+        ['ImageQualityAssessmentMosMetric'],
+        'text_ranking_metric': ['TextRankingMetric'],
+        'loss_metric': ['LossMetric']
     }
 
     import sys
diff --git a/modelscope/metrics/accuracy_metric.py b/modelscope/metrics/accuracy_metric.py
index b1976d8e..2327a9c7 100644
--- a/modelscope/metrics/accuracy_metric.py
+++ b/modelscope/metrics/accuracy_metric.py
@@ -8,6 +8,7 @@ from modelscope.metainfo import Metrics
 from modelscope.outputs import OutputKeys
 from modelscope.utils.chinese_utils import remove_space_between_chinese_chars
 from modelscope.utils.registry import default_group
+from modelscope.utils.tensor_utils import torch_nested_numpify
 from .base import Metric
 from .builder import METRICS, MetricKeys
 
@@ -36,8 +37,10 @@ class AccuracyMetric(Metric):
                 eval_results = outputs[key]
                 break
         assert type(ground_truths) == type(eval_results)
+        ground_truths = torch_nested_numpify(ground_truths)
         for truth in ground_truths:
             self.labels.append(truth)
+        eval_results = torch_nested_numpify(eval_results)
         for result in eval_results:
             if isinstance(truth, str):
                 if isinstance(result, list):
diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py
index 025187fd..76278288 100644
--- a/modelscope/metrics/builder.py
+++ b/modelscope/metrics/builder.py
@@ -12,7 +12,9 @@ METRICS = Registry('metrics')
 class MetricKeys(object):
     ACCURACY = 'accuracy'
     F1 = 'f1'
+    Binary_F1 = 'binary-f1'
     Macro_F1 = 'macro-f1'
+    Micro_F1 = 'micro-f1'
     PRECISION = 'precision'
     RECALL = 'recall'
     PSNR = 'psnr'
@@ -33,6 +35,11 @@ class MetricKeys(object):
     DISTORTION_VALUE = 'distortion_value'
     STABILITY_SCORE = 'stability_score'
     PPL = 'ppl'
+    PLCC = 'plcc'
+    SRCC = 'srcc'
+    RMSE = 'rmse'
+    MRR = 'mrr'
+    NDCG = 'ndcg'
 
 
 task_default_metrics = {
@@ -59,6 +66,10 @@ task_default_metrics = {
     Tasks.video_frame_interpolation:
     [Metrics.video_frame_interpolation_metric],
     Tasks.video_stabilization: [Metrics.video_stabilization_metric],
+    Tasks.image_quality_assessment_degradation:
+    [Metrics.image_quality_assessment_degradation_metric],
+    Tasks.image_quality_assessment_mos:
+    [Metrics.image_quality_assessment_mos_metric],
 }
 
 
diff --git a/modelscope/metrics/image_quality_assessment_degradation_metric.py b/modelscope/metrics/image_quality_assessment_degradation_metric.py
new file mode 100644
index 00000000..8bb8aa7f
--- /dev/null
+++ b/modelscope/metrics/image_quality_assessment_degradation_metric.py
@@ -0,0 +1,75 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+import sys
+import tempfile
+from collections import defaultdict
+from typing import Dict
+
+import cv2
+import numpy as np
+import torch
+from scipy.stats import pearsonr, spearmanr
+from tqdm import tqdm
+
+from modelscope.metainfo import Metrics
+from modelscope.utils.registry import default_group
+from .base import Metric
+from .builder import METRICS, MetricKeys
+
+
+@METRICS.register_module(
+    group_key=default_group,
+    module_name=Metrics.image_quality_assessment_degradation_metric)
+class ImageQualityAssessmentDegradationMetric(Metric):
+    """The metric for image-quality-assessment-degradation task.
+    """
+
+    def __init__(self):
+        self.inputs = defaultdict(list)
+        self.outputs = defaultdict(list)
+
+    def add(self, outputs: Dict, inputs: Dict):
+        item_degradation_id = outputs['item_id'][0] + outputs[
+            'distortion_type'][0]
+        if outputs['distortion_type'][0] in ['01', '02', '03']:
+            pred = outputs['blur_degree']
+        elif outputs['distortion_type'][0] in ['09', '10', '21']:
+            pred = outputs['comp_degree']
+        elif outputs['distortion_type'][0] in ['11', '12', '13', '14']:
+            pred = outputs['noise_degree']
+        else:
+            return
+
+        self.outputs[item_degradation_id].append(pred[0].float())
+        self.inputs[item_degradation_id].append(outputs['target'].float())
+
+    def evaluate(self):
+        degree_plccs = []
+        degree_sroccs = []
+
+        for item_degradation_id, degree_value in self.inputs.items():
+            degree_label = torch.cat(degree_value).flatten().data.cpu().numpy()
+            degree_pred = torch.cat(self.outputs[item_degradation_id]).flatten(
+            ).data.cpu().numpy()
+            degree_plcc = pearsonr(degree_label, degree_pred)[0]
+            degree_srocc = spearmanr(degree_label, degree_pred)[0]
+            degree_plccs.append(degree_plcc)
+            degree_sroccs.append(degree_srocc)
+        degree_plcc_mean = np.array(degree_plccs).mean()
+        degree_srocc_mean = np.array(degree_sroccs).mean()
+
+        return {
+            MetricKeys.PLCC: degree_plcc_mean,
+            MetricKeys.SRCC: degree_srocc_mean,
+        }
+
+    def merge(self, other: 'ImageQualityAssessmentDegradationMetric'):
+        self.inputs.extend(other.inputs)
+        self.outputs.extend(other.outputs)
+
+    def __getstate__(self):
+        return self.inputs, self.outputs
+
+    def __setstate__(self, state):
+        self.inputs, self.outputs = state
diff --git a/modelscope/metrics/image_quality_assessment_mos_metric.py b/modelscope/metrics/image_quality_assessment_mos_metric.py
new file mode 100644
index 00000000..14be7fe2
--- /dev/null
+++ b/modelscope/metrics/image_quality_assessment_mos_metric.py
@@ -0,0 +1,57 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+import sys
+import tempfile
+from typing import Dict
+
+import cv2
+import numpy as np
+import torch
+from scipy.stats import pearsonr, spearmanr
+from tqdm import tqdm
+
+from modelscope.metainfo import Metrics
+from modelscope.utils.registry import default_group
+from .base import Metric
+from .builder import METRICS, MetricKeys
+
+
+@METRICS.register_module(
+    group_key=default_group,
+    module_name=Metrics.image_quality_assessment_mos_metric)
+class ImageQualityAssessmentMosMetric(Metric):
+    """The metric for image-quality-assessment-mos task.
+    """
+
+    def __init__(self):
+        self.inputs = []
+        self.outputs = []
+
+    def add(self, outputs: Dict, inputs: Dict):
+        self.outputs.append(outputs['pred'].float())
+        self.inputs.append(outputs['target'].float())
+
+    def evaluate(self):
+
+        mos_labels = torch.cat(self.inputs).flatten().data.cpu().numpy()
+        mos_preds = torch.cat(self.outputs).flatten().data.cpu().numpy()
+        mos_plcc = pearsonr(mos_labels, mos_preds)[0]
+        mos_srocc = spearmanr(mos_labels, mos_preds)[0]
+        mos_rmse = np.sqrt(np.mean((mos_labels - mos_preds)**2))
+
+        return {
+            MetricKeys.PLCC: mos_plcc,
+            MetricKeys.SRCC: mos_srocc,
+            MetricKeys.RMSE: mos_rmse,
+        }
+
+    def merge(self, other: 'ImageQualityAssessmentMosMetric'):
+        self.inputs.extend(other.inputs)
+        self.outputs.extend(other.outputs)
+
+    def __getstate__(self):
+        return self.inputs, self.outputs
+
+    def __setstate__(self, state):
+        self.inputs, self.outputs = state
diff --git a/modelscope/metrics/loss_metric.py b/modelscope/metrics/loss_metric.py
new file mode 100644
index 00000000..923d10b6
--- /dev/null
+++ b/modelscope/metrics/loss_metric.py
@@ -0,0 +1,46 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Dict
+
+import numpy as np
+from sklearn.metrics import accuracy_score, f1_score
+
+from modelscope.metainfo import Metrics
+from modelscope.outputs import OutputKeys
+from modelscope.utils.registry import default_group
+from modelscope.utils.tensor_utils import (torch_nested_detach,
+                                           torch_nested_numpify)
+from .base import Metric
+from .builder import METRICS, MetricKeys
+
+
+@METRICS.register_module(
+    group_key=default_group, module_name=Metrics.loss_metric)
+class LossMetric(Metric):
+    """The metric class to calculate average loss of batches.
+
+    Args:
+        loss_key: The key of loss
+    """
+
+    def __init__(self, loss_key=OutputKeys.LOSS, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.loss_key = loss_key
+        self.losses = []
+
+    def add(self, outputs: Dict, inputs: Dict):
+        loss = outputs[self.loss_key]
+        self.losses.append(torch_nested_numpify(torch_nested_detach(loss)))
+
+    def evaluate(self):
+        return {OutputKeys.LOSS: float(np.average(self.losses))}
+
+    def merge(self, other: 'LossMetric'):
+        self.losses.extend(other.losses)
+
+    def __getstate__(self):
+        return self.losses
+
+    def __setstate__(self, state):
+        self.__init__()
+        self.losses = state
diff --git a/modelscope/metrics/prediction_saving_wrapper.py b/modelscope/metrics/prediction_saving_wrapper.py
new file mode 100644
index 00000000..c7aee4e1
--- /dev/null
+++ b/modelscope/metrics/prediction_saving_wrapper.py
@@ -0,0 +1,42 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Dict
+
+import numpy as np
+from sklearn.metrics import accuracy_score, f1_score
+
+from modelscope.metainfo import Metrics
+from modelscope.outputs import OutputKeys
+from modelscope.utils.registry import default_group
+from modelscope.utils.tensor_utils import (torch_nested_detach,
+                                           torch_nested_numpify)
+from .base import Metric
+from .builder import METRICS, MetricKeys
+
+
+@METRICS.register_module(
+    group_key=default_group, module_name=Metrics.prediction_saving_wrapper)
+class PredictionSavingWrapper(Metric):
+    """The wrapper to save predictions to file.
+    Args:
+        saving_fn: The saving_fn used to save predictions to files.
+    """
+
+    def __init__(self, saving_fn, **kwargs):
+        super().__init__(**kwargs)
+        self.saving_fn = saving_fn
+
+    def add(self, outputs: Dict, inputs: Dict):
+        self.saving_fn(inputs, outputs)
+
+    def evaluate(self):
+        return {}
+
+    def merge(self, other: 'PredictionSavingWrapper'):
+        pass
+
+    def __getstate__(self):
+        pass
+
+    def __setstate__(self, state):
+        pass
diff --git a/modelscope/metrics/sequence_classification_metric.py b/modelscope/metrics/sequence_classification_metric.py
index 5a817691..1b9e57cb 100644
--- a/modelscope/metrics/sequence_classification_metric.py
+++ b/modelscope/metrics/sequence_classification_metric.py
@@ -48,19 +48,29 @@ class SequenceClassificationMetric(Metric):
     def evaluate(self):
         preds = np.concatenate(self.preds, axis=0)
         labels = np.concatenate(self.labels, axis=0)
-        preds = np.argmax(preds, axis=1)
-        return {
-            MetricKeys.ACCURACY:
-            accuracy_score(labels, preds),
-            MetricKeys.F1:
-            f1_score(
-                labels,
-                preds,
-                average='micro' if any([label > 1
-                                        for label in labels]) else None),
-            MetricKeys.Macro_F1:
-            f1_score(labels, preds, average='macro'),
-        }
+        assert len(preds.shape) == 2, 'Only support predictions with shape: (batch_size, num_labels),' \
+                                      'multi-label classification is not supported in this metric class.'
+        preds_max = np.argmax(preds, axis=1)
+        if preds.shape[1] > 2:
+            metrics = {
+                MetricKeys.ACCURACY: accuracy_score(labels, preds_max),
+                MetricKeys.Micro_F1:
+                f1_score(labels, preds_max, average='micro'),
+                MetricKeys.Macro_F1:
+                f1_score(labels, preds_max, average='macro'),
+            }
+
+            metrics[MetricKeys.F1] = metrics[MetricKeys.Micro_F1]
+            return metrics
+        else:
+            metrics = {
+                MetricKeys.ACCURACY:
+                accuracy_score(labels, preds_max),
+                MetricKeys.Binary_F1:
+                f1_score(labels, preds_max, average='binary'),
+            }
+            metrics[MetricKeys.F1] = metrics[MetricKeys.Binary_F1]
+            return metrics
 
     def merge(self, other: 'SequenceClassificationMetric'):
         self.preds.extend(other.preds)
diff --git a/modelscope/metrics/text_ranking_metric.py b/modelscope/metrics/text_ranking_metric.py
new file mode 100644
index 00000000..f3c2448e
--- /dev/null
+++ b/modelscope/metrics/text_ranking_metric.py
@@ -0,0 +1,91 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Dict, List
+
+import numpy as np
+
+from modelscope.metainfo import Metrics
+from modelscope.metrics.base import Metric
+from modelscope.metrics.builder import METRICS, MetricKeys
+from modelscope.utils.registry import default_group
+
+
+@METRICS.register_module(
+    group_key=default_group, module_name=Metrics.text_ranking_metric)
+class TextRankingMetric(Metric):
+    """The metric computation class for text ranking classes.
+
+    This metric class calculates mrr and ndcg metric for the whole evaluation dataset.
+
+    Args:
+        target_text: The key of the target text column in the `inputs` arg.
+        pred_text: The key of the predicted text column in the `outputs` arg.
+    """
+
+    def __init__(self, mrr_k: int = 1, ndcg_k: int = 1):
+        self.labels: List = []
+        self.qids: List = []
+        self.logits: List = []
+        self.mrr_k: int = mrr_k
+        self.ndcg_k: int = ndcg_k
+
+    def add(self, outputs: Dict[str, List], inputs: Dict[str, List]):
+        self.labels.extend(inputs.pop('labels').detach().cpu().numpy())
+        self.qids.extend(inputs.pop('qid').detach().cpu().numpy())
+
+        logits = outputs['logits'].squeeze(-1).detach().cpu().numpy()
+        logits = self._sigmoid(logits).tolist()
+        self.logits.extend(logits)
+
+    def evaluate(self):
+        rank_result = {}
+        for qid, score, label in zip(self.qids, self.logits, self.labels):
+            if qid not in rank_result:
+                rank_result[qid] = []
+            rank_result[qid].append((score, label))
+
+        for qid in rank_result:
+            rank_result[qid] = sorted(rank_result[qid], key=lambda x: x[0])
+
+        return {
+            MetricKeys.MRR: self._compute_mrr(rank_result),
+            MetricKeys.NDCG: self._compute_ndcg(rank_result)
+        }
+
+    @staticmethod
+    def _sigmoid(logits):
+        return np.exp(logits) / (1 + np.exp(logits))
+
+    def _compute_mrr(self, result):
+        mrr = 0
+        for res in result.values():
+            sorted_res = sorted(res, key=lambda x: x[0], reverse=True)
+            ar = 0
+            for index, ele in enumerate(sorted_res[:self.mrr_k]):
+                if str(ele[1]) == '1':
+                    ar = 1.0 / (index + 1)
+                    break
+            mrr += ar
+        return mrr / len(result)
+
+    def _compute_ndcg(self, result):
+        ndcg = 0
+        from sklearn.metrics import ndcg_score
+        for res in result.values():
+            sorted_res = sorted(res, key=lambda x: [0], reverse=True)
+            labels = np.array([[ele[1] for ele in sorted_res]])
+            scores = np.array([[ele[0] for ele in sorted_res]])
+            ndcg += float(ndcg_score(labels, scores, k=self.ndcg_k))
+        return ndcg / len(result)
+
+    def merge(self, other: 'TextRankingMetric'):
+        self.labels.extend(other.labels)
+        self.qids.extend(other.qids)
+        self.logits.extend(other.logits)
+
+    def __getstate__(self):
+        return self.labels, self.qids, self.logits, self.mrr_k, self.ndcg_k
+
+    def __setstate__(self, state):
+        self.__init__()
+        self.labels, self.qids, self.logits, self.mrr_k, self.ndcg_k = state
diff --git a/modelscope/models/__init__.py b/modelscope/models/__init__.py
index e7cb2adc..24d86dfd 100644
--- a/modelscope/models/__init__.py
+++ b/modelscope/models/__init__.py
@@ -9,4 +9,5 @@ from .base import Head, Model
 from .builder import BACKBONES, HEADS, MODELS, build_model
 
 if is_torch_available():
-    from .base import TorchModel, TorchHead
+    from .base.base_torch_model import TorchModel
+    from .base.base_torch_head import TorchHead
diff --git a/modelscope/models/audio/__init__.py b/modelscope/models/audio/__init__.py
index 3c3ba54a..740086d8 100644
--- a/modelscope/models/audio/__init__.py
+++ b/modelscope/models/audio/__init__.py
@@ -1,3 +1,3 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from . import ans, asr, itn, kws, tts
+from . import ans, asr, itn, kws, sv, tts
diff --git a/modelscope/models/audio/asr/generic_automatic_speech_recognition.py b/modelscope/models/audio/asr/generic_automatic_speech_recognition.py
index aebc6751..8e73b738 100644
--- a/modelscope/models/audio/asr/generic_automatic_speech_recognition.py
+++ b/modelscope/models/audio/asr/generic_automatic_speech_recognition.py
@@ -13,6 +13,9 @@ __all__ = ['GenericAutomaticSpeechRecognition']
 
 @MODELS.register_module(
     Tasks.auto_speech_recognition, module_name=Models.generic_asr)
+@MODELS.register_module(
+    Tasks.voice_activity_detection, module_name=Models.generic_asr)
+@MODELS.register_module(Tasks.language_model, module_name=Models.generic_asr)
 class GenericAutomaticSpeechRecognition(Model):
 
     def __init__(self, model_dir: str, am_model_name: str,
diff --git a/modelscope/models/audio/separation/mossformer.py b/modelscope/models/audio/separation/mossformer.py
index 2316bb26..5264beef 100644
--- a/modelscope/models/audio/separation/mossformer.py
+++ b/modelscope/models/audio/separation/mossformer.py
@@ -120,13 +120,12 @@ class Encoder(nn.Module):
         in_channels: Number of  input channels.
         out_channels: Number of output channels.
 
-    Example:
-    -------
+    Examples:
+
     >>> x = torch.randn(2, 1000)
     >>> encoder = Encoder(kernel_size=4, out_channels=64)
     >>> h = encoder(x)
-    >>> h.shape
-    torch.Size([2, 64, 499])
+    >>> h.shape # torch.Size([2, 64, 499])
     """
 
     def __init__(self,
diff --git a/modelscope/models/audio/sv/__init__.py b/modelscope/models/audio/sv/__init__.py
index e69de29b..cac0191a 100644
--- a/modelscope/models/audio/sv/__init__.py
+++ b/modelscope/models/audio/sv/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .ecapa_tdnn import SpeakerVerificationECAPATDNN
+
+else:
+    _import_structure = {'ecapa_tdnn': ['SpeakerVerificationECAPATDNN']}
+    import sys
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/audio/sv/ecapa_tdnn.py b/modelscope/models/audio/sv/ecapa_tdnn.py
new file mode 100644
index 00000000..0b655816
--- /dev/null
+++ b/modelscope/models/audio/sv/ecapa_tdnn.py
@@ -0,0 +1,504 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+""" This ECAPA-TDNN implementation is adapted from https://github.com/speechbrain/speechbrain.
+"""
+import math
+import os
+from typing import Any, Dict, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchaudio.compliance.kaldi as Kaldi
+
+from modelscope.metainfo import Models
+from modelscope.models import MODELS, TorchModel
+from modelscope.utils.constant import Tasks
+
+
+def length_to_mask(length, max_len=None, dtype=None, device=None):
+    assert len(length.shape) == 1
+
+    if max_len is None:
+        max_len = length.max().long().item()
+    mask = torch.arange(
+        max_len, device=length.device, dtype=length.dtype).expand(
+            len(length), max_len) < length.unsqueeze(1)
+
+    if dtype is None:
+        dtype = length.dtype
+
+    if device is None:
+        device = length.device
+
+    mask = torch.as_tensor(mask, dtype=dtype, device=device)
+    return mask
+
+
+def get_padding_elem(L_in: int, stride: int, kernel_size: int, dilation: int):
+    if stride > 1:
+        n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1)
+        L_out = stride * (n_steps - 1) + kernel_size * dilation
+        padding = [kernel_size // 2, kernel_size // 2]
+
+    else:
+        L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1
+
+        padding = [(L_in - L_out) // 2, (L_in - L_out) // 2]
+    return padding
+
+
+class Conv1d(nn.Module):
+
+    def __init__(
+        self,
+        out_channels,
+        kernel_size,
+        in_channels,
+        stride=1,
+        dilation=1,
+        padding='same',
+        groups=1,
+        bias=True,
+        padding_mode='reflect',
+    ):
+        super().__init__()
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.dilation = dilation
+        self.padding = padding
+        self.padding_mode = padding_mode
+
+        self.conv = nn.Conv1d(
+            in_channels,
+            out_channels,
+            self.kernel_size,
+            stride=self.stride,
+            dilation=self.dilation,
+            padding=0,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, x):
+        if self.padding == 'same':
+            x = self._manage_padding(x, self.kernel_size, self.dilation,
+                                     self.stride)
+
+        elif self.padding == 'causal':
+            num_pad = (self.kernel_size - 1) * self.dilation
+            x = F.pad(x, (num_pad, 0))
+
+        elif self.padding == 'valid':
+            pass
+
+        else:
+            raise ValueError(
+                "Padding must be 'same', 'valid' or 'causal'. Got "
+                + self.padding)
+
+        wx = self.conv(x)
+
+        return wx
+
+    def _manage_padding(
+        self,
+        x,
+        kernel_size: int,
+        dilation: int,
+        stride: int,
+    ):
+        L_in = x.shape[-1]
+        padding = get_padding_elem(L_in, stride, kernel_size, dilation)
+        x = F.pad(x, padding, mode=self.padding_mode)
+
+        return x
+
+
+class BatchNorm1d(nn.Module):
+
+    def __init__(
+        self,
+        input_size,
+        eps=1e-05,
+        momentum=0.1,
+    ):
+        super().__init__()
+        self.norm = nn.BatchNorm1d(
+            input_size,
+            eps=eps,
+            momentum=momentum,
+        )
+
+    def forward(self, x):
+        return self.norm(x)
+
+
+class TDNNBlock(nn.Module):
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        dilation,
+        activation=nn.ReLU,
+        groups=1,
+    ):
+        super(TDNNBlock, self).__init__()
+        self.conv = Conv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            dilation=dilation,
+            groups=groups,
+        )
+        self.activation = activation()
+        self.norm = BatchNorm1d(input_size=out_channels)
+
+    def forward(self, x):
+        return self.norm(self.activation(self.conv(x)))
+
+
+class Res2NetBlock(torch.nn.Module):
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 scale=8,
+                 kernel_size=3,
+                 dilation=1):
+        super(Res2NetBlock, self).__init__()
+        assert in_channels % scale == 0
+        assert out_channels % scale == 0
+
+        in_channel = in_channels // scale
+        hidden_channel = out_channels // scale
+
+        self.blocks = nn.ModuleList([
+            TDNNBlock(
+                in_channel,
+                hidden_channel,
+                kernel_size=kernel_size,
+                dilation=dilation,
+            ) for i in range(scale - 1)
+        ])
+        self.scale = scale
+
+    def forward(self, x):
+        y = []
+        for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)):
+            if i == 0:
+                y_i = x_i
+            elif i == 1:
+                y_i = self.blocks[i - 1](x_i)
+            else:
+                y_i = self.blocks[i - 1](x_i + y_i)
+            y.append(y_i)
+        y = torch.cat(y, dim=1)
+        return y
+
+
+class SEBlock(nn.Module):
+
+    def __init__(self, in_channels, se_channels, out_channels):
+        super(SEBlock, self).__init__()
+
+        self.conv1 = Conv1d(
+            in_channels=in_channels, out_channels=se_channels, kernel_size=1)
+        self.relu = torch.nn.ReLU(inplace=True)
+        self.conv2 = Conv1d(
+            in_channels=se_channels, out_channels=out_channels, kernel_size=1)
+        self.sigmoid = torch.nn.Sigmoid()
+
+    def forward(self, x, lengths=None):
+        L = x.shape[-1]
+        if lengths is not None:
+            mask = length_to_mask(lengths * L, max_len=L, device=x.device)
+            mask = mask.unsqueeze(1)
+            total = mask.sum(dim=2, keepdim=True)
+            s = (x * mask).sum(dim=2, keepdim=True) / total
+        else:
+            s = x.mean(dim=2, keepdim=True)
+
+        s = self.relu(self.conv1(s))
+        s = self.sigmoid(self.conv2(s))
+
+        return s * x
+
+
+class AttentiveStatisticsPooling(nn.Module):
+
+    def __init__(self, channels, attention_channels=128, global_context=True):
+        super().__init__()
+
+        self.eps = 1e-12
+        self.global_context = global_context
+        if global_context:
+            self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1)
+        else:
+            self.tdnn = TDNNBlock(channels, attention_channels, 1, 1)
+        self.tanh = nn.Tanh()
+        self.conv = Conv1d(
+            in_channels=attention_channels,
+            out_channels=channels,
+            kernel_size=1)
+
+    def forward(self, x, lengths=None):
+        L = x.shape[-1]
+
+        def _compute_statistics(x, m, dim=2, eps=self.eps):
+            mean = (m * x).sum(dim)
+            std = torch.sqrt(
+                (m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps))
+            return mean, std
+
+        if lengths is None:
+            lengths = torch.ones(x.shape[0], device=x.device)
+
+        # Make binary mask of shape [N, 1, L]
+        mask = length_to_mask(lengths * L, max_len=L, device=x.device)
+        mask = mask.unsqueeze(1)
+
+        # Expand the temporal context of the pooling layer by allowing the
+        # self-attention to look at global properties of the utterance.
+        if self.global_context:
+            # torch.std is unstable for backward computation
+            # https://github.com/pytorch/pytorch/issues/4320
+            total = mask.sum(dim=2, keepdim=True).float()
+            mean, std = _compute_statistics(x, mask / total)
+            mean = mean.unsqueeze(2).repeat(1, 1, L)
+            std = std.unsqueeze(2).repeat(1, 1, L)
+            attn = torch.cat([x, mean, std], dim=1)
+        else:
+            attn = x
+
+        # Apply layers
+        attn = self.conv(self.tanh(self.tdnn(attn)))
+
+        # Filter out zero-paddings
+        attn = attn.masked_fill(mask == 0, float('-inf'))
+
+        attn = F.softmax(attn, dim=2)
+        mean, std = _compute_statistics(x, attn)
+        # Append mean and std of the batch
+        pooled_stats = torch.cat((mean, std), dim=1)
+        pooled_stats = pooled_stats.unsqueeze(2)
+
+        return pooled_stats
+
+
+class SERes2NetBlock(nn.Module):
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        res2net_scale=8,
+        se_channels=128,
+        kernel_size=1,
+        dilation=1,
+        activation=torch.nn.ReLU,
+        groups=1,
+    ):
+        super().__init__()
+        self.out_channels = out_channels
+        self.tdnn1 = TDNNBlock(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            dilation=1,
+            activation=activation,
+            groups=groups,
+        )
+        self.res2net_block = Res2NetBlock(out_channels, out_channels,
+                                          res2net_scale, kernel_size, dilation)
+        self.tdnn2 = TDNNBlock(
+            out_channels,
+            out_channels,
+            kernel_size=1,
+            dilation=1,
+            activation=activation,
+            groups=groups,
+        )
+        self.se_block = SEBlock(out_channels, se_channels, out_channels)
+
+        self.shortcut = None
+        if in_channels != out_channels:
+            self.shortcut = Conv1d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1,
+            )
+
+    def forward(self, x, lengths=None):
+        residual = x
+        if self.shortcut:
+            residual = self.shortcut(x)
+
+        x = self.tdnn1(x)
+        x = self.res2net_block(x)
+        x = self.tdnn2(x)
+        x = self.se_block(x, lengths)
+
+        return x + residual
+
+
+class ECAPA_TDNN(nn.Module):
+    """An implementation of the speaker embedding model in a paper.
+    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
+    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
+    """
+
+    def __init__(
+        self,
+        input_size,
+        device='cpu',
+        lin_neurons=192,
+        activation=torch.nn.ReLU,
+        channels=[512, 512, 512, 512, 1536],
+        kernel_sizes=[5, 3, 3, 3, 1],
+        dilations=[1, 2, 3, 4, 1],
+        attention_channels=128,
+        res2net_scale=8,
+        se_channels=128,
+        global_context=True,
+        groups=[1, 1, 1, 1, 1],
+    ):
+
+        super().__init__()
+        assert len(channels) == len(kernel_sizes)
+        assert len(channels) == len(dilations)
+        self.channels = channels
+        self.blocks = nn.ModuleList()
+
+        # The initial TDNN layer
+        self.blocks.append(
+            TDNNBlock(
+                input_size,
+                channels[0],
+                kernel_sizes[0],
+                dilations[0],
+                activation,
+                groups[0],
+            ))
+
+        # SE-Res2Net layers
+        for i in range(1, len(channels) - 1):
+            self.blocks.append(
+                SERes2NetBlock(
+                    channels[i - 1],
+                    channels[i],
+                    res2net_scale=res2net_scale,
+                    se_channels=se_channels,
+                    kernel_size=kernel_sizes[i],
+                    dilation=dilations[i],
+                    activation=activation,
+                    groups=groups[i],
+                ))
+
+        # Multi-layer feature aggregation
+        self.mfa = TDNNBlock(
+            channels[-1],
+            channels[-1],
+            kernel_sizes[-1],
+            dilations[-1],
+            activation,
+            groups=groups[-1],
+        )
+
+        # Attentive Statistical Pooling
+        self.asp = AttentiveStatisticsPooling(
+            channels[-1],
+            attention_channels=attention_channels,
+            global_context=global_context,
+        )
+        self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2)
+
+        # Final linear transformation
+        self.fc = Conv1d(
+            in_channels=channels[-1] * 2,
+            out_channels=lin_neurons,
+            kernel_size=1,
+        )
+
+    def forward(self, x, lengths=None):
+        """Returns the embedding vector.
+
+        Arguments
+        ---------
+        x : torch.Tensor
+            Tensor of shape (batch, time, channel).
+        """
+        x = x.transpose(1, 2)
+
+        xl = []
+        for layer in self.blocks:
+            try:
+                x = layer(x, lengths=lengths)
+            except TypeError:
+                x = layer(x)
+            xl.append(x)
+
+        # Multi-layer feature aggregation
+        x = torch.cat(xl[1:], dim=1)
+        x = self.mfa(x)
+
+        # Attentive Statistical Pooling
+        x = self.asp(x, lengths=lengths)
+        x = self.asp_bn(x)
+
+        # Final linear transformation
+        x = self.fc(x)
+
+        x = x.transpose(1, 2).squeeze(1)
+        return x
+
+
+@MODELS.register_module(
+    Tasks.speaker_verification, module_name=Models.ecapa_tdnn_sv)
+class SpeakerVerificationECAPATDNN(TorchModel):
+
+    def __init__(self, model_dir, model_config: Dict[str, Any], *args,
+                 **kwargs):
+        super().__init__(model_dir, model_config, *args, **kwargs)
+        self.model_config = model_config
+        self.other_config = kwargs
+        if self.model_config['channel'] != 1024:
+            raise ValueError(
+                'modelscope error: Currently only 1024-channel ecapa tdnn is supported.'
+            )
+
+        self.feature_dim = 80
+        channels_config = [1024, 1024, 1024, 1024, 3072]
+
+        self.embedding_model = ECAPA_TDNN(
+            self.feature_dim, channels=channels_config)
+
+        pretrained_model_name = kwargs['pretrained_model']
+        self.__load_check_point(pretrained_model_name)
+
+        self.embedding_model.eval()
+
+    def forward(self, audio):
+        assert len(audio.shape) == 2 and audio.shape[
+            0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]'
+        # audio shape: [1, T]
+        feature = self.__extract_feature(audio)
+        embedding = self.embedding_model(feature)
+
+        return embedding
+
+    def __extract_feature(self, audio):
+        feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim)
+        feature = feature - feature.mean(dim=0, keepdim=True)
+        feature = feature.unsqueeze(0)
+        return feature
+
+    def __load_check_point(self, pretrained_model_name, device=None):
+        if not device:
+            device = torch.device('cpu')
+        self.embedding_model.load_state_dict(
+            torch.load(
+                os.path.join(self.model_dir, pretrained_model_name),
+                map_location=device),
+            strict=True)
diff --git a/modelscope/models/base/__init__.py b/modelscope/models/base/__init__.py
index 8c47ecaf..8b727354 100644
--- a/modelscope/models/base/__init__.py
+++ b/modelscope/models/base/__init__.py
@@ -1,6 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+from modelscope.utils.import_utils import is_torch_available
 from .base_head import *  # noqa F403
 from .base_model import *  # noqa F403
-from .base_torch_head import *  # noqa F403
-from .base_torch_model import *  # noqa F403
+
+if is_torch_available():
+    from .base_torch_model import TorchModel
+    from .base_torch_head import TorchHead
diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py
index d933d8ae..e0cedc34 100644
--- a/modelscope/models/base/base_model.py
+++ b/modelscope/models/base/base_model.py
@@ -2,13 +2,11 @@
 import os
 import os.path as osp
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from modelscope.hub.check_model import check_local_model_is_latest
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models.builder import build_model
-from modelscope.utils.checkpoint import (save_checkpoint, save_configuration,
-                                         save_pretrained)
 from modelscope.utils.config import Config
 from modelscope.utils.constant import DEFAULT_MODEL_REVISION, Invoke, ModelFile
 from modelscope.utils.device import verify_device
@@ -150,9 +148,7 @@ class Model(ABC):
     def save_pretrained(self,
                         target_folder: Union[str, os.PathLike],
                         save_checkpoint_names: Union[str, List[str]] = None,
-                        save_function: Callable = save_checkpoint,
                         config: Optional[dict] = None,
-                        save_config_function: Callable = save_configuration,
                         **kwargs):
         """save the pretrained model, its configuration and other related files to a directory,
             so that it can be re-loaded
@@ -164,21 +160,8 @@ class Model(ABC):
             save_checkpoint_names (Union[str, List[str]]):
             The checkpoint names to be saved in the target_folder
 
-            save_function (Callable, optional):
-            The function to use to save the state dictionary.
-
             config (Optional[dict], optional):
             The config for the configuration.json, might not be identical with model.config
-
-            save_config_function (Callble, optional):
-            The function to use to save the configuration.
-
         """
-        if config is None and hasattr(self, 'cfg'):
-            config = self.cfg
-
-        if config is not None:
-            save_config_function(target_folder, config)
-
-        save_pretrained(self, target_folder, save_checkpoint_names,
-                        save_function, **kwargs)
+        raise NotImplementedError(
+            'save_pretrained method need to be implemented by the subclass.')
diff --git a/modelscope/models/base/base_torch_model.py b/modelscope/models/base/base_torch_model.py
index 98221682..c3c3d40c 100644
--- a/modelscope/models/base/base_torch_model.py
+++ b/modelscope/models/base/base_torch_model.py
@@ -1,14 +1,16 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+import os
 from copy import deepcopy
-from typing import Any, Dict
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import torch
 from torch import nn
 from torch.nn.parallel import DataParallel, DistributedDataParallel
 
+from modelscope.utils.checkpoint import (save_checkpoint, save_configuration,
+                                         save_pretrained)
 from modelscope.utils.file_utils import func_receive_dict_inputs
-from modelscope.utils.hub import parse_label_mapping
 from modelscope.utils.logger import get_logger
 from .base_model import Model
 
@@ -88,3 +90,39 @@ class TorchModel(Model, torch.nn.Module):
         elif isinstance(module, nn.LayerNorm):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
+
+    def save_pretrained(self,
+                        target_folder: Union[str, os.PathLike],
+                        save_checkpoint_names: Union[str, List[str]] = None,
+                        save_function: Callable = save_checkpoint,
+                        config: Optional[dict] = None,
+                        save_config_function: Callable = save_configuration,
+                        **kwargs):
+        """save the pretrained model, its configuration and other related files to a directory,
+            so that it can be re-loaded
+
+        Args:
+            target_folder (Union[str, os.PathLike]):
+            Directory to which to save. Will be created if it doesn't exist.
+
+            save_checkpoint_names (Union[str, List[str]]):
+            The checkpoint names to be saved in the target_folder
+
+            save_function (Callable, optional):
+            The function to use to save the state dictionary.
+
+            config (Optional[dict], optional):
+            The config for the configuration.json, might not be identical with model.config
+
+            save_config_function (Callble, optional):
+            The function to use to save the configuration.
+
+        """
+        if config is None and hasattr(self, 'cfg'):
+            config = self.cfg
+
+        save_pretrained(self, target_folder, save_checkpoint_names,
+                        save_function, **kwargs)
+
+        if config is not None:
+            save_config_function(target_folder, config)
diff --git a/modelscope/models/builder.py b/modelscope/models/builder.py
index 2804c6c7..da18edd8 100644
--- a/modelscope/models/builder.py
+++ b/modelscope/models/builder.py
@@ -1,9 +1,13 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
+from modelscope.metainfo import Models
 from modelscope.utils.config import ConfigDict
 from modelscope.utils.constant import Tasks
 from modelscope.utils.import_utils import INDEX_KEY, LazyImportModule
-from modelscope.utils.registry import TYPE_NAME, Registry, build_from_cfg
+from modelscope.utils.logger import get_logger
+from modelscope.utils.registry import Registry, build_from_cfg
+from modelscope.utils.task_utils import get_task_by_subtask_name
+
+logger = get_logger()
 
 MODELS = Registry('models')
 BACKBONES = MODELS
@@ -27,8 +31,20 @@ def build_model(cfg: ConfigDict,
             :obj:`Tasks` for more details
         default_args (dict, optional): Default initialization arguments.
     """
-    return build_from_cfg(
-        cfg, MODELS, group_key=task_name, default_args=default_args)
+    try:
+        model = build_from_cfg(
+            cfg, MODELS, group_key=task_name, default_args=default_args)
+    except KeyError as e:
+        # Handle subtask with a backbone model that hasn't been registered
+        # All the subtask with a parent task should have a task model, otherwise it is not a
+        # valid subtask
+        parent_task, task_model_type = get_task_by_subtask_name(task_name)
+        if task_model_type is None:
+            raise KeyError(e)
+        cfg['type'] = task_model_type
+        model = build_from_cfg(
+            cfg, MODELS, group_key=parent_task, default_args=default_args)
+    return model
 
 
 def build_backbone(cfg: ConfigDict, default_args: dict = None):
@@ -38,8 +54,29 @@ def build_backbone(cfg: ConfigDict, default_args: dict = None):
         cfg (:obj:`ConfigDict`): config dict for backbone object.
         default_args (dict, optional): Default initialization arguments.
     """
-    return build_from_cfg(
-        cfg, BACKBONES, group_key=Tasks.backbone, default_args=default_args)
+    try:
+        model_dir = cfg.pop('model_dir', None)
+        model = build_from_cfg(
+            cfg,
+            BACKBONES,
+            group_key=Tasks.backbone,
+            default_args=default_args)
+    except KeyError:
+        # Handle backbone that is not in the register group by using transformers AutoModel.
+        # AutoModel are mostly using in NLP and part of Multi-Modal, while the number of backbone in CV、Audio and MM
+        # is limited, thus could be added and registered in Modelscope directly
+        logger.WARNING(
+            f'The backbone {cfg.type} is not registered in modelscope, try to import the backbone from hf transformers.'
+        )
+        cfg['type'] = Models.transformers
+        if model_dir is not None:
+            cfg['model_dir'] = model_dir
+        model = build_from_cfg(
+            cfg,
+            BACKBONES,
+            group_key=Tasks.backbone,
+            default_args=default_args)
+    return model
 
 
 def build_head(cfg: ConfigDict,
diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py
index 97aa0a62..5afa7f48 100644
--- a/modelscope/models/cv/__init__.py
+++ b/modelscope/models/cv/__init__.py
@@ -1,25 +1,29 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 # yapf: disable
-from . import (action_recognition, animal_recognition, body_2d_keypoints,
-               body_3d_keypoints, cartoon, cmdssl_video_embedding,
-               crowd_counting, face_2d_keypoints, face_detection,
-               face_generation, human_wholebody_keypoint, image_classification,
+from . import (action_recognition, animal_recognition, bad_image_detecting,
+               body_2d_keypoints, body_3d_keypoints, cartoon,
+               cmdssl_video_embedding, crowd_counting, face_2d_keypoints,
+               face_detection, face_generation, face_reconstruction,
+               human_wholebody_keypoint, image_classification,
                image_color_enhance, image_colorization, image_defrcn_fewshot,
                image_denoise, image_inpainting, image_instance_segmentation,
                image_matching, image_mvs_depth_estimation,
                image_panoptic_segmentation, image_portrait_enhancement,
-               image_reid_person, image_semantic_segmentation,
+               image_probing_model, image_quality_assessment_degradation,
+               image_quality_assessment_mos, image_reid_person,
+               image_restoration, image_semantic_segmentation,
                image_to_image_generation, image_to_image_translation,
                language_guided_video_summarization, movie_scene_segmentation,
                object_detection, panorama_depth_estimation,
                pointcloud_sceneflow_estimation, product_retrieval_embedding,
-               realtime_object_detection, referring_video_object_segmentation,
+               referring_video_object_segmentation,
                robust_image_classification, salient_detection,
-               shop_segmentation, super_resolution, video_frame_interpolation,
-               video_object_segmentation, video_single_object_tracking,
-               video_stabilization, video_summarization,
-               video_super_resolution, virual_tryon, vision_middleware,
-               vop_retrieval)
+               shop_segmentation, stream_yolo, super_resolution,
+               video_deinterlace, video_frame_interpolation,
+               video_object_segmentation, video_panoptic_segmentation,
+               video_single_object_tracking, video_stabilization,
+               video_summarization, video_super_resolution, virual_tryon,
+               vision_middleware, vop_retrieval)
 
 # yapf: enable
diff --git a/modelscope/models/cv/abnormal_object_detection/__init__.py b/modelscope/models/cv/abnormal_object_detection/__init__.py
new file mode 100644
index 00000000..2a40a349
--- /dev/null
+++ b/modelscope/models/cv/abnormal_object_detection/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .mmdet_model import AbnormalDetectionModel
+
+else:
+    _import_structure = {'mmdet_model': ['AbnormalDetectionModel']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/abnormal_object_detection/mmdet_model.py b/modelscope/models/cv/abnormal_object_detection/mmdet_model.py
new file mode 100644
index 00000000..d341b13a
--- /dev/null
+++ b/modelscope/models/cv/abnormal_object_detection/mmdet_model.py
@@ -0,0 +1,103 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+
+import numpy as np
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from .mmdet_ms import MaskScoringNRoIHead, SingleRoINExtractor
+
+
+@MODELS.register_module(
+    Tasks.image_object_detection, module_name=Models.mask_scoring)
+class AbnormalDetectionModel(TorchModel):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """str -- model file root."""
+        super().__init__(model_dir, *args, **kwargs)
+
+        from mmcv.runner import load_checkpoint
+        from mmdet.datasets import replace_ImageToTensor
+        from mmdet.datasets.pipelines import Compose
+        from mmdet.models import build_detector
+
+        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+        config_path = osp.join(model_dir, 'mmcv_config.py')
+        config = Config.from_file(config_path)
+        config.model.pretrained = None
+        self.model = build_detector(
+            config.model, test_cfg=config.get('test_cfg'))
+
+        checkpoint = load_checkpoint(
+            self.model, model_path, map_location='cpu')
+        self.class_names = checkpoint['meta']['CLASSES']
+        config.test_pipeline[0].type = 'LoadImageFromWebcam'
+        self.transform_input = Compose(
+            replace_ImageToTensor(config.test_pipeline))
+        self.model.cfg = config
+        self.model.eval()
+        self.score_thr = config.score_thr
+
+    def inference(self, data):
+        """data is dict,contain img and img_metas,follow with mmdet.
+        Args:
+            imgs (List[Tensor]): the outer list indicates test-time
+                augmentations and inner Tensor should have a shape NxCxHxW,
+                which contains all images in the batch.
+            img_metas (List[List[dict]]): the outer list indicates test-time
+                augs (multiscale, flip, etc.) and the inner list indicates
+                images in a batch.
+        """
+
+        with torch.no_grad():
+            results = self.model(
+                return_loss=False,
+                rescale=True,
+                img=data['img'],
+                img_metas=data['img_metas'])
+        return results
+
+    def preprocess(self, image):
+        """image is numpy return is dict contain img and img_metas,follow with mmdet."""
+
+        from mmcv.parallel import collate, scatter
+        data = dict(img=image)
+        data = self.transform_input(data)
+        data = collate([data], samples_per_gpu=1)
+        data['img_metas'] = [
+            img_metas.data[0] for img_metas in data['img_metas']
+        ]
+        data['img'] = [img.data[0] for img in data['img']]
+
+        if next(self.model.parameters()).is_cuda:
+            data = scatter(data, [next(self.model.parameters()).device])[0]
+
+        return data
+
+    def postprocess(self, inputs):
+
+        if isinstance(inputs[0], tuple):
+            bbox_result, _ = inputs[0]
+        else:
+            bbox_result, _ = inputs[0], None
+        labels = [
+            np.full(bbox.shape[0], i, dtype=np.int32)
+            for i, bbox in enumerate(bbox_result)
+        ]
+        labels = np.concatenate(labels)
+
+        bbox_result = np.vstack(bbox_result)
+        scores = bbox_result[:, -1]
+        inds = scores > self.score_thr
+        if np.sum(np.array(inds).astype('int')) == 0:
+            return None, None, None
+        bboxes = bbox_result[inds, :]
+        labels = labels[inds]
+        scores = np.around(bboxes[:, 4], 6)
+        bboxes = (bboxes[:, 0:4]).astype(int)
+        labels = [self.class_names[i_label] for i_label in labels]
+        return bboxes, scores, labels
diff --git a/modelscope/models/cv/abnormal_object_detection/mmdet_ms/__init__.py b/modelscope/models/cv/abnormal_object_detection/mmdet_ms/__init__.py
new file mode 100644
index 00000000..d51550b7
--- /dev/null
+++ b/modelscope/models/cv/abnormal_object_detection/mmdet_ms/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .roi_head import MaskScoringNRoIHead, SingleRoINExtractor
diff --git a/modelscope/models/cv/abnormal_object_detection/mmdet_ms/roi_head/__init__.py b/modelscope/models/cv/abnormal_object_detection/mmdet_ms/roi_head/__init__.py
new file mode 100644
index 00000000..0b6b0975
--- /dev/null
+++ b/modelscope/models/cv/abnormal_object_detection/mmdet_ms/roi_head/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .mask_scoring_roi_head import MaskScoringNRoIHead
+from .roi_extractors import SingleRoINExtractor
+
+__all__ = ['MaskScoringNRoIHead', 'SingleRoINExtractor']
diff --git a/modelscope/models/cv/abnormal_object_detection/mmdet_ms/roi_head/mask_scoring_roi_head.py b/modelscope/models/cv/abnormal_object_detection/mmdet_ms/roi_head/mask_scoring_roi_head.py
new file mode 100644
index 00000000..60726d80
--- /dev/null
+++ b/modelscope/models/cv/abnormal_object_detection/mmdet_ms/roi_head/mask_scoring_roi_head.py
@@ -0,0 +1,138 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Implementation in this file is modified based on mmdetection
+# Originally Apache 2.0 License and publicly avaialbe at https://github.com/open-mmlab/mmdetection
+import torch
+from mmdet.core import bbox2roi
+from mmdet.models.builder import HEADS, build_head
+from mmdet.models.roi_heads.standard_roi_head import StandardRoIHead
+
+
+@HEADS.register_module()
+class MaskScoringNRoIHead(StandardRoIHead):
+    """Mask Scoring RoIHead for Mask Scoring RCNN.
+
+    https://arxiv.org/abs/1903.00241
+    """
+
+    def __init__(self,
+                 mask_iou_head=None,
+                 reg_roi_scale_factor=None,
+                 **kwargs):
+        # assert mask_iou_head is not None
+        super(MaskScoringNRoIHead, self).__init__(**kwargs)
+        if mask_iou_head is not None:
+            self.mask_iou_head = build_head(mask_iou_head)
+        self.reg_roi_scale_factor = reg_roi_scale_factor
+
+    def _bbox_forward(self, x, rois):
+        """Box head forward function used in both training and testing time."""
+        bbox_cls_feats = self.bbox_roi_extractor(
+            x[:self.bbox_roi_extractor.num_inputs], rois)
+        bbox_reg_feats = self.bbox_roi_extractor(
+            x[:self.bbox_roi_extractor.num_inputs],
+            rois,
+            roi_scale_factor=self.reg_roi_scale_factor)
+        if self.with_shared_head:
+            bbox_cls_feats = self.shared_head(bbox_cls_feats)
+            bbox_reg_feats = self.shared_head(bbox_reg_feats)
+        cls_score, bbox_pred = self.bbox_head(bbox_cls_feats, bbox_reg_feats)
+
+        bbox_results = dict(
+            cls_score=cls_score,
+            bbox_pred=bbox_pred,
+            bbox_feats=bbox_cls_feats)
+        return bbox_results
+
+    def _mask_forward_train(self, x, sampling_results, bbox_feats, gt_masks,
+                            img_metas):
+        """Run forward function and calculate loss for Mask head in
+        training."""
+        pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results])
+        mask_results = super(MaskScoringNRoIHead,
+                             self)._mask_forward_train(x, sampling_results,
+                                                       bbox_feats, gt_masks,
+                                                       img_metas)
+        if mask_results['loss_mask'] is None:
+            return mask_results
+
+        # mask iou head forward and loss
+        pos_mask_pred = mask_results['mask_pred'][
+            range(mask_results['mask_pred'].size(0)), pos_labels]
+        mask_iou_pred = self.mask_iou_head(mask_results['mask_feats'],
+                                           pos_mask_pred)
+        pos_mask_iou_pred = mask_iou_pred[range(mask_iou_pred.size(0)),
+                                          pos_labels]
+
+        mask_iou_targets = self.mask_iou_head.get_targets(
+            sampling_results, gt_masks, pos_mask_pred,
+            mask_results['mask_targets'], self.train_cfg)
+        loss_mask_iou = self.mask_iou_head.loss(pos_mask_iou_pred,
+                                                mask_iou_targets)
+        mask_results['loss_mask'].update(loss_mask_iou)
+        return mask_results
+
+    def simple_test_mask(self,
+                         x,
+                         img_metas,
+                         det_bboxes,
+                         det_labels,
+                         rescale=False):
+        """Obtain mask prediction without augmentation."""
+        # image shapes of images in the batch
+        ori_shapes = tuple(meta['ori_shape'] for meta in img_metas)
+        scale_factors = tuple(meta['scale_factor'] for meta in img_metas)
+
+        num_imgs = len(det_bboxes)
+        if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes):
+            num_classes = self.mask_head.num_classes
+            segm_results = [[[] for _ in range(num_classes)]
+                            for _ in range(num_imgs)]
+            mask_scores = [[[] for _ in range(num_classes)]
+                           for _ in range(num_imgs)]
+        else:
+            # if det_bboxes is rescaled to the original image size, we need to
+            # rescale it back to the testing scale to obtain RoIs.
+            if rescale and not isinstance(scale_factors[0], float):
+                scale_factors = [
+                    torch.from_numpy(scale_factor).to(det_bboxes[0].device)
+                    for scale_factor in scale_factors
+                ]
+            _bboxes = [
+                det_bboxes[i][:, :4]
+                * scale_factors[i] if rescale else det_bboxes[i]
+                for i in range(num_imgs)
+            ]
+            mask_rois = bbox2roi(_bboxes)
+            mask_results = self._mask_forward(x, mask_rois)
+            concat_det_labels = torch.cat(det_labels)
+            # get mask scores with mask iou head
+            mask_feats = mask_results['mask_feats']
+            mask_pred = mask_results['mask_pred']
+            mask_iou_pred = self.mask_iou_head(
+                mask_feats, mask_pred[range(concat_det_labels.size(0)),
+                                      concat_det_labels])
+            # split batch mask prediction back to each image
+            num_bboxes_per_img = tuple(len(_bbox) for _bbox in _bboxes)
+            mask_preds = mask_pred.split(num_bboxes_per_img, 0)
+            mask_iou_preds = mask_iou_pred.split(num_bboxes_per_img, 0)
+
+            # apply mask post-processing to each image individually
+            segm_results = []
+            mask_scores = []
+            for i in range(num_imgs):
+                if det_bboxes[i].shape[0] == 0:
+                    segm_results.append(
+                        [[] for _ in range(self.mask_head.num_classes)])
+                    mask_scores.append(
+                        [[] for _ in range(self.mask_head.num_classes)])
+                else:
+                    segm_result = self.mask_head.get_seg_masks(
+                        mask_preds[i], _bboxes[i], det_labels[i],
+                        self.test_cfg, ori_shapes[i], scale_factors[i],
+                        rescale)
+                    # get mask scores with mask iou head
+                    mask_score = self.mask_iou_head.get_mask_scores(
+                        mask_iou_preds[i], det_bboxes[i], det_labels[i])
+                    segm_results.append(segm_result)
+                    mask_scores.append(mask_score)
+        return list(zip(segm_results, mask_scores))
diff --git a/modelscope/models/cv/abnormal_object_detection/mmdet_ms/roi_head/roi_extractors/__init__.py b/modelscope/models/cv/abnormal_object_detection/mmdet_ms/roi_head/roi_extractors/__init__.py
new file mode 100644
index 00000000..7dac4ac6
--- /dev/null
+++ b/modelscope/models/cv/abnormal_object_detection/mmdet_ms/roi_head/roi_extractors/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .single_level_roi_extractor import SingleRoINExtractor
+
+__all__ = ['SingleRoINExtractor']
diff --git a/modelscope/models/cv/abnormal_object_detection/mmdet_ms/roi_head/roi_extractors/single_level_roi_extractor.py b/modelscope/models/cv/abnormal_object_detection/mmdet_ms/roi_head/roi_extractors/single_level_roi_extractor.py
new file mode 100644
index 00000000..e9b2c0f2
--- /dev/null
+++ b/modelscope/models/cv/abnormal_object_detection/mmdet_ms/roi_head/roi_extractors/single_level_roi_extractor.py
@@ -0,0 +1,153 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Implementation in this file is modified based on mmdetection
+# Originally Apache 2.0 License and publicly avaialbe at https://github.com/open-mmlab/mmdetection
+import torch
+from mmcv.runner import force_fp32
+from mmdet.models.builder import ROI_EXTRACTORS
+from mmdet.models.roi_heads.roi_extractors.base_roi_extractor import \
+    BaseRoIExtractor
+
+
+@ROI_EXTRACTORS.register_module()
+class SingleRoINExtractor(BaseRoIExtractor):
+    """Extract RoI features from a single level feature map.
+
+    If there are multiple input feature levels, each RoI is mapped to a level
+    according to its scale. The mapping rule is proposed in
+    `FPN <https://arxiv.org/abs/1612.03144>`_.
+
+    Args:
+        roi_layer (dict): Specify RoI layer type and arguments.
+        out_channels (int): Output channels of RoI layers.
+        featmap_strides (List[int]): Strides of input feature maps.
+        finest_scale (int): Scale threshold of mapping to level 0. Default: 56.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 roi_layer,
+                 out_channels,
+                 featmap_strides,
+                 finest_scale=56,
+                 init_cfg=None,
+                 gc_context=False,
+                 offset_feature=False):
+        super(SingleRoINExtractor, self).__init__(roi_layer, out_channels,
+                                                  featmap_strides, init_cfg)
+        self.finest_scale = finest_scale
+        self.gc_context = gc_context
+        self.offset_feature = offset_feature
+        self.pool = torch.nn.AdaptiveAvgPool2d(7)
+
+    def map_roi_levels(self, rois, num_levels):
+        """Map rois to corresponding feature levels by scales.
+
+        - scale < finest_scale * 2: level 0
+        - finest_scale * 2 <= scale < finest_scale * 4: level 1
+        - finest_scale * 4 <= scale < finest_scale * 8: level 2
+        - scale >= finest_scale * 8: level 3
+
+        Args:
+            rois (Tensor): Input RoIs, shape (k, 5).
+            num_levels (int): Total level number.
+
+        Returns:
+            Tensor: Level index (0-based) of each RoI, shape (k, )
+        """
+        a = rois[:, 3] - rois[:, 1]
+        b = rois[:, 4] - rois[:, 2]
+        scale = torch.sqrt(a * b)
+        target_lvls = torch.floor(torch.log2(scale / self.finest_scale + 1e-6))
+        target_lvls = target_lvls.clamp(min=0, max=num_levels - 1).long()
+        return target_lvls
+
+    @force_fp32(apply_to=('feats', ), out_fp16=True)
+    def forward(self, feats, rois, roi_scale_factor=None):
+        """Forward function."""
+        out_size = self.roi_layers[0].output_size
+        num_levels = len(feats)
+        expand_dims = (-1, self.out_channels * out_size[0] * out_size[1])
+        if torch.onnx.is_in_onnx_export():
+            # Work around to export mask-rcnn to onnx
+            roi_feats = rois[:, :1].clone().detach()
+            roi_feats = roi_feats.expand(*expand_dims)
+            roi_feats = roi_feats.reshape(-1, self.out_channels, *out_size)
+            roi_feats = roi_feats * 0
+        else:
+            roi_feats = feats[0].new_zeros(
+                rois.size(0), self.out_channels, *out_size)
+        # TODO: remove this when parrots supports
+        if torch.__version__ == 'parrots':
+            roi_feats.requires_grad = True
+
+        if num_levels == 1:
+            if len(rois) == 0:
+                return roi_feats
+            return self.roi_layers[0](feats[0], rois)
+
+        if self.gc_context:
+            context = []
+            for feat in feats:
+                context.append(self.pool(feat))
+
+        batch_size = feats[0].shape[0]
+        target_lvls = self.map_roi_levels(rois, num_levels)
+
+        if roi_scale_factor is not None:
+            rois = self.roi_rescale(rois, roi_scale_factor)
+
+        for i in range(num_levels):
+            mask = target_lvls == i
+            if torch.onnx.is_in_onnx_export():
+                # To keep all roi_align nodes exported to onnx
+                # and skip nonzero op
+                mask = mask.float().unsqueeze(-1)
+                # select target level rois and reset the rest rois to zero.
+                rois_i = rois.clone().detach()
+                rois_i *= mask
+                mask_exp = mask.expand(*expand_dims).reshape(roi_feats.shape)
+                roi_feats_t = self.roi_layers[i](feats[i], rois_i)
+                roi_feats_t *= mask_exp
+                roi_feats += roi_feats_t
+                continue
+            inds = mask.nonzero(as_tuple=False).squeeze(1)
+            if inds.numel() > 0:
+                rois_ = rois[inds]
+                # todo offset
+                rois_offset = rois[inds]
+                offset = torch.zeros(rois_.size(0), 5)
+                _, _, x_max, y_max = rois_[:, 1].min().item(), rois_[:, 2].min(
+                ).item(), rois_[:, 3].max().item(), rois_[:, 4].max().item()
+                offset[:, 1:3] = -100 * torch.ones(rois_.size(0), 1)
+                offset[:, 3:5] = 100 * torch.ones(rois_.size(0), 1)
+                rois_offset += offset.cuda()
+                rois_offset_thsxy = torch.clamp(rois_offset[:, 1:3], min=0.)
+                rois_offset_ths_xmax = torch.clamp(
+                    rois_offset[:, 3], max=x_max)
+                rois_offset_ths_ymax = torch.clamp(
+                    rois_offset[:, 4], max=y_max)
+                rois_offset[:, 1:3] = rois_offset_thsxy
+                rois_offset[:,
+                            3], rois_offset[:,
+                                            4] = rois_offset_ths_xmax, rois_offset_ths_ymax
+                roi_feats_t = self.roi_layers[i](feats[i], rois_)
+                roi_feats_t_offset = self.roi_layers[i](feats[i], rois_offset)
+                if self.gc_context:
+                    for j in range(batch_size):
+                        roi_feats_t[rois_[:, 0] == j] += context[i][j]
+                elif self.offset_feature:
+                    roi_feats_t += roi_feats_t_offset
+
+                roi_feats[inds] = roi_feats_t
+            else:
+                # Sometimes some pyramid levels will not be used for RoI
+                # feature extraction and this will cause an incomplete
+                # computation graph in one GPU, which is different from those
+                # in other GPUs and will cause a hanging error.
+                # Therefore, we add it to ensure each feature pyramid is
+                # included in the computation graph to avoid runtime bugs.
+                roi_feats += sum(
+                    x.view(-1)[0]
+                    for x in self.parameters()) * 0. + feats[i].sum() * 0.
+        return roi_feats
diff --git a/modelscope/models/cv/bad_image_detecting/__init__.py b/modelscope/models/cv/bad_image_detecting/__init__.py
new file mode 100644
index 00000000..8fb20550
--- /dev/null
+++ b/modelscope/models/cv/bad_image_detecting/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .bad_image_detecting import BadImageDetecting
+
+else:
+    _import_structure = {'bad_image_detecting': ['BadImageDetecting']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/bad_image_detecting/bad_image_detecting.py b/modelscope/models/cv/bad_image_detecting/bad_image_detecting.py
new file mode 100644
index 00000000..f8cb866c
--- /dev/null
+++ b/modelscope/models/cv/bad_image_detecting/bad_image_detecting.py
@@ -0,0 +1,80 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import Any, Dict, Union
+
+import numpy as np
+import torch.cuda
+from torchvision import models
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['BadImageDetecting']
+
+
+@MODELS.register_module(
+    Tasks.bad_image_detecting, module_name=Models.bad_image_detecting)
+class BadImageDetecting(TorchModel):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the image_quality_assessment_mos model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        self.model_dir = model_dir
+        self.config = Config.from_file(
+            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
+        model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+
+        self.model = models.mobilenet_v2(
+            pretrained=False, width_mult=0.35, num_classes=3)
+        self.model = self._load_pretrained(self.model, model_path)
+        self.model.eval()
+
+    def _train_forward(self, input: Tensor,
+                       target: Tensor) -> Dict[str, Tensor]:
+        losses = dict()
+        return losses
+
+    def _inference_forward(self, input: Tensor) -> Dict[str, Tensor]:
+
+        ret = self.model(input)
+
+        return {'output': ret}
+
+    def _evaluate_postprocess(self, input: Tensor,
+                              target: Tensor) -> Dict[str, list]:
+        torch.cuda.empty_cache()
+        with torch.no_grad():
+            preds = self.model(input)
+            _, pred_ = torch.max(preds, dim=1)
+        del input
+        torch.cuda.empty_cache()
+        return {'pred': pred_, 'target': target}
+
+    def forward(self, inputs: Dict[str,
+                                   Tensor]) -> Dict[str, Union[list, Tensor]]:
+        """return the result by the model
+
+        Args:
+            inputs (Tensor): the preprocessed data
+
+        Returns:
+            Dict[str, Tensor]: results
+        """
+        if self.training:
+            return self._train_forward(**inputs)
+        elif 'target' in inputs:
+            return self._evaluate_postprocess(**inputs)
+        else:
+            return self._inference_forward(**inputs)
diff --git a/modelscope/models/cv/body_3d_keypoints/__init__.py b/modelscope/models/cv/body_3d_keypoints/__init__.py
index 4bb83936..2672ba9a 100644
--- a/modelscope/models/cv/body_3d_keypoints/__init__.py
+++ b/modelscope/models/cv/body_3d_keypoints/__init__.py
@@ -4,12 +4,12 @@ from typing import TYPE_CHECKING
 from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
-
-    from .body_3d_pose import BodyKeypointsDetection3D
-
+    from .cannonical_pose import BodyKeypointsDetection3D
+    from .hdformer import HDFormerDetector
 else:
     _import_structure = {
-        'body_3d_pose': ['BodyKeypointsDetection3D'],
+        'cannonical_pose': ['BodyKeypointsDetection3D'],
+        'hdformer': ['HDFormerDetector'],
     }
 
     import sys
diff --git a/modelscope/models/cv/body_3d_keypoints/cannonical_pose/__init__.py b/modelscope/models/cv/body_3d_keypoints/cannonical_pose/__init__.py
new file mode 100644
index 00000000..928d0a50
--- /dev/null
+++ b/modelscope/models/cv/body_3d_keypoints/cannonical_pose/__init__.py
@@ -0,0 +1 @@
+from .body_3d_pose import BodyKeypointsDetection3D
diff --git a/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py b/modelscope/models/cv/body_3d_keypoints/cannonical_pose/body_3d_pose.py
similarity index 98%
rename from modelscope/models/cv/body_3d_keypoints/body_3d_pose.py
rename to modelscope/models/cv/body_3d_keypoints/cannonical_pose/body_3d_pose.py
index 6bedf2f3..3205ee95 100644
--- a/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py
+++ b/modelscope/models/cv/body_3d_keypoints/cannonical_pose/body_3d_pose.py
@@ -10,7 +10,7 @@ import torch
 from modelscope.metainfo import Models
 from modelscope.models.base.base_torch_model import TorchModel
 from modelscope.models.builder import MODELS
-from modelscope.models.cv.body_3d_keypoints.canonical_pose_modules import (
+from modelscope.models.cv.body_3d_keypoints.cannonical_pose.canonical_pose_modules import (
     TemporalModel, TransCan3Dkeys)
 from modelscope.utils.config import Config
 from modelscope.utils.constant import ModelFile, Tasks
diff --git a/modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py b/modelscope/models/cv/body_3d_keypoints/cannonical_pose/canonical_pose_modules.py
similarity index 100%
rename from modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py
rename to modelscope/models/cv/body_3d_keypoints/cannonical_pose/canonical_pose_modules.py
diff --git a/modelscope/models/cv/body_3d_keypoints/hdformer/__init__.py b/modelscope/models/cv/body_3d_keypoints/hdformer/__init__.py
new file mode 100644
index 00000000..d9e40d12
--- /dev/null
+++ b/modelscope/models/cv/body_3d_keypoints/hdformer/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .hdformer_detector import HDFormerDetector
diff --git a/modelscope/models/cv/body_3d_keypoints/hdformer/backbone.py b/modelscope/models/cv/body_3d_keypoints/hdformer/backbone.py
new file mode 100644
index 00000000..7850eda8
--- /dev/null
+++ b/modelscope/models/cv/body_3d_keypoints/hdformer/backbone.py
@@ -0,0 +1,306 @@
+# --------------------------------------------------------
+# The implementation is also open-sourced by the authors as Hanyuan Chen, and is available publicly on
+# https://github.com/hyer/HDFormer
+# --------------------------------------------------------
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from modelscope.models.cv.body_3d_keypoints.hdformer.block import \
+    HightOrderAttentionBlock
+from modelscope.models.cv.body_3d_keypoints.hdformer.directed_graph import (
+    DiGraph, Graph)
+from modelscope.models.cv.body_3d_keypoints.hdformer.skeleton import \
+    get_skeleton
+
+
+class HDFormerNet(nn.Module):
+
+    def __init__(self, cfg):
+        super(HDFormerNet, self).__init__()
+        in_channels = cfg.in_channels
+        dropout = cfg.dropout
+        self.cfg = cfg
+        self.PLANES = [16, 32, 64, 128, 256]
+
+        # load graph
+        skeleton = get_skeleton()
+        self.di_graph = DiGraph(skeleton=skeleton)
+        self.graph = Graph(
+            skeleton=skeleton, strategy='agcn', max_hop=1, dilation=1)
+        self.A = torch.tensor(
+            self.graph.A,
+            dtype=torch.float32,
+            requires_grad=True,
+            device='cuda')
+
+        # build networks
+        spatial_kernel_size = self.A.size(0)
+        temporal_kernel_size = 9
+        kernel_size = (temporal_kernel_size, spatial_kernel_size)
+
+        if not cfg.data_bn:
+            self.data_bn = None
+        else:
+            n_joints = self.cfg.IN_NUM_JOINTS \
+                if hasattr(self.cfg, 'IN_NUM_JOINTS') \
+                else self.cfg.n_joints
+            self.data_bn = nn.BatchNorm1d(in_channels * n_joints) if hasattr(cfg, 'PJN') and cfg.PJN \
+                else nn.BatchNorm2d(in_channels)
+
+        self.downsample = nn.ModuleList(
+            (
+                HightOrderAttentionBlock(
+                    in_channels,
+                    self.PLANES[0],
+                    kernel_size,
+                    A=self.A,
+                    di_graph=self.di_graph,
+                    residual=False,
+                    adj_len=self.A.size(1),
+                    attention=cfg.attention_down if hasattr(
+                        cfg, 'attention_down') else False,
+                    dropout=0),
+                HightOrderAttentionBlock(
+                    self.PLANES[0],
+                    self.PLANES[1],
+                    kernel_size,
+                    A=self.A,
+                    di_graph=self.di_graph,
+                    stride=2,
+                    adj_len=self.A.size(1),
+                    attention=cfg.attention_down if hasattr(
+                        cfg, 'attention_down') else False,
+                    dropout=0),
+                HightOrderAttentionBlock(
+                    self.PLANES[1],
+                    self.PLANES[1],
+                    kernel_size,
+                    A=self.A,
+                    di_graph=self.di_graph,
+                    adj_len=self.A.size(1),
+                    attention=cfg.attention_down if hasattr(
+                        cfg, 'attention_down') else False,
+                    dropout=0),
+                HightOrderAttentionBlock(
+                    self.PLANES[1],
+                    self.PLANES[2],
+                    kernel_size,
+                    A=self.A,
+                    di_graph=self.di_graph,
+                    stride=2,
+                    adj_len=self.A.size(1),
+                    attention=cfg.attention_down if hasattr(
+                        cfg, 'attention_down') else False,
+                    dropout=0),
+                HightOrderAttentionBlock(
+                    self.PLANES[2],
+                    self.PLANES[2],
+                    kernel_size,
+                    A=self.A,
+                    di_graph=self.di_graph,
+                    adj_len=self.A.size(1),
+                    attention=cfg.attention_down if hasattr(
+                        cfg, 'attention_down') else False,
+                    dropout=0),
+                HightOrderAttentionBlock(
+                    self.PLANES[2],
+                    self.PLANES[3],
+                    kernel_size,
+                    A=self.A,
+                    di_graph=self.di_graph,
+                    stride=2,
+                    adj_len=self.A.size(1),
+                    attention=cfg.attention_down if hasattr(
+                        cfg, 'attention_down') else False,
+                    dropout=dropout),
+                HightOrderAttentionBlock(
+                    self.PLANES[3],
+                    self.PLANES[3],
+                    kernel_size,
+                    A=self.A,
+                    di_graph=self.di_graph,
+                    adj_len=self.A.size(1),
+                    attention=cfg.attention_down if hasattr(
+                        cfg, 'attention_down') else False,
+                    dropout=dropout),
+                HightOrderAttentionBlock(
+                    self.PLANES[3],
+                    self.PLANES[4],
+                    kernel_size,
+                    A=self.A,
+                    di_graph=self.di_graph,
+                    stride=2,
+                    adj_len=self.A.size(1),
+                    attention=cfg.attention_down if hasattr(
+                        cfg, 'attention_down') else False,
+                    dropout=dropout),
+                HightOrderAttentionBlock(
+                    self.PLANES[4],
+                    self.PLANES[4],
+                    kernel_size,
+                    A=self.A,
+                    di_graph=self.di_graph,
+                    adj_len=self.A.size(1),
+                    attention=cfg.attention_down if hasattr(
+                        cfg, 'attention_down') else False,
+                    dropout=dropout),
+            ))
+
+        self.upsample = nn.ModuleList((
+            HightOrderAttentionBlock(
+                self.PLANES[4],
+                self.PLANES[3],
+                kernel_size,
+                A=self.A,
+                di_graph=self.di_graph,
+                attention=cfg.attention_up
+                if hasattr(cfg, 'attention_up') else False,
+                adj_len=self.A.size(1),
+                dropout=dropout),
+            HightOrderAttentionBlock(
+                self.PLANES[3],
+                self.PLANES[2],
+                kernel_size,
+                A=self.A,
+                di_graph=self.di_graph,
+                attention=cfg.attention_up
+                if hasattr(cfg, 'attention_up') else False,
+                adj_len=self.A.size(1),
+                dropout=dropout),
+            HightOrderAttentionBlock(
+                self.PLANES[2],
+                self.PLANES[1],
+                kernel_size,
+                A=self.A,
+                di_graph=self.di_graph,
+                attention=cfg.attention_up
+                if hasattr(cfg, 'attention_up') else False,
+                adj_len=self.A.size(1),
+                dropout=0),
+            HightOrderAttentionBlock(
+                self.PLANES[1],
+                self.PLANES[0],
+                kernel_size,
+                A=self.A,
+                di_graph=self.di_graph,
+                attention=cfg.attention_up
+                if hasattr(cfg, 'attention_up') else False,
+                adj_len=self.A.size(1),
+                dropout=0),
+        ))
+
+        self.merge = nn.ModuleList((
+            HightOrderAttentionBlock(
+                self.PLANES[4],
+                self.PLANES[0],
+                kernel_size,
+                A=self.A,
+                di_graph=self.di_graph,
+                attention=cfg.attention_merge if hasattr(
+                    cfg, 'attention_merge') else False,
+                adj_len=self.A.size(1),
+                dropout=dropout,
+                max_hop=self.cfg.max_hop),
+            HightOrderAttentionBlock(
+                self.PLANES[3],
+                self.PLANES[0],
+                kernel_size,
+                A=self.A,
+                di_graph=self.di_graph,
+                attention=cfg.attention_merge if hasattr(
+                    cfg, 'attention_merge') else False,
+                adj_len=self.A.size(1),
+                dropout=dropout,
+                max_hop=self.cfg.max_hop),
+            HightOrderAttentionBlock(
+                self.PLANES[2],
+                self.PLANES[0],
+                kernel_size,
+                A=self.A,
+                di_graph=self.di_graph,
+                attention=cfg.attention_merge if hasattr(
+                    cfg, 'attention_merge') else False,
+                adj_len=self.A.size(1),
+                dropout=0,
+                max_hop=self.cfg.max_hop),
+            HightOrderAttentionBlock(
+                self.PLANES[1],
+                self.PLANES[0],
+                kernel_size,
+                A=self.A,
+                di_graph=self.di_graph,
+                attention=cfg.attention_merge if hasattr(
+                    cfg, 'attention_merge') else False,
+                adj_len=self.A.size(1),
+                dropout=0,
+                max_hop=self.cfg.max_hop),
+        ))
+
+    def get_edge_fea(self, x_v):
+        x_e = (x_v[..., [c for p, c in self.di_graph.directed_edges_hop1]]
+               - x_v[..., [p for p, c in self.di_graph.directed_edges_hop1]]
+               ).contiguous()
+        N, C, T, V = x_v.shape
+        edeg_append = torch.zeros((N, C, T, 1), device=x_e.device)
+        x_e = torch.cat((x_e, edeg_append), dim=-1)
+        return x_e
+
+    def forward(self, x_v: torch.Tensor):
+        """
+        x: shape [B,C,T,V_v]
+        """
+        B, C, T, V = x_v.shape
+        # data normalization
+        if self.data_bn is not None:
+            if hasattr(self.cfg, 'PJN') and self.cfg.PJN:
+                x_v = self.data_bn(x_v.permute(0, 1, 3, 2).contiguous().view(B, -1, T)).view(B, C, V, T) \
+                    .contiguous().permute(0, 1, 3, 2)
+            else:
+                x_v = self.data_bn(x_v)
+
+        x_e = self.get_edge_fea(x_v)
+
+        # forward
+        feature = []
+        for idx, hoa_block in enumerate(self.downsample):
+            x_v, x_e = hoa_block(x_v, x_e)
+            if idx == 0 or idx == 2 or idx == 4 or idx == 6:
+                feature.append((x_v, x_e))
+
+        feature.append((x_v, x_e))
+        feature = feature[::-1]
+
+        x_v, x_e = feature[0]
+        identity_feature = feature[1:]
+
+        ushape_feature = []
+        ushape_feature.append((x_v, x_e))
+        for idx, (hoa_block, id) in \
+                enumerate(zip(self.upsample, identity_feature)):
+            x_v, x_e = hoa_block(x_v, x_e)
+            if hasattr(self.cfg, 'deterministic') and self.cfg.deterministic:
+                x_v = F.interpolate(x_v, scale_factor=(2, 1), mode='nearest')
+            else:
+                x_v = F.interpolate(
+                    x_v,
+                    scale_factor=(2, 1),
+                    mode='bilinear',
+                    align_corners=False)
+            x_v += id[0]
+            ushape_feature.append((x_v, x_e))
+
+        ushape_feature = ushape_feature[:-1]
+        for idx, (hoa_block, u) in \
+                enumerate(zip(self.merge, ushape_feature)):
+            x_v2, x_e2 = hoa_block(*u)
+            if hasattr(self.cfg, 'deterministic') and self.cfg.deterministic:
+                x_v += F.interpolate(
+                    x_v2, scale_factor=(2**(4 - idx), 1), mode='nearest')
+            else:
+                x_v += F.interpolate(
+                    x_v2,
+                    scale_factor=(2**(4 - idx), 1),
+                    mode='bilinear',
+                    align_corners=False)
+        return x_v, x_e
diff --git a/modelscope/models/cv/body_3d_keypoints/hdformer/block.py b/modelscope/models/cv/body_3d_keypoints/hdformer/block.py
new file mode 100644
index 00000000..5886658e
--- /dev/null
+++ b/modelscope/models/cv/body_3d_keypoints/hdformer/block.py
@@ -0,0 +1,380 @@
+# Part of the implementation is borrowed and modified from 2s-AGCN, publicly available at
+# https://github.com/lshiwjx/2s-AGCN
+import math
+
+import torch
+import torch.nn as nn
+from einops import rearrange
+
+
+def import_class(name):
+    components = name.split('.')
+    mod = __import__(components[0])
+    for comp in components[1:]:
+        mod = getattr(mod, comp)
+    return mod
+
+
+def conv_branch_init(conv, branches):
+    weight = conv.weight
+    n = weight.size(0)
+    k1 = weight.size(1)
+    k2 = weight.size(2)
+    nn.init.normal_(weight, 0, math.sqrt(2. / (n * k1 * k2 * branches)))
+    if conv.bias is not None:
+        nn.init.constant_(conv.bias, 0)
+
+
+def conv_init(conv):
+    if conv.weight is not None:
+        nn.init.kaiming_normal_(conv.weight, mode='fan_out')
+    if conv.bias is not None:
+        nn.init.constant_(conv.bias, 0)
+
+
+def bn_init(bn, scale):
+    nn.init.constant_(bn.weight, scale)
+    nn.init.constant_(bn.bias, 0)
+
+
+def zero(x):
+    """return zero."""
+    return 0
+
+
+def iden(x):
+    """return input itself."""
+    return x
+
+
+class Mlp(nn.Module):
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.,
+                 changedim=False,
+                 currentdim=0,
+                 depth=0):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+class Attention(nn.Module):
+
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 comb=False,
+                 vis=False):
+        """Attention is all you need
+
+        Args:
+            dim (_type_): _description_
+            num_heads (int, optional): _description_. Defaults to 8.
+            qkv_bias (bool, optional): _description_. Defaults to False.
+            qk_scale (_type_, optional): _description_. Defaults to None.
+            attn_drop (_type_, optional): _description_. Defaults to 0..
+            proj_drop (_type_, optional): _description_. Defaults to 0..
+            comb (bool, optional): Defaults to False.
+                True: q transpose * k.
+                False: q * k transpose.
+            vis (bool, optional): _description_. Defaults to False.
+        """
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
+        self.scale = qk_scale or head_dim**-0.5
+
+        self.to_q = nn.Linear(dim, dim, bias=qkv_bias)
+        self.to_k = nn.Linear(dim, dim, bias=qkv_bias)
+        self.to_v = nn.Linear(dim, dim, bias=qkv_bias)
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.comb = comb
+        self.vis = vis
+
+    def forward(self, fv, fe):
+        B, N, C = fv.shape
+        B, E, C = fe.shape
+        q = self.to_q(fv).reshape(B, N, self.num_heads,
+                                  C // self.num_heads).permute(0, 2, 1, 3)
+        k = self.to_k(fe).reshape(B, E, self.num_heads,
+                                  C // self.num_heads).permute(0, 2, 1, 3)
+        v = self.to_v(fe).reshape(B, E, self.num_heads,
+                                  C // self.num_heads).permute(0, 2, 1, 3)
+        # Now fv shape (B, H, N, C//heads)
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        if self.comb:
+            fv = (attn @ v.transpose(-2, -1)).transpose(-2, -1)
+            fv = rearrange(fv, 'B H N C -> B N (H C)')
+        elif self.comb is False:
+            fv = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        fv = self.proj(fv)
+        fv = self.proj_drop(fv)
+        return fv
+
+
+class FirstOrderAttention(nn.Module):
+    """First Order Attention block for spatial relationship.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 A,
+                 t_kernel_size=1,
+                 t_stride=1,
+                 t_padding=0,
+                 t_dilation=1,
+                 adj_len=17,
+                 bias=True):
+        super().__init__()
+
+        self.kernel_size = kernel_size
+        self.A = A
+        self.PA = nn.Parameter(torch.FloatTensor(3, adj_len, adj_len))
+        torch.nn.init.constant_(self.PA, 1e-6)
+
+        self.num_subset = 3
+        inter_channels = out_channels // 4
+        self.inter_c = inter_channels
+        self.conv_a = nn.ModuleList()
+        self.conv_b = nn.ModuleList()
+        self.conv_d = nn.ModuleList()
+        self.linears = nn.ModuleList()
+        for i in range(self.num_subset):
+            self.conv_a.append(nn.Conv2d(in_channels, inter_channels, 1))
+            self.conv_b.append(nn.Conv2d(in_channels, inter_channels, 1))
+            self.conv_d.append(nn.Conv2d(in_channels, out_channels, 1))
+            self.linears.append(nn.Linear(in_channels, in_channels))
+
+        if in_channels != out_channels:
+            self.down = nn.Sequential(
+                nn.Conv2d(in_channels, out_channels, 1),
+                nn.BatchNorm2d(out_channels))
+        else:
+            self.down = lambda x: x
+
+        self.bn = nn.BatchNorm2d(out_channels)
+        self.soft = nn.Softmax(-2)
+        self.relu = nn.ReLU()
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                conv_init(m)
+            elif isinstance(m, nn.BatchNorm2d):
+                bn_init(m, 1)
+        bn_init(self.bn, 1e-6)
+        for i in range(self.num_subset):
+            conv_branch_init(self.conv_d[i], self.num_subset)
+
+    def forward(self, x):
+        assert self.A.shape[0] == self.kernel_size[1]
+
+        N, C, T, V = x.size()
+        A = self.A + self.PA
+
+        y = None
+        for i in range(self.num_subset):
+            x_in = rearrange(x, 'N C T V -> N T V C')
+            x_in = self.linears[i](x_in)
+            A0 = rearrange(x_in, 'N T V C -> N (C T) V')
+
+            A1 = self.conv_a[i](x).permute(0, 3, 1, 2).contiguous().view(
+                N, V, self.inter_c * T)
+            A2 = self.conv_b[i](x).view(N, self.inter_c * T, V)
+            A1 = self.soft(torch.matmul(A1, A2) / A1.size(-1))
+            A1 = A1 + A[i]
+            z = self.conv_d[i](torch.matmul(A0, A1).view(N, C, T, V))
+            y = z + y if y is not None else z
+        y = self.bn(y)
+        y += self.down(x)
+
+        return self.relu(y)
+
+
+class HightOrderAttentionBlock(nn.Module):
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 A,
+                 di_graph,
+                 attention=False,
+                 stride=1,
+                 adj_len=17,
+                 dropout=0,
+                 residual=True,
+                 norm_layer=nn.BatchNorm2d,
+                 edge_importance=False,
+                 graph=None,
+                 conditional=False,
+                 experts=4,
+                 bias=True,
+                 share_tcn=False,
+                 max_hop=2):
+        super().__init__()
+
+        t_kernel_size = kernel_size[0]
+        assert t_kernel_size % 2 == 1
+        padding = ((t_kernel_size - 1) // 2, 0)
+        self.max_hop = max_hop
+        self.attention = attention
+        self.di_graph = di_graph
+
+        self.foa_block = FirstOrderAttention(
+            in_channels,
+            out_channels,
+            kernel_size,
+            A,
+            bias=bias,
+            adj_len=adj_len)
+
+        self.tcn_v = nn.Sequential(
+            norm_layer(out_channels),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(
+                out_channels,
+                out_channels, (t_kernel_size, 1), (stride, 1),
+                padding,
+                bias=bias),
+            norm_layer(out_channels),
+            nn.Dropout(dropout, inplace=True),
+        )
+
+        if not residual:
+            self.residual_v = zero
+        elif (in_channels == out_channels) and (stride == 1):
+            self.residual_v = iden
+        else:
+            self.residual_v = nn.Sequential(
+                nn.Conv2d(
+                    in_channels,
+                    out_channels,
+                    kernel_size=1,
+                    stride=(stride, 1),
+                    bias=bias),
+                norm_layer(out_channels),
+            )
+
+        self.relu = nn.ReLU(inplace=True)
+
+        if self.attention:
+            self.cross_attn = Attention(
+                dim=out_channels,
+                num_heads=8,
+                qkv_bias=True,
+                qk_scale=None,
+                attn_drop=dropout,
+                proj_drop=dropout)
+            self.norm_v = nn.LayerNorm(out_channels)
+            self.mlp = Mlp(
+                in_features=out_channels,
+                out_features=out_channels,
+                hidden_features=out_channels * 2,
+                act_layer=nn.GELU,
+                drop=dropout)
+            self.norm_mlp = nn.LayerNorm(out_channels)
+
+            # linear to change fep channels
+            self.linears = nn.ModuleList()
+            for hop_i in range(self.max_hop - 1):
+                hop_linear = nn.ModuleList()
+                for i in range(
+                        len(
+                            eval(f'self.di_graph.directed_edges_hop{hop_i+2}'))
+                ):
+                    hop_linear.append(nn.Linear(hop_i + 2, 1))
+                self.linears.append(hop_linear)
+
+    def forward(self, fv, fe):
+        # `fv` (node features) has shape (B, C, T, V_node)
+        # `fe` (edge features) has shape (B, C, T, V_edge)
+        N, C, T, V = fv.size()
+
+        res_v = self.residual_v(fv)
+
+        fvp = self.foa_block(fv)
+        fep_out = (
+            fvp[..., [c for p, c in self.di_graph.directed_edges_hop1]]
+            - fvp[..., [p for p, c in self.di_graph.directed_edges_hop1]]
+        ).contiguous()
+
+        if self.attention:
+            fep_concat = None
+            for hop_i in range(self.max_hop):
+                if 0 == hop_i:
+                    fep_hop_i = (fvp[..., [
+                        c for p, c in eval(
+                            f'self.di_graph.directed_edges_hop{hop_i+1}')
+                    ]] - fvp[..., [
+                        p for p, c in eval(
+                            f'self.di_graph.directed_edges_hop{hop_i+1}')
+                    ]]).contiguous()
+                    fep_hop_i = rearrange(fep_hop_i, 'N C T E -> (N T) E C')
+                else:
+                    joints_parts = eval(
+                        f'self.di_graph.directed_edges_hop{hop_i+1}')
+                    fep_hop_i = None
+                    for part_idx, part in enumerate(joints_parts):
+                        fep_part = None
+                        for j in range(len(part) - 1):
+                            fep = (fvp[..., part[j + 1]]
+                                   - fvp[..., part[j]]).contiguous().unsqueeze(
+                                       dim=-1)
+                            if fep_part is None:
+                                fep_part = fep
+                            else:
+                                fep_part = torch.cat((fep_part, fep), dim=-1)
+                        fep_part = self.linears[hop_i - 1][part_idx](fep_part)
+                        if fep_hop_i is None:
+                            fep_hop_i = fep_part
+                        else:
+                            fep_hop_i = torch.cat((fep_hop_i, fep_part),
+                                                  dim=-1)
+
+                    fep_hop_i = rearrange(fep_hop_i, 'N C T E -> (N T) E C')
+
+                if fep_concat is None:
+                    fep_concat = fep_hop_i
+                else:
+                    fep_concat = torch.cat((fep_concat, fep_hop_i),
+                                           dim=-2)  # dim=-2 represent edge dim
+            fvp = rearrange(fvp, 'N C T V -> (N T) V C')
+            fvp = self.norm_v(self.cross_attn(fvp, fep_concat)) + iden(fvp)
+            fvp = self.mlp(self.norm_mlp(fvp)) + iden(
+                fvp)  # make output joint number = adj_len
+            fvp = rearrange(fvp, '(N T) V C -> N C T V', N=N)
+
+        fvp = self.tcn_v(fvp) + res_v
+
+        return self.relu(fvp), fep_out
diff --git a/modelscope/models/cv/body_3d_keypoints/hdformer/directed_graph.py b/modelscope/models/cv/body_3d_keypoints/hdformer/directed_graph.py
new file mode 100644
index 00000000..3127cf1c
--- /dev/null
+++ b/modelscope/models/cv/body_3d_keypoints/hdformer/directed_graph.py
@@ -0,0 +1,209 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import sys
+from typing import List, Tuple
+
+import numpy as np
+
+sys.path.insert(0, './')
+
+
+def edge2mat(link, num_node):
+    """According to the directed edge link, the adjacency matrix is constructed.
+        link: [V, 2], each row is a tuple(start node, end node).
+    """
+    A = np.zeros((num_node, num_node))
+    for i, j in link:
+        A[j, i] = 1
+    return A
+
+
+def normalize_incidence_matrix(im: np.ndarray) -> np.ndarray:
+    Dl = im.sum(-1)
+    num_node = im.shape[0]
+    Dn = np.zeros((num_node, num_node))
+    for i in range(num_node):
+        if Dl[i] > 0:
+            Dn[i, i] = Dl[i]**(-1)
+    res = Dn @ im
+    return res
+
+
+def build_digraph_incidence_matrix(num_nodes: int,
+                                   edges: List[Tuple]) -> np.ndarray:
+    source_graph = np.zeros((num_nodes, len(edges)), dtype='float32')
+    target_graph = np.zeros((num_nodes, len(edges)), dtype='float32')
+    for edge_id, (source_node, target_node) in enumerate(edges):
+        source_graph[source_node, edge_id] = 1.
+        target_graph[target_node, edge_id] = 1.
+    source_graph = normalize_incidence_matrix(source_graph)
+    target_graph = normalize_incidence_matrix(target_graph)
+    return source_graph, target_graph
+
+
+class DiGraph():
+
+    def __init__(self, skeleton):
+        super().__init__()
+        self.num_nodes = len(skeleton.parents())
+        self.directed_edges_hop1 = [
+            (parrent, child)
+            for child, parrent in enumerate(skeleton.parents()) if parrent >= 0
+        ]
+        self.directed_edges_hop2 = [(0, 1, 2), (0, 4, 5), (0, 7, 8), (1, 2, 3),
+                                    (4, 5, 6), (7, 8, 9),
+                                    (7, 8, 11), (7, 8, 14), (8, 9, 10),
+                                    (8, 11, 12), (8, 14, 15), (11, 12, 13),
+                                    (14, 15, 16)]  # (parrent, child)
+        self.directed_edges_hop3 = [(0, 1, 2, 3), (0, 4, 5, 6), (0, 7, 8, 9),
+                                    (7, 8, 9, 10), (7, 8, 11, 12),
+                                    (7, 8, 14, 15), (8, 11, 12, 13),
+                                    (8, 14, 15, 16)]
+        self.directed_edges_hop4 = [(0, 7, 8, 9, 10), (0, 7, 8, 11, 12),
+                                    (0, 7, 8, 14, 15), (7, 8, 11, 12, 13),
+                                    (7, 8, 14, 15, 16)]
+
+        self.num_edges = len(self.directed_edges_hop1)
+        self.edge_left = [0, 1, 2, 10, 11, 12]
+        self.edge_right = [3, 4, 5, 13, 14, 15]
+        self.edge_middle = [6, 7, 8, 9]
+        self.center = 0  # for h36m data skeleton
+        # Incidence matrices
+        self.source_M, self.target_M = \
+            build_digraph_incidence_matrix(self.num_nodes, self.directed_edges_hop1)
+
+
+class Graph():
+    """ The Graph to model the skeletons extracted by the openpose
+    Args:
+        strategy (string): must be one of the follow candidates
+        - uniform: Uniform Labeling
+        - distance: Distance Partitioning
+        - spatial: Spatial Configuration
+        - agcn: AGCN Configuration
+        For more information, please refer to the section 'Partition Strategies'
+            in our paper (https://arxiv.org/abs/1801.07455).
+        layout (string): must be one of the follow candidates
+        - openpose: Is consists of 18 joints. For more information, please
+            refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose#output
+        - ntu-rgb+d: Is consists of 25 joints. For more information, please
+            refer to https://github.com/shahroudy/NTURGB-D
+        max_hop (int): the maximal distance between two connected nodes
+        dilation (int): controls the spacing between the kernel points
+    """
+
+    def __init__(self,
+                 skeleton=None,
+                 strategy='uniform',
+                 max_hop=1,
+                 dilation=1):
+        self.max_hop = max_hop
+        self.dilation = dilation
+
+        assert strategy in ['uniform', 'distance', 'spatial', 'agcn']
+        self.get_edge(skeleton)
+        self.hop_dis = get_hop_distance(
+            self.num_node, self.edge, max_hop=max_hop)
+        self.get_adjacency(strategy)
+
+    def __str__(self):
+        return self.A
+
+    def get_edge(self, skeleton):
+        # edge is a list of [child, parent] paris
+        self.num_node = len(skeleton.parents())
+        self_link = [(i, i) for i in range(self.num_node)]
+        neighbor_link = [(child, parrent)
+                         for child, parrent in enumerate(skeleton.parents())]
+        self.self_link = self_link
+        self.neighbor_link = neighbor_link
+        self.edge = self_link + neighbor_link
+        self.center = 0  # for h36m data skeleton, root node idx
+
+    def get_adjacency(self, strategy):
+        valid_hop = range(0, self.max_hop + 1, self.dilation)
+        adjacency = np.zeros((self.num_node, self.num_node))
+        for hop in valid_hop:
+            adjacency[self.hop_dis == hop] = 1
+        normalize_adjacency = normalize_digraph(adjacency)
+
+        if strategy == 'uniform':
+            A = np.zeros((1, self.num_node, self.num_node))
+            A[0] = normalize_adjacency
+            self.A = A
+        elif strategy == 'distance':
+            A = np.zeros((len(valid_hop), self.num_node, self.num_node))
+            for i, hop in enumerate(valid_hop):
+                A[i][self.hop_dis == hop] = \
+                    normalize_adjacency[self.hop_dis == hop]
+            self.A = A
+        elif strategy == 'spatial':
+            A = []
+            for hop in valid_hop:
+                a_root = np.zeros((self.num_node, self.num_node))
+                a_close = np.zeros((self.num_node, self.num_node))
+                a_further = np.zeros((self.num_node, self.num_node))
+                for i in range(self.num_node):
+                    for j in range(self.num_node):
+                        if self.hop_dis[j, i] == hop:
+                            if self.hop_dis[j, self.center] == self.hop_dis[
+                                    i, self.center]:
+                                a_root[j, i] = normalize_adjacency[j, i]
+                            elif self.hop_dis[j, self.center] > self.hop_dis[
+                                    i, self.center]:
+                                a_close[j, i] = normalize_adjacency[j, i]
+                            else:
+                                a_further[j, i] = normalize_adjacency[j, i]
+                if hop == 0:
+                    A.append(a_root)
+                else:
+                    A.append(a_root + a_close)
+                    A.append(a_further)
+            A = np.stack(A)
+            self.A = A
+        elif strategy == 'agcn':
+            A = []
+            link_mat = edge2mat(self.self_link, self.num_node)
+            In = normalize_digraph(edge2mat(self.neighbor_link, self.num_node))
+            outward = [(j, i) for (i, j) in self.neighbor_link]
+            Out = normalize_digraph(edge2mat(outward, self.num_node))
+            A = np.stack((link_mat, In, Out))
+            self.A = A
+        else:
+            raise ValueError('Do Not Exist This Strategy')
+
+
+def get_hop_distance(num_node, edge, max_hop=1):
+    A = np.zeros((num_node, num_node))
+    for i, j in edge:
+        A[j, i] = 1
+        A[i, j] = 1
+
+    # compute hop steps
+    hop_dis = np.zeros((num_node, num_node)) + np.inf
+    transfer_mat = [np.linalg.matrix_power(A, d) for d in range(max_hop + 1)]
+    arrive_mat = (np.stack(transfer_mat) > 0)
+    for d in range(max_hop, -1, -1):
+        hop_dis[arrive_mat[d]] = d
+    return hop_dis
+
+
+def normalize_digraph(A):
+    Dl = np.sum(A, 0)
+    num_node = A.shape[0]
+    Dn = np.zeros((num_node, num_node))
+    for i in range(num_node):
+        if Dl[i] > 0:
+            Dn[i, i] = Dl[i]**(-1)
+    AD = np.dot(A, Dn)
+    return AD
+
+
+def normalize_undigraph(A):
+    Dl = np.sum(A, 0)
+    num_node = A.shape[0]
+    Dn = np.zeros((num_node, num_node))
+    for i in range(num_node):
+        if Dl[i] > 0:
+            Dn[i, i] = Dl[i]**(-0.5)
+    DAD = np.dot(np.dot(Dn, A), Dn)
+    return DAD
diff --git a/modelscope/models/cv/body_3d_keypoints/hdformer/hdformer.py b/modelscope/models/cv/body_3d_keypoints/hdformer/hdformer.py
new file mode 100644
index 00000000..2873657e
--- /dev/null
+++ b/modelscope/models/cv/body_3d_keypoints/hdformer/hdformer.py
@@ -0,0 +1,64 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import torch
+import torch.nn as nn
+
+from modelscope.models.cv.body_3d_keypoints.hdformer.backbone import \
+    HDFormerNet
+
+
+class HDFormer(nn.Module):
+
+    def __init__(self, cfg):
+        super(HDFormer, self).__init__()
+        self.regress_with_edge = hasattr(
+            cfg, 'regress_with_edge') and cfg.regress_with_edge
+        self.backbone = HDFormerNet(cfg)
+        num_v, num_e = self.backbone.di_graph.source_M.shape
+        self.regressor_type = cfg.regressor_type if hasattr(
+            cfg, 'regressor_type') else 'conv'
+        if self.regressor_type == 'conv':
+            self.joint_regressor = nn.Conv2d(
+                self.backbone.PLANES[0],
+                3 * (num_v - 1),
+                kernel_size=(3, num_v + num_e) if self.regress_with_edge else
+                (3, num_v),
+                padding=(1, 0),
+                bias=True)
+        elif self.regressor_type == 'fc':
+            self.joint_regressor = nn.Conv1d(
+                self.backbone.PLANES[0] * (num_v + num_e)
+                if self.regress_with_edge else self.backbone.PLANES[0] * num_v,
+                3 * (num_v - 1),
+                kernel_size=3,
+                padding=1,
+                bias=True)
+        else:
+            raise NotImplementedError
+
+    def forward(self, x_v: torch.Tensor, mean_3d: torch.Tensor,
+                std_3d: torch.Tensor):
+        """
+        x: shape [B,C,T,V_v]
+        """
+        fv, fe = self.backbone(x_v)
+        B, C, T, V = fv.shape
+
+        if self.regressor_type == 'conv':
+            pre_joints = self.joint_regressor(torch.cat([
+                fv, fe
+            ], dim=-1)) if self.regress_with_edge else self.joint_regressor(fv)
+        elif self.regressor_type == 'fc':
+            x = (torch.cat([fv, fe], dim=-1) if self.regress_with_edge else fv) \
+                .permute(0, 1, 3, 2).contiguous().view(B, -1, T)
+            pre_joints = self.joint_regressor(x)
+        else:
+            raise NotImplementedError
+        pre_joints = pre_joints.view(B, 3, V - 1,
+                                     T).permute(0, 1, 3,
+                                                2).contiguous()  # [B,3,T,V-1]
+        root_node = torch.zeros((B, 3, T, 1),
+                                dtype=pre_joints.dtype,
+                                device=pre_joints.device)
+        pre_joints = torch.cat((root_node, pre_joints), dim=-1)
+        pre_joints = pre_joints * std_3d + mean_3d
+        return pre_joints
diff --git a/modelscope/models/cv/body_3d_keypoints/hdformer/hdformer_detector.py b/modelscope/models/cv/body_3d_keypoints/hdformer/hdformer_detector.py
new file mode 100644
index 00000000..73c9b4be
--- /dev/null
+++ b/modelscope/models/cv/body_3d_keypoints/hdformer/hdformer_detector.py
@@ -0,0 +1,196 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+from typing import Any, Dict, List, Union
+
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.body_3d_keypoints.hdformer.hdformer import HDFormer
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+
+class KeypointsTypes(object):
+    POSES_CAMERA = 'poses_camera'
+
+
+logger = get_logger()
+
+
+@MODELS.register_module(
+    Tasks.body_3d_keypoints, module_name=Models.body_3d_keypoints_hdformer)
+class HDFormerDetector(TorchModel):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        super().__init__(model_dir, *args, **kwargs)
+        self.model_dir = model_dir
+
+        cudnn.benchmark = True
+        self.model_path = osp.join(self.model_dir, ModelFile.TORCH_MODEL_FILE)
+        self.mean_std_2d = np.load(
+            osp.join(self.model_dir, 'mean_std_2d.npy'), allow_pickle=True)
+        self.mean_std_3d = np.load(
+            osp.join(self.model_dir, 'mean_std_3d.npy'), allow_pickle=True)
+        self.left_right_symmetry_2d = np.array(
+            [0, 4, 5, 6, 1, 2, 3, 7, 8, 9, 10, 14, 15, 16, 11, 12, 13])
+        cfg_path = osp.join(self.model_dir, ModelFile.CONFIGURATION)
+        self.cfg = Config.from_file(cfg_path)
+        if torch.cuda.is_available():
+            self.device = torch.device('cuda')
+        else:
+            self.device = torch.device('cpu')
+        self.net = HDFormer(self.cfg.model.MODEL)
+
+        self.load_model()
+        self.net = self.net.to(self.device)
+
+    def load_model(self, load_to_cpu=False):
+        pretrained_dict = torch.load(
+            self.model_path,
+            map_location=torch.device('cuda')
+            if torch.cuda.is_available() else torch.device('cpu'))
+        self.net.load_state_dict(pretrained_dict['state_dict'], strict=False)
+        self.net.eval()
+
+    def preprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """Proprocess of 2D input joints.
+
+        Args:
+            input (Dict[str, Any]): [NUM_FRAME, NUM_JOINTS, 2], input 2d human body keypoints.
+
+        Returns:
+            Dict[str, Any]: canonical 2d points and root relative joints.
+        """
+        if 'cuda' == input.device.type:
+            input = input.data.cpu().numpy()
+        elif 'cpu' == input.device.type:
+            input = input.data.numpy()
+        pose2d = input
+        num_frames, num_joints, in_channels = pose2d.shape
+        logger.info(f'2d pose frame number: {num_frames}')
+
+        # [NUM_FRAME, NUM_JOINTS, 2]
+        c = np.array(self.cfg.model.INPUT.center)
+        f = np.array(self.cfg.model.INPUT.focal_length)
+        self.window_size = self.cfg.model.INPUT.window_size
+        receptive_field = self.cfg.model.INPUT.n_frames
+
+        # split the 2D pose sequences into fixed length frames
+        inputs_2d = []
+        inputs_2d_flip = []
+        n = 0
+        indices = []
+        while n + receptive_field <= num_frames:
+            indices.append((n, n + receptive_field))
+            n += self.window_size
+        self.valid_length = n - self.window_size + receptive_field
+
+        if 0 == len(indices):
+            logger.warn(
+                f'Fail to construct test sequences, total_frames = {num_frames}, \
+                while receptive_filed ={receptive_field}')
+
+        self.mean_2d = self.mean_std_2d[0]
+        self.std_2d = self.mean_std_2d[1]
+        for (start, end) in indices:
+            data_2d = pose2d[start:end]
+            data_2d = (data_2d - 0.5 - c) / f
+            data_2d_flip = data_2d.copy()
+            data_2d_flip[:, :, 0] *= -1
+            data_2d_flip = data_2d_flip[:, self.left_right_symmetry_2d, :]
+            data_2d_flip = (data_2d_flip - self.mean_2d) / self.std_2d
+
+            data_2d = (data_2d - self.mean_2d) / self.std_2d
+            data_2d = torch.from_numpy(data_2d.transpose(
+                (2, 0, 1))).float()  # [C,T,V]
+
+            data_2d_flip = torch.from_numpy(data_2d_flip.transpose(
+                (2, 0, 1))).float()  # [C,T,V]
+
+            inputs_2d.append(data_2d)
+            inputs_2d_flip.append(data_2d_flip)
+
+        self.mean_3d = self.mean_std_3d[0]
+        self.std_3d = self.mean_std_3d[1]
+        mean_3d = torch.from_numpy(self.mean_3d).float().unsqueeze(-1)
+        mean_3d = mean_3d.permute(1, 2, 0)  # [3, 1, 17]
+        std_3d = torch.from_numpy(self.std_3d).float().unsqueeze(-1)
+        std_3d = std_3d.permute(1, 2, 0)
+
+        return {
+            'inputs_2d': inputs_2d,
+            'inputs_2d_flip': inputs_2d_flip,
+            'mean_3d': mean_3d,
+            'std_3d': std_3d
+        }
+
+    def avg_flip(self, pre, pre_flip):
+        left_right_symmetry = [
+            0, 4, 5, 6, 1, 2, 3, 7, 8, 9, 10, 14, 15, 16, 11, 12, 13
+        ]
+        pre_flip[:, 0, :, :] *= -1
+        pre_flip = pre_flip[:, :, :, left_right_symmetry]
+        pred_avg = (pre + pre_flip) / 2.
+        return pred_avg
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """3D human pose estimation.
+
+        Args:
+            input (Dict):
+                inputs_2d:  [1, NUM_FRAME, NUM_JOINTS, 2]
+
+        Returns:
+            Dict[str, Any]:
+                "camera_pose": Tensor, [1, NUM_FRAME, OUT_NUM_JOINTS, OUT_3D_FEATURE_DIM],
+                    3D human pose keypoints in camera frame.
+                "success": 3D pose estimation success or failed.
+        """
+        inputs_2d = input['inputs_2d']
+        inputs_2d_flip = input['inputs_2d_flip']
+        mean_3d = input['mean_3d']
+        std_3d = input['std_3d']
+        preds_3d = None
+        vertex_pre = None
+
+        if [] == inputs_2d:
+            predict_dict = {'success': False, KeypointsTypes.POSES_CAMERA: []}
+            return predict_dict
+
+        with torch.no_grad():
+            for i, pose_2d in enumerate(inputs_2d):
+                pose_2d = pose_2d.unsqueeze(0).cuda(non_blocking=True) \
+                    if torch.cuda.is_available() else pose_2d.unsqueeze(0)
+                pose_2d_flip = inputs_2d_flip[i]
+                pose_2d_flip = pose_2d_flip.unsqueeze(0).cuda(non_blocking=True) \
+                    if torch.cuda.is_available() else pose_2d_flip.unsqueeze(0)
+                mean_3d = mean_3d.unsqueeze(0).cuda(non_blocking=True) \
+                    if torch.cuda.is_available() else mean_3d.unsqueeze(0)
+                std_3d = std_3d.unsqueeze(0).cuda(non_blocking=True) \
+                    if torch.cuda.is_available() else std_3d.unsqueeze(0)
+
+                vertex_pre = self.net(pose_2d, mean_3d, std_3d)
+                vertex_pre_flip = self.net(pose_2d_flip, mean_3d, std_3d)
+                vertex_pre = self.avg_flip(vertex_pre, vertex_pre_flip)
+
+                # concat the prediction results for each window_size
+                predict_3d = vertex_pre.permute(
+                    0, 2, 3, 1).contiguous()[0][:self.window_size]
+                if preds_3d is None:
+                    preds_3d = predict_3d
+                else:
+                    preds_3d = torch.concat((preds_3d, predict_3d), dim=0)
+            remain_pose_results = vertex_pre.permute(
+                0, 2, 3, 1).contiguous()[0][self.window_size:]
+            preds_3d = torch.concat((preds_3d, remain_pose_results), dim=0)
+
+        preds_3d = preds_3d.unsqueeze(0)  # add batch dim
+        preds_3d = preds_3d / self.cfg.model.INPUT.res_w  # Normalize to [-1, 1]
+        predict_dict = {'success': True, KeypointsTypes.POSES_CAMERA: preds_3d}
+
+        return predict_dict
diff --git a/modelscope/models/cv/body_3d_keypoints/hdformer/skeleton.py b/modelscope/models/cv/body_3d_keypoints/hdformer/skeleton.py
new file mode 100644
index 00000000..1abf90d7
--- /dev/null
+++ b/modelscope/models/cv/body_3d_keypoints/hdformer/skeleton.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+import numpy as np
+
+
+class Skeleton:
+
+    def __init__(self, parents, joints_left, joints_right):
+        assert len(joints_left) == len(joints_right)
+
+        self._parents = np.array(parents)
+        self._joints_left = joints_left
+        self._joints_right = joints_right
+        self._compute_metadata()
+
+    def num_joints(self):
+        return len(self._parents)
+
+    def parents(self):
+        return self._parents
+
+    def has_children(self):
+        return self._has_children
+
+    def children(self):
+        return self._children
+
+    def remove_joints(self, joints_to_remove):
+        """
+        Remove the joints specified in 'joints_to_remove'.
+        """
+        valid_joints = []
+        for joint in range(len(self._parents)):
+            if joint not in joints_to_remove:
+                valid_joints.append(joint)
+
+        for i in range(len(self._parents)):
+            while self._parents[i] in joints_to_remove:
+                self._parents[i] = self._parents[self._parents[i]]
+
+        index_offsets = np.zeros(len(self._parents), dtype=int)
+        new_parents = []
+        for i, parent in enumerate(self._parents):
+            if i not in joints_to_remove:
+                new_parents.append(parent - index_offsets[parent])
+            else:
+                index_offsets[i:] += 1
+        self._parents = np.array(new_parents)
+
+        if self._joints_left is not None:
+            new_joints_left = []
+            for joint in self._joints_left:
+                if joint in valid_joints:
+                    new_joints_left.append(joint - index_offsets[joint])
+            self._joints_left = new_joints_left
+        if self._joints_right is not None:
+            new_joints_right = []
+            for joint in self._joints_right:
+                if joint in valid_joints:
+                    new_joints_right.append(joint - index_offsets[joint])
+            self._joints_right = new_joints_right
+
+        self._compute_metadata()
+
+        return valid_joints
+
+    def joints_left(self):
+        return self._joints_left
+
+    def joints_right(self):
+        return self._joints_right
+
+    def _compute_metadata(self):
+        self._has_children = np.zeros(len(self._parents)).astype(bool)
+        for i, parent in enumerate(self._parents):
+            if parent != -1:
+                self._has_children[parent] = True
+
+        self._children = []
+        for i, parent in enumerate(self._parents):
+            self._children.append([])
+        for i, parent in enumerate(self._parents):
+            if parent != -1:
+                self._children[parent].append(i)
+
+
+def get_skeleton():
+    skeleton = Skeleton(
+        parents=[
+            -1, 0, 1, 2, 3, 4, 0, 6, 7, 8, 9, 0, 11, 12, 13, 14, 12, 16, 17,
+            18, 19, 20, 19, 22, 12, 24, 25, 26, 27, 28, 27, 30
+        ],
+        joints_left=[6, 7, 8, 9, 10, 16, 17, 18, 19, 20, 21, 22, 23],
+        joints_right=[1, 2, 3, 4, 5, 24, 25, 26, 27, 28, 29, 30, 31])
+    # Bring the skeleton to 17 joints instead of the original 32
+    skeleton.remove_joints(
+        [4, 5, 9, 10, 11, 16, 20, 21, 22, 23, 24, 28, 29, 30, 31])
+    # Rewire shoulders to the correct parents
+    skeleton._parents[11] = 8
+    skeleton._parents[14] = 8
+    # Fix children error
+    skeleton._children[7] = [8]
+    skeleton._children[8] = [9, 11, 14]
+    return skeleton
diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/resnet.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/resnet.py
index a5862a58..190d1570 100644
--- a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/resnet.py
+++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/resnet.py
@@ -51,7 +51,8 @@ class ResNet(nn.Module):
         zero_init_residual (bool): Whether to use zero init for last norm layer
             in resblocks to let them behave as identity.
 
-    Example:
+    Examples:
+
         >>> from mmdet.models import ResNet
         >>> import torch
         >>> self = ResNet(depth=18)
diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/scrfd_head.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/scrfd_head.py
index 77ec99cf..e43ed6e5 100755
--- a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/scrfd_head.py
+++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/scrfd_head.py
@@ -83,6 +83,7 @@ class SCRFDHead(AnchorHead):
         reg_max (int): Max value of integral set :math: `{0, ..., reg_max}`
             in QFL setting. Default: 16.
     Example:
+
         >>> self = GFLHead(11, 7)
         >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]]
         >>> cls_quality_score, bbox_pred = self.forward(feats)
diff --git a/modelscope/models/cv/face_detection/scrfd/preprocessor.py b/modelscope/models/cv/face_detection/scrfd/preprocessor.py
index 67f58c59..4e400b95 100644
--- a/modelscope/models/cv/face_detection/scrfd/preprocessor.py
+++ b/modelscope/models/cv/face_detection/scrfd/preprocessor.py
@@ -50,17 +50,16 @@ class SCRFDPreprocessor(Preprocessor):
         Args:
             data (str or dict):  image path or data dict containing following info:
                 filename, ori_filename, img, img_shape, ori_shape, img_fields
-                example:
-                    ```python
-                    {
-                        "filename": "xxx.jpg"
-                        "ori_filename": "xxx.jpg",
-                        "img": np.ndarray,
-                        "img_shape": (300, 300, 3)
-                        "ori_shape": (300, 300, 3)
-                        "img_fields": "img"
-                    }
-                    ```
+                Example:
+                    >>> {
+                    >>>     "filename": "xxx.jpg"
+                    >>>     "ori_filename": "xxx.jpg",
+                    >>>     "img": np.ndarray,
+                    >>>     "img_shape": (300, 300, 3)
+                    >>>     "ori_shape": (300, 300, 3)
+                    >>>     "img_fields": "img"
+                    >>> }
+
         Returns:
             Dict[str, Any]: the preprocessed data
         """
diff --git a/modelscope/models/cv/face_emotion/efficient/model.py b/modelscope/models/cv/face_emotion/efficient/model.py
index db303016..19ab4c3c 100644
--- a/modelscope/models/cv/face_emotion/efficient/model.py
+++ b/modelscope/models/cv/face_emotion/efficient/model.py
@@ -212,18 +212,18 @@ class EfficientNet(nn.Module):
         Returns:
             Dictionary of last intermediate features
             with reduction levels i in [1, 2, 3, 4, 5].
-            Example:
-                >>> import torch
-                >>> from efficientnet.model import EfficientNet
-                >>> inputs = torch.rand(1, 3, 224, 224)
-                >>> model = EfficientNet.from_pretrained('efficientnet-b0')
-                >>> endpoints = model.extract_endpoints(inputs)
-                >>> print(endpoints['reduction_1'].shape)  # torch.Size([1, 16, 112, 112])
-                >>> print(endpoints['reduction_2'].shape)  # torch.Size([1, 24, 56, 56])
-                >>> print(endpoints['reduction_3'].shape)  # torch.Size([1, 40, 28, 28])
-                >>> print(endpoints['reduction_4'].shape)  # torch.Size([1, 112, 14, 14])
-                >>> print(endpoints['reduction_5'].shape)  # torch.Size([1, 320, 7, 7])
-                >>> print(endpoints['reduction_6'].shape)  # torch.Size([1, 1280, 7, 7])
+        Example:
+            >>> import torch
+            >>> from efficientnet.model import EfficientNet
+            >>> inputs = torch.rand(1, 3, 224, 224)
+            >>> model = EfficientNet.from_pretrained('efficientnet-b0')
+            >>> endpoints = model.extract_endpoints(inputs)
+            >>> print(endpoints['reduction_1'].shape)  # torch.Size([1, 16, 112, 112])
+            >>> print(endpoints['reduction_2'].shape)  # torch.Size([1, 24, 56, 56])
+            >>> print(endpoints['reduction_3'].shape)  # torch.Size([1, 40, 28, 28])
+            >>> print(endpoints['reduction_4'].shape)  # torch.Size([1, 112, 14, 14])
+            >>> print(endpoints['reduction_5'].shape)  # torch.Size([1, 320, 7, 7])
+            >>> print(endpoints['reduction_6'].shape)  # torch.Size([1, 1280, 7, 7])
         """
         endpoints = dict()
 
diff --git a/modelscope/models/cv/face_recognition/torchkit/rts_backbone.py b/modelscope/models/cv/face_recognition/torchkit/rts_backbone.py
index 6bd627eb..812f8af8 100644
--- a/modelscope/models/cv/face_recognition/torchkit/rts_backbone.py
+++ b/modelscope/models/cv/face_recognition/torchkit/rts_backbone.py
@@ -18,7 +18,7 @@ from modelscope.utils.logger import get_logger
 logger = get_logger()
 
 
-@MODELS.register_module('face-recognition-ood', 'rts-backbone')
+@MODELS.register_module('face-recognition', 'rts-backbone')
 class RTSBackbone(TorchModel):
 
     def __init__(self, *args, **kwargs):
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/__init__.py b/modelscope/models/cv/face_reconstruction/__init__.py
similarity index 100%
rename from modelscope/models/cv/realtime_object_detection/yolox/__init__.py
rename to modelscope/models/cv/face_reconstruction/__init__.py
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/data/__init__.py b/modelscope/models/cv/face_reconstruction/models/__init__.py
similarity index 100%
rename from modelscope/models/cv/realtime_object_detection/yolox/data/__init__.py
rename to modelscope/models/cv/face_reconstruction/models/__init__.py
diff --git a/modelscope/models/cv/face_reconstruction/models/bfm.py b/modelscope/models/cv/face_reconstruction/models/bfm.py
new file mode 100644
index 00000000..be4455bf
--- /dev/null
+++ b/modelscope/models/cv/face_reconstruction/models/bfm.py
@@ -0,0 +1,591 @@
+# Part of the implementation is borrowed and modified from Deep3DFaceRecon_pytorch,
+# publicly available at https://github.com/sicxu/Deep3DFaceRecon_pytorch
+import os
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from scipy.io import loadmat
+
+from ..utils import read_obj, transferBFM09
+
+
+def perspective_projection(focal, center):
+    # return p.T (N, 3) @ (3, 3)
+    return np.array([focal, 0, center, 0, focal, center, 0, 0,
+                     1]).reshape([3, 3]).astype(np.float32).transpose()
+
+
+class SH:
+
+    def __init__(self):
+        self.a = [np.pi, 2 * np.pi / np.sqrt(3.), 2 * np.pi / np.sqrt(8.)]
+        self.c = [
+            1 / np.sqrt(4 * np.pi),
+            np.sqrt(3.) / np.sqrt(4 * np.pi),
+            3 * np.sqrt(5.) / np.sqrt(12 * np.pi)
+        ]
+
+
+class ParametricFaceModel:
+
+    def __init__(self,
+                 bfm_folder='./asset/BFM',
+                 recenter=True,
+                 camera_distance=10.,
+                 init_lit=np.array([0.8, 0, 0, 0, 0, 0, 0, 0, 0]),
+                 focal=1015.,
+                 center=112.,
+                 is_train=True,
+                 default_name='BFM_model_front.mat'):
+
+        if not os.path.isfile(os.path.join(bfm_folder, default_name)):
+            transferBFM09(bfm_folder)
+        model = loadmat(os.path.join(bfm_folder, default_name))
+        # mean face shape. [3*N,1]
+        self.mean_shape = model['meanshape'].astype(np.float32)
+
+        # identity basis. [3*N,80]
+        self.id_base = model['idBase'].astype(np.float32)
+
+        # expression basis. [3*N,64]
+        self.exp_base = model['exBase'].astype(np.float32)
+
+        # mean face texture. [3*N,1] (0-255)
+        self.mean_tex = model['meantex'].astype(np.float32)
+
+        # texture basis. [3*N,80]
+        self.tex_base = model['texBase'].astype(np.float32)
+
+        # face indices for each vertex that lies in. starts from 0. [N,8]
+        self.point_buf = model['point_buf'].astype(np.int64) - 1
+
+        # vertex indices for each face. starts from 0. [F,3]
+        self.face_buf = model['tri'].astype(np.int64) - 1
+
+        # vertex indices for 68 landmarks. starts from 0. [68,1]
+        self.keypoints = np.squeeze(model['keypoints']).astype(np.int64) - 1
+
+        self.mean_shape_ori = model['meanshape_ori'].astype(np.float32)
+        self.bfm_keep_inds = model['bfm_keep_inds'][0]
+        self.nose_reduced_part = model['nose_reduced_part'].reshape(
+            (1, -1)) - self.mean_shape
+        self.nonlinear_UVs = model['nonlinear_UVs']
+
+        if default_name == 'head_model_for_maas.mat':
+            self.ours_hair_area_inds = model['hair_area_inds'][0]
+
+            self.mean_tex = self.mean_tex.reshape(1, -1, 3)
+            mean_tex_keep = self.mean_tex[:, self.bfm_keep_inds]
+            self.mean_tex[:, :len(self.bfm_keep_inds)] = mean_tex_keep
+            self.mean_tex[:,
+                          len(self.bfm_keep_inds):] = np.array([200, 146,
+                                                                118])[None,
+                                                                      None]
+            self.mean_tex[:, self.ours_hair_area_inds] = 40.0
+            self.mean_tex = self.mean_tex.reshape(1, -1)
+            self.mean_tex = np.ascontiguousarray(self.mean_tex)
+
+            self.tex_base = self.tex_base.reshape(-1, 3, 80)
+            tex_base_keep = self.tex_base[self.bfm_keep_inds]
+            self.tex_base[:len(self.bfm_keep_inds)] = tex_base_keep
+            self.tex_base[len(self.bfm_keep_inds):] = 0.0
+            self.tex_base = self.tex_base.reshape(-1, 80)
+            self.tex_base = np.ascontiguousarray(self.tex_base)
+
+            self.point_buf = self.point_buf[:, :8] + 1
+
+            self.neck_adjust_part = model['neck_adjust_part'].reshape(
+                (1, -1)) - self.mean_shape
+            self.eyes_adjust_part = model['eyes_adjust_part'].reshape(
+                (1, -1)) - self.mean_shape
+
+            self.eye_corner_inds = model['eye_corner_inds'][0]
+            self.eye_corner_lines = model['eye_corner_lines']
+
+        if recenter:
+            mean_shape = self.mean_shape.reshape([-1, 3])
+            mean_shape_ori = self.mean_shape_ori.reshape([-1, 3])
+            mean_shape = mean_shape - np.mean(
+                mean_shape_ori[:35709, ...], axis=0, keepdims=True)
+            self.mean_shape = mean_shape.reshape([-1, 1])
+
+        self.center = center
+        self.persc_proj = perspective_projection(focal, self.center)
+        self.device = 'cpu'
+        self.camera_distance = camera_distance
+        self.SH = SH()
+        self.init_lit = init_lit.reshape([1, 1, -1]).astype(np.float32)
+
+    def to(self, device):
+        self.device = device
+        for key, value in self.__dict__.items():
+            if type(value).__module__ == np.__name__:
+                setattr(self, key, torch.tensor(value).to(device))
+
+    def compute_shape(self,
+                      id_coeff,
+                      exp_coeff,
+                      nose_coeff=0.0,
+                      neck_coeff=0.0,
+                      eyes_coeff=0.0):
+        """
+        Return:
+            face_shape       -- torch.tensor, size (B, N, 3)
+
+        Parameters:
+            id_coeff         -- torch.tensor, size (B, 80), identity coeffs
+            exp_coeff        -- torch.tensor, size (B, 64), expression coeffs
+        """
+        batch_size = id_coeff.shape[0]
+        id_part = torch.einsum('ij,aj->ai', self.id_base, id_coeff)
+        exp_part = torch.einsum('ij,aj->ai', self.exp_base, exp_coeff)
+        face_shape = id_part + exp_part + self.mean_shape.reshape([1, -1])
+
+        if nose_coeff != 0:
+            face_shape = face_shape + nose_coeff * self.nose_reduced_part
+        if neck_coeff != 0:
+            face_shape = face_shape + neck_coeff * self.neck_adjust_part
+        if eyes_coeff != 0 and self.eyes_adjust_part is not None:
+            face_shape = face_shape + eyes_coeff * self.eyes_adjust_part
+
+        return face_shape.reshape([batch_size, -1, 3])
+
+    def compute_texture(self, tex_coeff, normalize=True):
+        """
+        Return:
+            face_texture     -- torch.tensor, size (B, N, 3), in RGB order, range (0, 1.)
+
+        Parameters:
+            tex_coeff        -- torch.tensor, size (B, 80)
+        """
+        batch_size = tex_coeff.shape[0]
+        face_texture = torch.einsum('ij,aj->ai', self.tex_base,
+                                    tex_coeff) + self.mean_tex
+        if normalize:
+            face_texture = face_texture / 255.
+        return face_texture.reshape([batch_size, -1, 3])
+
+    def compute_norm(self, face_shape):
+        """
+        Return:
+            vertex_norm      -- torch.tensor, size (B, N, 3)
+
+        Parameters:
+            face_shape       -- torch.tensor, size (B, N, 3)
+        """
+
+        v1 = face_shape[:, self.face_buf[:, 0]]
+        v2 = face_shape[:, self.face_buf[:, 1]]
+        v3 = face_shape[:, self.face_buf[:, 2]]
+        e1 = v1 - v2
+        e2 = v2 - v3
+        face_norm = torch.cross(e1, e2, dim=-1)
+        face_norm = F.normalize(face_norm, dim=-1, p=2)
+        face_norm = torch.cat(
+            [face_norm,
+             torch.zeros(face_norm.shape[0], 1, 3).to(self.device)],
+            dim=1)
+
+        vertex_norm = torch.sum(face_norm[:, self.point_buf], dim=2)
+        vertex_norm = F.normalize(vertex_norm, dim=-1, p=2)
+        return vertex_norm
+
+    def compute_color(self, face_texture, face_norm, gamma):
+        """
+        Return:
+            face_color       -- torch.tensor, size (B, N, 3), range (0, 1.)
+
+        Parameters:
+            face_texture     -- torch.tensor, size (B, N, 3), from texture model, range (0, 1.)
+            face_norm        -- torch.tensor, size (B, N, 3), rotated face normal
+            gamma            -- torch.tensor, size (B, 27), SH coeffs
+        """
+        batch_size = gamma.shape[0]
+        a, c = self.SH.a, self.SH.c
+        gamma = gamma.reshape([batch_size, 3, 9])
+        gamma = gamma + self.init_lit
+        gamma = gamma.permute(0, 2, 1)
+
+        y1 = a[0] * c[0] * torch.ones_like(face_norm[..., :1]).to(self.device)
+        y2 = -a[1] * c[1] * face_norm[..., 1:2]
+        y3 = a[1] * c[1] * face_norm[..., 2:]
+        y4 = -a[1] * c[1] * face_norm[..., :1]
+        y5 = a[2] * c[2] * face_norm[..., :1] * face_norm[..., 1:2]
+        y6 = -a[2] * c[2] * face_norm[..., 1:2] * face_norm[..., 2:]
+        y7 = 0.5 * a[2] * c[2] / np.sqrt(3.) * (3 * face_norm[..., 2:]**2 - 1)
+        y8 = -a[2] * c[2] * face_norm[..., :1] * face_norm[..., 2:]
+        y9 = 0.5 * a[2] * c[2] * (
+            face_norm[..., :1]**2 - face_norm[..., 1:2]**2)
+        Y = torch.cat([y1, y2, y3, y4, y5, y6, y7, y8, y9], dim=-1)
+        r = Y @ gamma[..., :1]
+        g = Y @ gamma[..., 1:2]
+        b = Y @ gamma[..., 2:]
+        face_color = torch.cat([r, g, b], dim=-1) * face_texture
+        return face_color
+
+    def compute_rotation(self, angles):
+        """
+        Return:
+            rot              -- torch.tensor, size (B, 3, 3) pts @ trans_mat
+
+        Parameters:
+            angles           -- torch.tensor, size (B, 3), radian
+        """
+
+        batch_size = angles.shape[0]
+        ones = torch.ones([batch_size, 1]).to(self.device)
+        zeros = torch.zeros([batch_size, 1]).to(self.device)
+        x, y, z = angles[:, :1], angles[:, 1:2], angles[:, 2:],
+
+        value_list = [
+            ones, zeros, zeros, zeros,
+            torch.cos(x), -torch.sin(x), zeros,
+            torch.sin(x),
+            torch.cos(x)
+        ]
+        rot_x = torch.cat(value_list, dim=1).reshape([batch_size, 3, 3])
+
+        value_list = [
+            torch.cos(y), zeros,
+            torch.sin(y), zeros, ones, zeros, -torch.sin(y), zeros,
+            torch.cos(y)
+        ]
+        rot_y = torch.cat(value_list, dim=1).reshape([batch_size, 3, 3])
+
+        value_list = [
+            torch.cos(z), -torch.sin(z), zeros,
+            torch.sin(z),
+            torch.cos(z), zeros, zeros, zeros, ones
+        ]
+        rot_z = torch.cat(value_list, dim=1).reshape([batch_size, 3, 3])
+
+        rot = rot_z @ rot_y @ rot_x
+        return rot.permute(0, 2, 1)
+
+    def to_camera(self, face_shape):
+        face_shape[..., -1] = self.camera_distance - face_shape[..., -1]
+        return face_shape
+
+    def to_image(self, face_shape):
+        """
+        Return:
+            face_proj        -- torch.tensor, size (B, N, 2), y direction is opposite to v direction
+
+        Parameters:
+            face_shape       -- torch.tensor, size (B, N, 3)
+        """
+        # to image_plane
+        face_proj = face_shape @ self.persc_proj
+        face_proj = face_proj[..., :2] / face_proj[..., 2:]
+
+        return face_proj
+
+    def transform(self, face_shape, rot, trans):
+        """
+        Return:
+            face_shape       -- torch.tensor, size (B, N, 3) pts @ rot + trans
+
+        Parameters:
+            face_shape       -- torch.tensor, size (B, N, 3)
+            rot              -- torch.tensor, size (B, 3, 3)
+            trans            -- torch.tensor, size (B, 3)
+        """
+        return face_shape @ rot + trans.unsqueeze(1)
+
+    def get_landmarks(self, face_proj):
+        """
+        Return:
+            face_lms         -- torch.tensor, size (B, 68, 2)
+
+        Parameters:
+            face_proj       -- torch.tensor, size (B, N, 2)
+        """
+        return face_proj[:, self.keypoints]
+
+    def split_coeff(self, coeffs):
+        """
+        Return:
+            coeffs_dict     -- a dict of torch.tensors
+
+        Parameters:
+            coeffs          -- torch.tensor, size (B, 256)
+        """
+        if type(coeffs) == dict and 'id' in coeffs:
+            return coeffs
+
+        id_coeffs = coeffs[:, :80]
+        exp_coeffs = coeffs[:, 80:144]
+        tex_coeffs = coeffs[:, 144:224]
+        angles = coeffs[:, 224:227]
+        gammas = coeffs[:, 227:254]
+        translations = coeffs[:, 254:]
+        return {
+            'id': id_coeffs,
+            'exp': exp_coeffs,
+            'tex': tex_coeffs,
+            'angle': angles,
+            'gamma': gammas,
+            'trans': translations
+        }
+
+    def merge_coeff(self, coeffs):
+        """
+        Return:
+            coeffs_dict     -- a dict of torch.tensors
+
+        Parameters:
+            coeffs          -- torch.tensor, size (B, 256)
+        """
+        names = ['id', 'exp', 'tex', 'angle', 'gamma', 'trans']
+        coeffs_merge = []
+        for name in names:
+            coeffs_merge.append(coeffs[name].detach())
+        coeffs_merge = torch.cat(coeffs_merge, dim=1)
+
+        return coeffs_merge
+
+    def compute_for_render(self, coeffs, coeffs_mvs=None):
+        """
+        Return:
+            face_vertex     -- torch.tensor, size (B, N, 3), in camera coordinate
+            face_color      -- torch.tensor, size (B, N, 3), in RGB order
+            landmark        -- torch.tensor, size (B, 68, 2), y direction is opposite to v direction
+        Parameters:
+            coeffs          -- torch.tensor, size (B, 257)
+        """
+        if type(coeffs) == dict:
+            coef_dict = coeffs
+        elif type(coeffs) == torch.Tensor:
+            coef_dict = self.split_coeff(coeffs)
+
+        face_shape = self.compute_shape(
+            coef_dict['id'], coef_dict['exp'], nose_coeff=0.4, neck_coeff=0.6)
+
+        rotation = self.compute_rotation(coef_dict['angle'])
+
+        face_shape_transformed = self.transform(face_shape, rotation,
+                                                coef_dict['trans'])
+        face_vertex = self.to_camera(face_shape_transformed)
+        face_vertex_ori = self.to_camera(face_shape)
+
+        face_proj = self.to_image(face_vertex)
+        landmark = self.get_landmarks(face_proj)
+
+        face_texture = self.compute_texture(coef_dict['tex'])
+        face_norm = self.compute_norm(face_shape)
+        face_norm_roted = face_norm @ rotation
+        face_color = self.compute_color(face_texture, face_norm_roted,
+                                        coef_dict['gamma'])
+
+        if coeffs_mvs is not None:
+            mvs_face_shape = self.compute_shape(coeffs_mvs['id'],
+                                                coeffs_mvs['exp'])
+
+            mvs_face_shape_transformed = self.transform(
+                mvs_face_shape, rotation, coef_dict['trans'])
+            mvs_face_vertex = self.to_camera(mvs_face_shape_transformed)
+            return face_vertex, face_texture, face_color, landmark, mvs_face_vertex
+        else:
+            return face_vertex, face_texture, face_color, landmark, face_vertex_ori
+
+    def reverse_recenter(self, face_shape):
+        batch_size = face_shape.shape[0]
+        face_shape = face_shape.reshape([-1, 3])
+        mean_shape_ori = self.mean_shape_ori.reshape([-1, 3])
+        face_shape = face_shape + torch.mean(
+            mean_shape_ori[:35709, ...], dim=0, keepdim=True)
+        face_shape = face_shape.reshape([batch_size, -1, 3])
+        return face_shape
+
+    def add_nonlinear_offset_eyes(self, face_shape, shape_offset):
+        assert face_shape.shape[0] == 1 and shape_offset.shape[0] == 1
+        face_shape = face_shape[0]
+        shape_offset = shape_offset[0]
+
+        corner_inds = self.eye_corner_inds
+        lines = self.eye_corner_lines
+
+        corner_shape = face_shape[-625:, :]
+        corner_offset = shape_offset[corner_inds]
+        for i in range(len(lines)):
+            corner_shape[lines[i]] += corner_offset[i][None, ...]
+        face_shape[-625:, :] = corner_shape
+
+        l_eye_landmarks = [11540, 11541]
+        r_eye_landmarks = [4271, 4272]
+
+        l_eye_offset = torch.mean(
+            shape_offset[l_eye_landmarks], dim=0, keepdim=True)
+        face_shape[37082:37082 + 609] += l_eye_offset
+
+        r_eye_offset = torch.mean(
+            shape_offset[r_eye_landmarks], dim=0, keepdim=True)
+        face_shape[37082 + 609:37082 + 609 + 608] += r_eye_offset
+
+        face_shape = face_shape[None, ...]
+
+        return face_shape
+
+    def add_nonlinear_offset(self, face_shape, shape_offset_uv, UVs):
+        """
+
+        Args:
+            face_shape: torch.tensor, size (1, N, 3)
+            shape_offset_uv: torch.tensor, size (1, h, w, 3)
+            UVs: torch.tensor, size (N, 2)
+
+        Returns:
+
+        """
+        assert face_shape.shape[0] == 1 and shape_offset_uv.shape[0] == 1
+        face_shape = face_shape[0]
+        shape_offset_uv = shape_offset_uv[0]
+
+        h, w = shape_offset_uv.shape[:2]
+        UVs_coords = UVs.clone()
+        UVs_coords[:, 0] *= w
+        UVs_coords[:, 1] *= h
+        UVs_coords_int = torch.floor(UVs_coords)
+        UVs_coords_float = UVs_coords - UVs_coords_int
+        UVs_coords_int = UVs_coords_int.long()
+
+        shape_lt = shape_offset_uv[(h - 1
+                                    - UVs_coords_int[:, 1]).clamp(0, h - 1),
+                                   UVs_coords_int[:, 0].clamp(0, w - 1)]
+        shape_lb = shape_offset_uv[(h - UVs_coords_int[:, 1]).clamp(0, h - 1),
+                                   UVs_coords_int[:, 0].clamp(0, w - 1)]
+        shape_rt = shape_offset_uv[(h - 1
+                                    - UVs_coords_int[:, 1]).clamp(0, h - 1),
+                                   (UVs_coords_int[:, 0] + 1).clamp(0, w - 1)]
+        shape_rb = shape_offset_uv[(h - UVs_coords_int[:, 1]).clamp(0, h - 1),
+                                   (UVs_coords_int[:, 0] + 1).clamp(0, w - 1)]
+
+        value1 = shape_lt * (
+            1 - UVs_coords_float[:, :1]) * UVs_coords_float[:, 1:]
+        value2 = shape_lb * (1 - UVs_coords_float[:, :1]) * (
+            1 - UVs_coords_float[:, 1:])
+        value3 = shape_rt * UVs_coords_float[:, :1] * UVs_coords_float[:, 1:]
+        value4 = shape_rb * UVs_coords_float[:, :1] * (
+            1 - UVs_coords_float[:, 1:])
+        offset_shape = value1 + value2 + value3 + value4  # (N, 3)
+
+        face_shape = (face_shape + offset_shape)[None, ...]
+
+        return face_shape, offset_shape[None, ...]
+
+    def compute_for_render_train_nonlinear(self,
+                                           coeffs,
+                                           shape_offset_uv,
+                                           tex_offset_uv,
+                                           UVs,
+                                           reverse_recenter=True):
+        if type(coeffs) == dict:
+            coef_dict = coeffs
+        elif type(coeffs) == torch.Tensor:
+            coef_dict = self.split_coeff(coeffs)
+
+        face_shape = self.compute_shape(coef_dict['id'],
+                                        coef_dict['exp'])  # (1, n, 3)
+        if reverse_recenter:
+            face_shape_ori_noRecenter = self.reverse_recenter(
+                face_shape.clone())
+        else:
+            face_shape_ori_noRecenter = face_shape.clone()
+        face_vertex_ori = self.to_camera(face_shape_ori_noRecenter)
+
+        face_shape, shape_offset = self.add_nonlinear_offset(
+            face_shape, shape_offset_uv, UVs[:35709, :])  # (1, n, 3)
+        if reverse_recenter:
+            face_shape_offset_noRecenter = self.reverse_recenter(
+                face_shape.clone())
+        else:
+            face_shape_offset_noRecenter = face_shape.clone()
+        face_vertex_offset = self.to_camera(face_shape_offset_noRecenter)
+
+        rotation = self.compute_rotation(coef_dict['angle'])
+
+        face_shape_transformed = self.transform(face_shape, rotation,
+                                                coef_dict['trans'])
+        face_vertex = self.to_camera(face_shape_transformed)
+
+        face_proj = self.to_image(face_vertex)
+        landmark = self.get_landmarks(face_proj)
+
+        face_texture = self.compute_texture(coef_dict['tex'])  # (1, n, 3)
+        face_texture, texture_offset = self.add_nonlinear_offset(
+            face_texture, tex_offset_uv, UVs[:35709, :])
+        face_norm = self.compute_norm(face_shape)
+        face_norm_roted = face_norm @ rotation
+        face_color = self.compute_color(face_texture, face_norm_roted,
+                                        coef_dict['gamma'])
+
+        return face_vertex, face_texture, face_color, landmark, face_vertex_ori, face_vertex_offset, face_proj
+
+    def compute_for_render_nonlinear_full(self,
+                                          coeffs,
+                                          shape_offset_uv,
+                                          UVs,
+                                          nose_coeff=0.0,
+                                          eyes_coeff=0.0):
+        if type(coeffs) == dict:
+            coef_dict = coeffs
+        elif type(coeffs) == torch.Tensor:
+            coef_dict = self.split_coeff(coeffs)
+
+        face_shape = self.compute_shape(
+            coef_dict['id'],
+            coef_dict['exp'],
+            nose_coeff=nose_coeff,
+            neck_coeff=0.6,
+            eyes_coeff=eyes_coeff)  # (1, n, 3)
+        face_vertex_ori = self.to_camera(face_shape.clone())
+
+        face_shape[:, :35241, :], shape_offset = self.add_nonlinear_offset(
+            face_shape[:, :35241, :], shape_offset_uv,
+            UVs[:35709, :][self.bfm_keep_inds])
+        face_shape = self.add_nonlinear_offset_eyes(face_shape, shape_offset)
+        face_shape_noRecenter = self.reverse_recenter(face_shape.clone())
+        face_vertex_offset = self.to_camera(face_shape_noRecenter)
+
+        rotation = self.compute_rotation(coef_dict['angle'])
+
+        face_shape_transformed = self.transform(face_shape, rotation,
+                                                coef_dict['trans'])
+        face_vertex = self.to_camera(face_shape_transformed)
+
+        return face_vertex, face_vertex_ori, face_vertex_offset
+
+    def compute_for_render_train(self, coeffs):
+        """
+        Return:
+            face_vertex     -- torch.tensor, size (B, N, 3), in camera coordinate
+            face_color      -- torch.tensor, size (B, N, 3), in RGB order
+            landmark        -- torch.tensor, size (B, 68, 2), y direction is opposite to v direction
+        Parameters:
+            coeffs          -- torch.tensor, size (B, 257)
+        """
+        if type(coeffs) == dict:
+            coef_dict = coeffs
+        elif type(coeffs) == torch.Tensor:
+            coef_dict = self.split_coeff(coeffs)
+
+        face_shape = self.compute_shape(coef_dict['id'], coef_dict['exp'])
+        uv_geometry = self.render.world2uv(face_shape)
+
+        rotation = self.compute_rotation(coef_dict['angle'])
+
+        face_shape_transformed = self.transform(face_shape, rotation,
+                                                coef_dict['trans'])
+        face_vertex = self.to_camera(face_shape_transformed)
+
+        face_proj = self.to_image(face_vertex)
+        landmark = self.get_landmarks(face_proj)
+
+        face_texture = self.compute_texture(coef_dict['tex'])
+        face_norm = self.compute_norm(face_shape)
+        face_norm_roted = face_norm @ rotation
+        face_color = self.compute_color(face_texture, face_norm_roted,
+                                        coef_dict['gamma'])
+
+        return face_vertex, face_texture, face_color, landmark, uv_geometry
diff --git a/modelscope/models/cv/face_reconstruction/models/facelandmark/__init__.py b/modelscope/models/cv/face_reconstruction/models/facelandmark/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/face_reconstruction/models/facelandmark/large_base_lmks_infer.py b/modelscope/models/cv/face_reconstruction/models/facelandmark/large_base_lmks_infer.py
new file mode 100644
index 00000000..b9e329ee
--- /dev/null
+++ b/modelscope/models/cv/face_reconstruction/models/facelandmark/large_base_lmks_infer.py
@@ -0,0 +1,91 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import numpy as np
+import torch
+
+from .nets.large_base_lmks_net import LargeBaseLmksNet
+
+BASE_LANDMARK_NUM = 106
+INPUT_SIZE = 224
+ENLARGE_RATIO = 1.35
+
+
+class LargeBaseLmkInfer:
+
+    @staticmethod
+    def model_preload(model_path, use_gpu=True):
+        model = LargeBaseLmksNet(infer=False)
+        # using gpu
+        if use_gpu:
+            model = model.cuda()
+
+        checkpoint = []
+        if use_gpu:
+            checkpoint = torch.load(model_path, map_location='cuda')
+        else:
+            checkpoint = torch.load(model_path, map_location='cpu')
+
+        model.load_state_dict(
+            {
+                k.replace('module.', ''): v
+                for k, v in checkpoint['state_dict'].items()
+            },
+            strict=False)
+        model.eval()
+        return model
+
+    @staticmethod
+    def process_img(model, image, use_gpu=True):
+        img_resize = image
+
+        img_resize = (img_resize
+                      - [103.94, 116.78, 123.68]) / 255.0  # important
+        img_resize = img_resize.transpose([2, 0, 1])
+
+        if use_gpu:
+            img_resize = torch.from_numpy(img_resize).cuda()
+        else:
+            img_resize = torch.from_numpy(img_resize)
+
+        w_new = INPUT_SIZE
+        h_new = INPUT_SIZE
+        img_in = torch.zeros([1, 3, h_new, w_new], dtype=torch.float32)
+        if use_gpu:
+            img_in = img_in.cuda()
+
+        img_in[0, :] = img_resize
+
+        with torch.no_grad():
+            output = model(img_in)
+            output = output * INPUT_SIZE
+
+        if use_gpu:
+            output = output.cpu().numpy()
+        else:
+            output = output.numpy()
+
+        return output
+
+    @staticmethod
+    def smooth(cur_lmks, prev_lmks):
+        smooth_lmks = np.zeros((106, 2))
+
+        cur_rect_x1 = np.min(cur_lmks[:, 0])
+        cur_rect_x2 = np.max(cur_lmks[:, 0])
+
+        smooth_param = 60.0
+        factor = smooth_param / (cur_rect_x1 - cur_rect_x2)
+        for i in range(BASE_LANDMARK_NUM):
+            weightX = np.exp(factor * np.abs(cur_lmks[i][0] - prev_lmks[i][0]))
+            weightY = np.exp(factor * np.abs(cur_lmks[i][1] - prev_lmks[i][1]))
+
+            smooth_lmks[i][0] = (
+                1 - weightX) * cur_lmks[i][0] + weightX * prev_lmks[i][0]
+            smooth_lmks[i][1] = (
+                1 - weightY) * cur_lmks[i][1] + weightY * prev_lmks[i][1]
+
+        return smooth_lmks
+
+    @staticmethod
+    def infer_img(img, model, use_gpu=True):
+        lmks = LargeBaseLmkInfer.process_img(model, img, use_gpu)
+        return lmks
diff --git a/modelscope/models/cv/face_reconstruction/models/facelandmark/large_model_infer.py b/modelscope/models/cv/face_reconstruction/models/facelandmark/large_model_infer.py
new file mode 100644
index 00000000..c61a0d13
--- /dev/null
+++ b/modelscope/models/cv/face_reconstruction/models/facelandmark/large_model_infer.py
@@ -0,0 +1,430 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+
+import cv2
+import numpy as np
+import torch
+
+from modelscope.models.cv.skin_retouching.retinaface.predict_single import \
+    Model
+from ...utils import image_warp_grid1, spread_flow
+from .large_base_lmks_infer import LargeBaseLmkInfer
+
+INPUT_SIZE = 224
+ENLARGE_RATIO = 1.35
+
+
+def resize_on_long_side(img, long_side=800):
+    src_height = img.shape[0]
+    src_width = img.shape[1]
+
+    if src_height > src_width:
+        scale = long_side * 1.0 / src_height
+        _img = cv2.resize(
+            img, (int(src_width * scale), long_side),
+            interpolation=cv2.INTER_CUBIC)
+
+    else:
+        scale = long_side * 1.0 / src_width
+        _img = cv2.resize(
+            img, (long_side, int(src_height * scale)),
+            interpolation=cv2.INTER_CUBIC)
+
+    return _img, scale
+
+
+def draw_line(im, points, color, stroke_size=2, closed=False):
+    points = points.astype(np.int32)
+    for i in range(len(points) - 1):
+        cv2.line(im, tuple(points[i]), tuple(points[i + 1]), color,
+                 stroke_size)
+    if closed:
+        cv2.line(im, tuple(points[0]), tuple(points[-1]), color, stroke_size)
+
+
+def enlarged_bbox(bbox, img_width, img_height, enlarge_ratio=0.2):
+    '''
+    :param bbox: [xmin,ymin,xmax,ymax]
+    :return: bbox: [xmin,ymin,xmax,ymax]
+    '''
+
+    left = bbox[0]
+    top = bbox[1]
+
+    right = bbox[2]
+    bottom = bbox[3]
+
+    roi_width = right - left
+    roi_height = bottom - top
+
+    new_left = left - int(roi_width * enlarge_ratio)
+    new_left = 0 if new_left < 0 else new_left
+
+    new_top = top - int(roi_height * enlarge_ratio)
+    new_top = 0 if new_top < 0 else new_top
+
+    new_right = right + int(roi_width * enlarge_ratio)
+    new_right = img_width if new_right > img_width else new_right
+
+    new_bottom = bottom + int(roi_height * enlarge_ratio)
+    new_bottom = img_height if new_bottom > img_height else new_bottom
+
+    bbox = [new_left, new_top, new_right, new_bottom]
+
+    bbox = [int(x) for x in bbox]
+
+    return bbox
+
+
+class FaceInfo:
+
+    def __init__(self):
+        self.rect = np.asarray([0, 0, 0, 0])
+        self.points_array = np.zeros((106, 2))
+        self.eye_left = np.zeros((22, 2))
+        self.eye_right = np.zeros((22, 2))
+        self.eyebrow_left = np.zeros((13, 2))
+        self.eyebrow_right = np.zeros((13, 2))
+        self.lips = np.zeros((64, 2))
+
+
+class LargeModelInfer:
+
+    def __init__(self, ckpt, device='cuda'):
+        self.large_base_lmks_model = LargeBaseLmkInfer.model_preload(
+            ckpt,
+            device.lower() == 'cuda')
+        self.device = device.lower()
+        self.detector = Model(max_size=512, device=device)
+        detector_ckpt_name = 'retinaface_resnet50_2020-07-20_old_torch.pth'
+        state_dict = torch.load(
+            os.path.join(os.path.dirname(ckpt), detector_ckpt_name),
+            map_location='cpu')
+        self.detector.load_state_dict(state_dict)
+        self.detector.eval()
+
+    def infer(self, img_bgr):
+        landmarks = []
+
+        rgb_image = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
+        results = self.detector.predict_jsons(rgb_image)
+
+        boxes = []
+        for anno in results:
+            if anno['score'] == -1:
+                break
+            boxes.append({
+                'x1': anno['bbox'][0],
+                'y1': anno['bbox'][1],
+                'x2': anno['bbox'][2],
+                'y2': anno['bbox'][3]
+            })
+
+        for detect_result in boxes:
+            x1 = detect_result['x1']
+            y1 = detect_result['y1']
+            x2 = detect_result['x2']
+            y2 = detect_result['y2']
+
+            w = x2 - x1 + 1
+            h = y2 - y1 + 1
+
+            cx = (x2 + x1) / 2
+            cy = (y2 + y1) / 2
+
+            sz = max(h, w) * ENLARGE_RATIO
+
+            x1 = cx - sz / 2
+            y1 = cy - sz / 2
+            trans_x1 = x1
+            trans_y1 = y1
+            x2 = x1 + sz
+            y2 = y1 + sz
+
+            height, width, _ = rgb_image.shape
+            dx = max(0, -x1)
+            dy = max(0, -y1)
+            x1 = max(0, x1)
+            y1 = max(0, y1)
+
+            edx = max(0, x2 - width)
+            edy = max(0, y2 - height)
+            x2 = min(width, x2)
+            y2 = min(height, y2)
+
+            crop_img = rgb_image[int(y1):int(y2), int(x1):int(x2)]
+            if dx > 0 or dy > 0 or edx > 0 or edy > 0:
+                crop_img = cv2.copyMakeBorder(
+                    crop_img,
+                    int(dy),
+                    int(edy),
+                    int(dx),
+                    int(edx),
+                    cv2.BORDER_CONSTANT,
+                    value=(103.94, 116.78, 123.68))
+            crop_img = cv2.resize(crop_img, (INPUT_SIZE, INPUT_SIZE))
+
+            base_lmks = LargeBaseLmkInfer.infer_img(crop_img,
+                                                    self.large_base_lmks_model,
+                                                    self.device == 'cuda')
+
+            inv_scale = sz / INPUT_SIZE
+
+            affine_base_lmks = np.zeros((106, 2))
+            for idx in range(106):
+                affine_base_lmks[idx][
+                    0] = base_lmks[0][idx * 2 + 0] * inv_scale + trans_x1
+                affine_base_lmks[idx][
+                    1] = base_lmks[0][idx * 2 + 1] * inv_scale + trans_y1
+
+            x1 = np.min(affine_base_lmks[:, 0])
+            y1 = np.min(affine_base_lmks[:, 1])
+            x2 = np.max(affine_base_lmks[:, 0])
+            y2 = np.max(affine_base_lmks[:, 1])
+
+            w = x2 - x1 + 1
+            h = y2 - y1 + 1
+
+            cx = (x2 + x1) / 2
+            cy = (y2 + y1) / 2
+
+            sz = max(h, w) * ENLARGE_RATIO
+
+            x1 = cx - sz / 2
+            y1 = cy - sz / 2
+            trans_x1 = x1
+            trans_y1 = y1
+            x2 = x1 + sz
+            y2 = y1 + sz
+
+            height, width, _ = rgb_image.shape
+            dx = max(0, -x1)
+            dy = max(0, -y1)
+            x1 = max(0, x1)
+            y1 = max(0, y1)
+
+            edx = max(0, x2 - width)
+            edy = max(0, y2 - height)
+            x2 = min(width, x2)
+            y2 = min(height, y2)
+
+            crop_img = rgb_image[int(y1):int(y2), int(x1):int(x2)]
+            if dx > 0 or dy > 0 or edx > 0 or edy > 0:
+                crop_img = cv2.copyMakeBorder(
+                    crop_img,
+                    int(dy),
+                    int(edy),
+                    int(dx),
+                    int(edx),
+                    cv2.BORDER_CONSTANT,
+                    value=(103.94, 116.78, 123.68))
+            crop_img = cv2.resize(crop_img, (INPUT_SIZE, INPUT_SIZE))
+
+            base_lmks = LargeBaseLmkInfer.infer_img(
+                crop_img, self.large_base_lmks_model,
+                self.device.lower() == 'cuda')
+
+            inv_scale = sz / INPUT_SIZE
+
+            affine_base_lmks = np.zeros((106, 2))
+            for idx in range(106):
+                affine_base_lmks[idx][
+                    0] = base_lmks[0][idx * 2 + 0] * inv_scale + trans_x1
+                affine_base_lmks[idx][
+                    1] = base_lmks[0][idx * 2 + 1] * inv_scale + trans_y1
+
+            landmarks.append(affine_base_lmks)
+
+        return boxes, landmarks
+
+    def find_face_contour(self, image):
+
+        boxes, landmarks = self.infer(image)
+        landmarks = np.array(landmarks)
+
+        args = [[0, 33, False], [33, 38, False], [42, 47, False],
+                [51, 55, False], [57, 64, False], [66, 74, True],
+                [75, 83, True], [84, 96, True]]
+
+        roi_bboxs = []
+
+        for i in range(len(boxes)):
+            roi_bbox = enlarged_bbox([
+                boxes[i]['x1'], boxes[i]['y1'], boxes[i]['x2'], boxes[i]['y2']
+            ], image.shape[1], image.shape[0], 0.5)
+            roi_bbox = [int(x) for x in roi_bbox]
+            roi_bboxs.append(roi_bbox)
+
+        people_maps = []
+
+        for i in range(landmarks.shape[0]):
+            landmark = landmarks[i, :, :]
+            maps = []
+            whole_mask = np.zeros((image.shape[0], image.shape[1]), np.uint8)
+
+            roi_box = roi_bboxs[i]
+            roi_box_width = roi_box[2] - roi_box[0]
+            roi_box_height = roi_box[3] - roi_box[1]
+            short_side_length = roi_box_width if roi_box_width < roi_box_height else roi_box_height
+
+            line_width = short_side_length // 10
+
+            if line_width == 0:
+                line_width = 1
+
+            kernel_size = line_width * 2
+            gaussian_kernel = kernel_size if kernel_size % 2 == 1 else kernel_size + 1
+
+            for t, arg in enumerate(args):
+                mask = np.zeros((image.shape[0], image.shape[1]), np.uint8)
+                draw_line(mask, landmark[arg[0]:arg[1]], (255, 255, 255),
+                          line_width, arg[2])
+                mask = cv2.GaussianBlur(mask,
+                                        (gaussian_kernel, gaussian_kernel), 0)
+                if t >= 1:
+                    draw_line(whole_mask, landmark[arg[0]:arg[1]],
+                              (255, 255, 255), line_width * 2, arg[2])
+                maps.append(mask)
+            whole_mask = cv2.GaussianBlur(whole_mask,
+                                          (gaussian_kernel, gaussian_kernel),
+                                          0)
+            maps.append(whole_mask)
+            people_maps.append(maps)
+
+        return people_maps[0], boxes
+
+    def face2contour(self, image, stack_mode='column'):
+        '''
+
+        :param facer:
+        :param image:
+        :param stack_mode:
+        :return: final_maps: [map0, map1,....]
+                 roi_bboxs: [bbox0, bbox1, ...]
+        '''
+
+        boxes, landmarks = self.infer(image)
+        landmarks = np.array(landmarks)
+
+        args = [[0, 33, False], [33, 38, False], [42, 47, False],
+                [51, 55, False], [57, 64, False], [66, 74, True],
+                [75, 83, True], [84, 96, True]]
+
+        roi_bboxs = []
+
+        for i in range(len(boxes)):
+            roi_bbox = enlarged_bbox([
+                boxes[i]['x1'], boxes[i]['y1'], boxes[i]['x2'], boxes[i]['y2']
+            ], image.shape[1], image.shape[0], 0.5)
+            roi_bbox = [int(x) for x in roi_bbox]
+            roi_bboxs.append(roi_bbox)
+
+        people_maps = []
+
+        for i in range(landmarks.shape[0]):
+            landmark = landmarks[i, :, :]
+            maps = []
+            whole_mask = np.zeros((image.shape[0], image.shape[1]), np.uint8)
+
+            roi_box = roi_bboxs[i]
+            roi_box_width = roi_box[2] - roi_box[0]
+            roi_box_height = roi_box[3] - roi_box[1]
+            short_side_length = roi_box_width if roi_box_width < roi_box_height else roi_box_height
+
+            line_width = short_side_length // 50
+
+            if line_width == 0:
+                line_width = 1
+
+            kernel_size = line_width * 4
+            gaussian_kernel = kernel_size if kernel_size % 2 == 1 else kernel_size + 1
+
+            for arg in args:
+                mask = np.zeros((image.shape[0], image.shape[1]), np.uint8)
+                draw_line(mask, landmark[arg[0]:arg[1]], (255, 255, 255),
+                          line_width, arg[2])
+                mask = cv2.GaussianBlur(mask,
+                                        (gaussian_kernel, gaussian_kernel), 0)
+                draw_line(whole_mask, landmark[arg[0]:arg[1]], (255, 255, 255),
+                          line_width, arg[2])
+                maps.append(mask)
+            whole_mask = cv2.GaussianBlur(whole_mask,
+                                          (gaussian_kernel, gaussian_kernel),
+                                          0)
+            maps.append(whole_mask)
+            people_maps.append(maps)
+
+        if stack_mode == 'depth':
+            final_maps = []
+            for i, maps in enumerate(people_maps):
+                final_map = np.dstack(maps)
+                final_map = final_map[roi_bboxs[i][1]:roi_bboxs[i][3],
+                                      roi_bboxs[i][0]:roi_bboxs[i][2], :]
+                final_maps.append(final_map)
+            return final_maps, roi_bboxs
+
+        elif stack_mode == 'column':
+            final_maps = []
+            for i, maps in enumerate(people_maps):
+                joint_maps = [
+                    x[roi_bboxs[i][1]:roi_bboxs[i][3],
+                      roi_bboxs[i][0]:roi_bboxs[i][2]] for x in maps
+                ]
+                final_map = np.column_stack(joint_maps)
+                final_maps.append(final_map)
+            return final_maps, roi_bboxs
+
+    def fat_face(self, img, degree=0.1):
+
+        _img, scale = resize_on_long_side(img, 800)
+
+        contour_maps, boxes = self.find_face_contour(_img)
+
+        contour_map = contour_maps[0]
+
+        boxes = boxes[0]
+
+        Flow = np.zeros(
+            shape=(contour_map.shape[0], contour_map.shape[1], 2),
+            dtype=np.float32)
+
+        box_center = [(boxes['x1'] + boxes['x2']) / 2,
+                      (boxes['y1'] + boxes['y2']) / 2]
+
+        box_length = max(
+            abs(boxes['y1'] - boxes['y2']), abs(boxes['x1'] - boxes['x2']))
+
+        value_1 = 2 * (Flow.shape[0] - box_center[1] - 1)
+        value_2 = 2 * (Flow.shape[1] - box_center[0] - 1)
+        value_list = [
+            box_length * 2, 2 * (box_center[0] - 1), 2 * (box_center[1] - 1),
+            value_1, value_2
+        ]
+        flow_box_length = min(value_list)
+        flow_box_length = int(flow_box_length)
+
+        sf = spread_flow(100, flow_box_length * degree)
+        sf = cv2.resize(sf, (flow_box_length, flow_box_length))
+
+        Flow[int(box_center[1]
+                 - flow_box_length / 2):int(box_center[1]
+                                            + flow_box_length / 2),
+             int(box_center[0]
+                 - flow_box_length / 2):int(box_center[0]
+                                            + flow_box_length / 2)] = sf
+
+        Flow = Flow * np.dstack((contour_map, contour_map)) / 255.0
+
+        inter_face_maps = contour_maps[-1]
+
+        Flow = Flow * (1.0 - np.dstack(
+            (inter_face_maps, inter_face_maps)) / 255.0)
+
+        Flow = cv2.resize(Flow, (img.shape[1], img.shape[0]))
+
+        Flow = Flow / scale
+
+        pred, top_bound, bottom_bound, left_bound, right_bound = image_warp_grid1(
+            Flow[..., 0], Flow[..., 1], img, 1.0, [0, 0, 0, 0])
+
+        return pred
diff --git a/modelscope/models/cv/face_reconstruction/models/facelandmark/nets/__init__.py b/modelscope/models/cv/face_reconstruction/models/facelandmark/nets/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/face_reconstruction/models/facelandmark/nets/large_base_lmks_net.py b/modelscope/models/cv/face_reconstruction/models/facelandmark/nets/large_base_lmks_net.py
new file mode 100644
index 00000000..f81fea1b
--- /dev/null
+++ b/modelscope/models/cv/face_reconstruction/models/facelandmark/nets/large_base_lmks_net.py
@@ -0,0 +1,201 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+INPUT_SIZE = 224
+
+
+def constant_init(module, val, bias=0):
+    if hasattr(module, 'weight') and module.weight is not None:
+        nn.init.constant_(module.weight, val)
+    if hasattr(module, 'bias') and module.bias is not None:
+        nn.init.constant_(module.bias, bias)
+
+
+def kaiming_init(module,
+                 a=0,
+                 mode='fan_out',
+                 nonlinearity='relu',
+                 bias=0,
+                 distribution='normal'):
+    assert distribution in ['uniform', 'normal']
+    if distribution == 'uniform':
+        nn.init.kaiming_uniform_(
+            module.weight, a=a, mode=mode, nonlinearity=nonlinearity)
+    else:
+        nn.init.kaiming_normal_(
+            module.weight, a=a, mode=mode, nonlinearity=nonlinearity)
+    if hasattr(module, 'bias') and module.bias is not None:
+        nn.init.constant_(module.bias, bias)
+
+
+def conv_bn(inp, oup, kernel, stride, padding=1):
+    return nn.Sequential(
+        nn.Conv2d(inp, oup, kernel, stride, padding, bias=False),
+        nn.BatchNorm2d(oup), nn.PReLU(oup))
+
+
+def conv_1x1_bn(inp, oup):
+    return nn.Sequential(
+        nn.Conv2d(inp, oup, 1, 1, 0, bias=False), nn.BatchNorm2d(oup),
+        nn.ReLU(inplace=True))
+
+
+class InvertedResidual(nn.Module):
+
+    def __init__(self,
+                 inp,
+                 oup,
+                 stride,
+                 padding,
+                 use_res_connect,
+                 expand_ratio=6):
+
+        super(InvertedResidual, self).__init__()
+
+        self.stride = stride
+        assert stride in [1, 2]
+
+        self.use_res_connect = use_res_connect
+        hid_channels = inp * expand_ratio
+        self.conv = nn.Sequential(
+            nn.Conv2d(inp, hid_channels, 1, 1, 0, bias=False),
+            nn.BatchNorm2d(hid_channels),
+            nn.PReLU(hid_channels),
+            nn.Conv2d(
+                hid_channels,
+                hid_channels,
+                3,
+                stride,
+                padding,
+                groups=hid_channels,
+                bias=False),
+            nn.BatchNorm2d(hid_channels),
+            nn.PReLU(hid_channels),
+            nn.Conv2d(hid_channels, oup, 1, 1, 0, bias=False),
+            nn.BatchNorm2d(oup),
+        )
+
+    def forward(self, x):
+        if self.use_res_connect:
+            return x + self.conv(x)
+        else:
+            return self.conv(x)
+
+
+class SoftArgmax(nn.Module):
+
+    def __init__(self, beta: int = 1, infer=False):
+        if not 0.0 <= beta:
+            raise ValueError(f'Invalid beta: {beta}')
+        super().__init__()
+        self.beta = beta
+        self.infer = infer
+
+    def forward(self, heatmap: torch.Tensor) -> torch.Tensor:
+        heatmap = heatmap.mul(self.beta)
+        batch_size, num_channel, height, width = heatmap.size()
+        device: str = heatmap.device
+
+        if not self.infer:
+            softmax: torch.Tensor = F.softmax(
+                heatmap.view(batch_size, num_channel, height * width),
+                dim=2).view(batch_size, num_channel, height, width)
+
+            xx, yy = torch.meshgrid(list(map(torch.arange, [width, height])))
+
+            approx_x = (
+                softmax.mul(xx.float().to(device)).view(
+                    batch_size, num_channel,
+                    height * width).sum(2).unsqueeze(2))
+            approx_y = (
+                softmax.mul(yy.float().to(device)).view(
+                    batch_size, num_channel,
+                    height * width).sum(2).unsqueeze(2))
+
+            output = [approx_x / width, approx_y / height]
+            output = torch.cat(output, 2)
+            output = output.view(-1, output.size(1) * output.size(2))
+            return output
+        else:
+            softmax: torch.Tensor = F.softmax(
+                heatmap.view(batch_size, num_channel, height * width), dim=2)
+
+            return softmax
+
+
+class LargeBaseLmksNet(nn.Module):
+
+    def __init__(self, er=1.0, infer=False):
+
+        super(LargeBaseLmksNet, self).__init__()
+
+        self.infer = infer
+
+        self.block1 = conv_bn(3, int(64 * er), 3, 2, 1)
+        self.block2 = InvertedResidual(
+            int(64 * er), int(64 * er), 1, 1, False, 2)
+
+        self.block3 = InvertedResidual(
+            int(64 * er), int(64 * er), 2, 1, False, 2)
+        self.block4 = InvertedResidual(
+            int(64 * er), int(64 * er), 1, 1, True, 2)
+        self.block5 = InvertedResidual(
+            int(64 * er), int(64 * er), 1, 1, True, 2)
+        self.block6 = InvertedResidual(
+            int(64 * er), int(64 * er), 1, 1, True, 2)
+        self.block7 = InvertedResidual(
+            int(64 * er), int(64 * er), 1, 1, True, 2)
+
+        self.block8 = InvertedResidual(
+            int(64 * er), int(128 * er), 2, 1, False, 2)
+
+        self.block9 = InvertedResidual(
+            int(128 * er), int(128 * er), 1, 1, False, 4)
+        self.block10 = InvertedResidual(
+            int(128 * er), int(128 * er), 1, 1, True, 4)
+        self.block11 = InvertedResidual(
+            int(128 * er), int(128 * er), 1, 1, True, 4)
+        self.block12 = InvertedResidual(
+            int(128 * er), int(128 * er), 1, 1, True, 4)
+        self.block13 = InvertedResidual(
+            int(128 * er), int(128 * er), 1, 1, True, 4)
+        self.block14 = InvertedResidual(
+            int(128 * er), int(128 * er), 1, 1, True, 4)
+
+        self.block15 = InvertedResidual(
+            int(128 * er), int(128 * er), 1, 1, False, 2)  # [128, 14, 14]
+        self.block16 = InvertedResidual(
+            int(128 * er), int(128 * er), 2, 1, False, 2)
+        self.block17 = InvertedResidual(
+            int(128 * er), int(128 * er), 1, 1, False, 2)
+
+        self.block18 = conv_bn(int(128 * er), int(256 * er), 3, 1, 1)
+        self.block19 = nn.Conv2d(int(256 * er), 106, 3, 1, 1, bias=False)
+        self.softargmax = SoftArgmax(infer=infer)
+
+    def forward(self, x):  # x: 3, 224, 224
+
+        x = self.block1(x)
+        x = self.block2(x)
+        x = self.block3(x)
+        x = self.block4(x)
+        x = self.block5(x)
+        x = self.block6(x)
+        x = self.block7(x)
+        x = self.block8(x)
+        x = self.block9(x)
+        x = self.block10(x)
+        x = self.block11(x)
+        x = self.block12(x)
+        x = self.block13(x)
+        x = self.block14(x)
+        x = self.block15(x)
+        x = self.block16(x)
+        x = self.block17(x)
+        x = self.block18(x)
+        x = self.block19(x)
+        x = self.softargmax(x)
+
+        return x
diff --git a/modelscope/models/cv/face_reconstruction/models/facelandmark/nets/large_eyeball_net.py b/modelscope/models/cv/face_reconstruction/models/facelandmark/nets/large_eyeball_net.py
new file mode 100644
index 00000000..97d36b2c
--- /dev/null
+++ b/modelscope/models/cv/face_reconstruction/models/facelandmark/nets/large_eyeball_net.py
@@ -0,0 +1,160 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import torch.nn as nn
+
+FACE_PART_SIZE = 56
+
+
+class InvertedResidual(nn.Module):
+
+    def __init__(self,
+                 inp,
+                 oup,
+                 kernel_size,
+                 stride,
+                 padding,
+                 expand_ratio=2,
+                 use_connect=False,
+                 activation='relu'):
+        super(InvertedResidual, self).__init__()
+
+        hid_channels = int(inp * expand_ratio)
+        if activation == 'relu':
+            self.conv = nn.Sequential(
+                nn.Conv2d(inp, hid_channels, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(hid_channels), nn.ReLU(inplace=True),
+                nn.Conv2d(
+                    hid_channels,
+                    hid_channels,
+                    kernel_size,
+                    stride,
+                    padding,
+                    groups=hid_channels,
+                    bias=False), nn.BatchNorm2d(hid_channels),
+                nn.ReLU(inplace=True),
+                nn.Conv2d(hid_channels, oup, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(oup))
+        elif activation == 'prelu':
+            self.conv = nn.Sequential(
+                nn.Conv2d(inp, hid_channels, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(hid_channels), nn.PReLU(hid_channels),
+                nn.Conv2d(
+                    hid_channels,
+                    hid_channels,
+                    kernel_size,
+                    stride,
+                    padding,
+                    groups=hid_channels,
+                    bias=False), nn.BatchNorm2d(hid_channels),
+                nn.PReLU(hid_channels),
+                nn.Conv2d(hid_channels, oup, 1, 1, 0, bias=False),
+                nn.BatchNorm2d(oup))
+        self.use_connect = use_connect
+
+    def forward(self, x):
+        if self.use_connect:
+            return x + self.conv(x)
+        else:
+            return self.conv(x)
+
+
+class Residual(nn.Module):
+
+    def __init__(self,
+                 inp,
+                 oup,
+                 kernel_size,
+                 stride,
+                 padding,
+                 use_connect=False,
+                 activation='relu'):
+        super(Residual, self).__init__()
+
+        self.use_connect = use_connect
+
+        if activation == 'relu':
+            self.conv = nn.Sequential(
+                nn.Conv2d(
+                    inp,
+                    inp,
+                    kernel_size,
+                    stride,
+                    padding,
+                    groups=inp,
+                    bias=False), nn.BatchNorm2d(inp), nn.ReLU(inplace=True),
+                nn.Conv2d(inp, oup, 1, 1, 0, bias=False), nn.BatchNorm2d(oup),
+                nn.ReLU(inplace=True))
+        elif activation == 'prelu':
+            self.conv = nn.Sequential(
+                nn.Conv2d(
+                    inp,
+                    inp,
+                    kernel_size,
+                    stride,
+                    padding,
+                    groups=inp,
+                    bias=False), nn.BatchNorm2d(inp), nn.PReLU(inp),
+                nn.Conv2d(inp, oup, 1, 1, 0, bias=False), nn.BatchNorm2d(oup),
+                nn.PReLU(oup))
+
+    def forward(self, x):
+        if self.use_connect:
+            return x + self.conv(x)
+        else:
+            return self.conv(x)
+
+
+def conv_bn(inp, oup, kernel, stride, padding=1):
+    return nn.Sequential(
+        nn.Conv2d(inp, oup, kernel, stride, padding, bias=False),
+        nn.BatchNorm2d(oup), nn.PReLU(oup))
+
+
+def conv_no_relu(inp, oup, kernel, stride, padding=1):
+    return nn.Sequential(
+        nn.Conv2d(inp, oup, kernel, stride, padding, bias=False),
+        nn.BatchNorm2d(oup))
+
+
+class View(nn.Module):
+
+    def __init__(self, shape):
+        super(View, self).__init__()
+        self.shape = shape
+
+    def forward(self, x):
+        return x.view(*self.shape)
+
+
+class Softmax(nn.Module):
+
+    def __init__(self, dim):
+        super(Softmax, self).__init__()
+        self.softmax = nn.Softmax(dim)
+
+    def forward(self, x):
+        return self.softmax(x)
+
+
+class LargeEyeballNet(nn.Module):
+
+    def __init__(self):
+        super(LargeEyeballNet, self).__init__()
+
+        # v6/v7/v9
+        # iris : -1*2, 3, FACE_PART_SIZE, FACE_PART_SIZE
+        self.net = nn.Sequential(
+            conv_bn(3, 16, 3, 2, 0),
+            InvertedResidual(16, 16, 3, 1, 1, 2, True, activation='prelu'),
+            InvertedResidual(16, 32, 3, 2, 0, 2, False, activation='prelu'),
+            InvertedResidual(32, 32, 3, 1, 1, 2, True, activation='prelu'),
+            InvertedResidual(32, 64, 3, 2, 1, 2, False, activation='prelu'),
+            InvertedResidual(64, 64, 3, 1, 1, 2, True, activation='prelu'),
+            InvertedResidual(64, 64, 3, 2, 0, 2, False, activation='prelu'),
+            InvertedResidual(64, 64, 3, 1, 1, 2, True, activation='prelu'),
+            View((-1, 64 * 3 * 3, 1, 1)), conv_bn(64 * 3 * 3, 64, 1, 1, 0),
+            conv_no_relu(64, 40, 1, 1, 0), View((-1, 40)))
+
+    def forward(self, x):  # x: -1, 3, FACE_PART_SIZE, FACE_PART_SIZE
+        iris = self.net(x)
+
+        return iris
diff --git a/modelscope/models/cv/face_reconstruction/models/facerecon_model.py b/modelscope/models/cv/face_reconstruction/models/facerecon_model.py
new file mode 100644
index 00000000..c94cf1f9
--- /dev/null
+++ b/modelscope/models/cv/face_reconstruction/models/facerecon_model.py
@@ -0,0 +1,564 @@
+# Part of the implementation is borrowed and modified from Deep3DFaceRecon_pytorch,
+# publicly available at https://github.com/sicxu/Deep3DFaceRecon_pytorch
+import os
+
+import cv2
+import numpy as np
+import torch
+
+from modelscope.models import MODELS, TorchModel
+from modelscope.models.cv.face_reconstruction.models import opt
+from .. import utils
+from . import networks
+from .bfm import ParametricFaceModel
+from .losses import (CLIPLoss_relative, TVLoss, TVLoss_std, landmark_loss,
+                     perceptual_loss, photo_loss, points_loss_horizontal,
+                     reflectance_loss, reg_loss)
+from .nv_diffrast import MeshRenderer
+
+
+@MODELS.register_module('face-reconstruction', 'face_reconstruction')
+class FaceReconModel(TorchModel):
+
+    def __init__(self,
+                 model_dir,
+                 w_color=1.92,
+                 w_exp=0.8,
+                 w_gamma=10.0,
+                 w_id=1.0,
+                 w_lm=0.0016,
+                 w_reg=0.0003,
+                 w_tex=0.017,
+                 *args,
+                 **kwargs):
+        """The FaceReconModel is implemented based on Deep3DFaceRecon_pytorch, publicly available at
+        https://github.com/sicxu/Deep3DFaceRecon_pytorch
+
+        Args:
+            model_dir: the root directory of the model files
+            w_color: the weight of color loss
+            w_exp: the regularization weight of expression
+            w_gamma: the regularization weight of lighting
+            w_id: the regularization weight of identity
+            w_lm: the weight of landmark loss
+            w_reg: the weight of regularization loss
+            w_tex: the regularization weight of texture
+        """
+        super().__init__(model_dir, *args, **kwargs)
+
+        opt.bfm_folder = os.path.join(model_dir, 'assets')
+        self.opt = opt
+        self.w_color = w_color
+        self.w_exp = w_exp
+        self.w_gamma = w_gamma
+        self.w_id = w_id
+        self.w_lm = w_lm
+        self.w_reg = w_reg
+        self.w_tex = w_tex
+        self.device = torch.device('cpu')
+        self.isTrain = opt.isTrain
+        self.visual_names = ['output_vis']
+        self.model_names = ['net_recon']
+        self.parallel_names = self.model_names + ['renderer']
+
+        self.net_recon = networks.define_net_recon(
+            net_recon=opt.net_recon,
+            use_last_fc=opt.use_last_fc,
+            init_path=None)
+
+        self.facemodel = ParametricFaceModel(
+            bfm_folder=opt.bfm_folder,
+            camera_distance=opt.camera_d,
+            focal=opt.focal,
+            center=opt.center,
+            is_train=self.isTrain,
+            default_name=opt.bfm_model)
+
+        self.facemodel_front = ParametricFaceModel(
+            bfm_folder=opt.bfm_folder,
+            camera_distance=opt.camera_d,
+            focal=opt.focal,
+            center=opt.center,
+            is_train=self.isTrain,
+            default_name='face_model_for_maas.mat')
+
+        fov = 2 * np.arctan(opt.center / opt.focal) * 180 / np.pi
+        self.renderer = MeshRenderer(
+            rasterize_fov=fov,
+            znear=opt.z_near,
+            zfar=opt.z_far,
+            rasterize_size=int(2 * opt.center))
+
+        self.renderer_fitting = MeshRenderer(
+            rasterize_fov=fov,
+            znear=opt.z_near,
+            zfar=opt.z_far,
+            rasterize_size=int(2 * opt.center))
+
+        self.nonlinear_UVs = self.facemodel.nonlinear_UVs
+        self.nonlinear_UVs = torch.from_numpy(self.nonlinear_UVs).to(
+            torch.device('cuda'))
+
+        template_obj_path = os.path.join(opt.bfm_folder, 'template_mesh.obj')
+        self.template_mesh = utils.read_obj(template_obj_path)
+
+        self.input_imgs = []
+        self.input_img_hds = []
+        self.input_fat_img_hds = []
+        self.atten_masks = []
+        self.gt_lms = []
+        self.gt_lm_hds = []
+        self.trans_ms = []
+        self.img_names = []
+        self.face_masks = []
+        self.head_masks = []
+        self.input_imgs_coeff = []
+        self.gt_lms_coeff = []
+
+        self.loss_names = [
+            'all', 'feat', 'color', 'lm', 'reg', 'gamma', 'reflc'
+        ]
+
+        # loss func name: (compute_%s_loss) % loss_name
+        self.compute_feat_loss = perceptual_loss
+        self.comupte_color_loss = photo_loss
+        self.compute_lm_loss = landmark_loss
+        self.compute_reg_loss = reg_loss
+        self.compute_reflc_loss = reflectance_loss
+
+    def load_networks(self, load_path):
+        state_dict = torch.load(load_path, map_location=self.device)
+        print('loading the model from %s' % load_path)
+
+        for name in self.model_names:
+            if isinstance(name, str):
+                net = getattr(self, name)
+                if isinstance(net, torch.nn.DataParallel):
+                    net = net.module
+                net.load_state_dict(state_dict[name], strict=False)
+
+        if self.opt.phase != 'test':
+            if self.opt.continue_train:
+
+                try:
+                    for i, sched in enumerate(self.schedulers):
+                        sched.load_state_dict(state_dict['sched_%02d' % i])
+                except Exception as e:
+                    print(e)
+                    for i, sched in enumerate(self.schedulers):
+                        sched.last_epoch = self.opt.epoch_count - 1
+
+    def setup(self, checkpoint_path):
+        """Load and print networks; create schedulers
+
+        Parameters:
+            opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions
+        """
+        self.load_networks(checkpoint_path)
+
+    def parallelize(self, convert_sync_batchnorm=True):
+        if not self.opt.use_ddp:
+            for name in self.parallel_names:
+                if isinstance(name, str):
+                    module = getattr(self, name)
+                    setattr(self, name, module.to(self.device))
+        else:
+            for name in self.model_names:
+                if isinstance(name, str):
+                    module = getattr(self, name)
+                    if convert_sync_batchnorm:
+                        module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(
+                            module)
+                    setattr(
+                        self, name,
+                        torch.nn.parallel.DistributedDataParallel(
+                            module.to(self.device),
+                            device_ids=[self.device.index],
+                            find_unused_parameters=True,
+                            broadcast_buffers=True))
+
+            # DistributedDataParallel is not needed when a module doesn't have any parameter that requires a gradient.
+            for name in self.parallel_names:
+                if isinstance(name, str) and name not in self.model_names:
+                    module = getattr(self, name)
+                    setattr(self, name, module.to(self.device))
+
+        # put state_dict of optimizer to gpu device
+        if self.opt.phase != 'test':
+            if self.opt.continue_train:
+                for optim in self.optimizers:
+                    for state in optim.state.values():
+                        for k, v in state.items():
+                            if isinstance(v, torch.Tensor):
+                                state[k] = v.to(self.device)
+
+    def eval(self):
+        """Make models eval mode"""
+        for name in self.model_names:
+            if isinstance(name, str):
+                net = getattr(self, name)
+                net.eval()
+
+    def set_render(self, image_res):
+        fov = 2 * np.arctan(self.opt.center / self.opt.focal) * 180 / np.pi
+        if image_res is None:
+            image_res = int(2 * self.opt.center)
+
+        self.renderer = MeshRenderer(
+            rasterize_fov=fov,
+            znear=self.opt.z_near,
+            zfar=self.opt.z_far,
+            rasterize_size=image_res)
+
+    def set_input(self, input):
+        """Unpack input data from the dataloader and perform necessary pre-processing steps.
+
+        Parameters:
+            input: a dictionary that contains the data itself and its metadata information.
+        """
+        self.input_img = input['imgs'].to(self.device)
+        self.input_img_hd = input['imgs_hd'].to(
+            self.device) if 'imgs_hd' in input else None
+
+        if 'imgs_fat_hd' not in input or input['imgs_fat_hd'] is None:
+            self.input_fat_img_hd = self.input_img_hd
+        else:
+            self.input_fat_img_hd = input['imgs_fat_hd'].to(self.device)
+
+        self.atten_mask = input['msks'].to(
+            self.device) if 'msks' in input else None
+        self.gt_lm = input['lms'].to(self.device) if 'lms' in input else None
+        self.gt_lm_hd = input['lms_hd'].to(
+            self.device) if 'lms_hd' in input else None
+        self.trans_m = input['M'].to(self.device) if 'M' in input else None
+        self.image_paths = input['im_paths'] if 'im_paths' in input else None
+        self.img_name = input['img_name'] if 'img_name' in input else None
+        self.face_mask = input['face_mask'].to(
+            self.device) if 'face_mask' in input else None
+        self.head_mask = input['head_mask'].to(
+            self.device) if 'head_mask' in input else None
+        self.gt_normals = input['normals'].to(
+            self.device) if 'normals' in input else None
+        self.input_img_coeff = input['imgs_coeff'].to(
+            self.device) if 'imgs_coeff' in input else None
+        self.gt_lm_coeff = input['lms_coeff'].to(
+            self.device) if 'lms_coeff' in input else None
+
+    def get_edge_points_horizontal(self):
+        left_points = []
+        right_points = []
+        for i in range(self.face_mask.shape[2]):
+            inds = torch.where(self.face_mask[0, 0, i, :] > 0.5)  # 0.9
+            if len(inds[0]) > 0:  # i > 112 and len(inds[0]) > 0
+                left_points.append(int(inds[0][0]) + 1)
+                right_points.append(int(inds[0][-1]))
+            else:
+                left_points.append(0)
+                right_points.append(self.face_mask.shape[3] - 1)
+        self.left_points = torch.tensor(left_points).long().to(self.device)
+        self.right_points = torch.tensor(right_points).long().to(self.device)
+
+    def get_edge_points_vertical(self):
+        top_points = []
+        bottom_points = []
+        for i in range(self.face_mask.shape[3]):
+            inds = torch.where(self.face_mask[0, 0, :, i] > 0.9)
+            if len(inds[0]) > 0:
+                top_points.append(int(inds[0][0]))
+                bottom_points.append(int(inds[0][-1]))
+            else:
+                top_points.append(0)
+                bottom_points.append(self.face_mask.shape[2] - 1)
+        self.top_points = torch.tensor(top_points).long().to(self.device)
+        self.bottom_points = torch.tensor(bottom_points).long().to(self.device)
+
+    def blur_shape_offset_uv(self, global_blur=False, blur_size=3):
+        if self.edge_mask is not None:
+            shape_offset_uv_blur = self.shape_offset_uv[0].detach().cpu(
+            ).numpy()
+            shape_offset_uv_blur = cv2.blur(shape_offset_uv_blur, (15, 15))
+            shape_offset_uv_blur = torch.from_numpy(
+                shape_offset_uv_blur).float().to(self.device)[None, ...]
+            value_1 = shape_offset_uv_blur * self.edge_mask[None, ..., None]
+            value_2 = self.shape_offset_uv * (
+                1 - self.edge_mask[None, ..., None])
+            self.shape_offset_uv = value_1 + value_2
+
+        self.shape_offset_uv = self.shape_offset_uv * self.fusion_mask[None,
+                                                                       ...,
+                                                                       None]
+
+        if global_blur and blur_size > 0:
+            shape_offset_uv_blur = self.shape_offset_uv[0].detach().cpu(
+            ).numpy()
+            shape_offset_uv_blur = cv2.blur(shape_offset_uv_blur,
+                                            (blur_size, blur_size))
+            shape_offset_uv_blur = torch.from_numpy(
+                shape_offset_uv_blur).float().to(self.device)[None, ...]
+            self.shape_offset_uv = shape_offset_uv_blur
+
+    def get_fusion_mask(self):
+
+        h, w = self.shape_offset_uv.shape[1:3]
+        self.fusion_mask = torch.zeros((h, w)).to(self.device).float()
+        UVs_coords = self.nonlinear_UVs.clone()[:35709]
+        UVs_coords[:, 0] *= w
+        UVs_coords[:, 1] *= h
+        UVs_coords_int = torch.floor(UVs_coords)
+        UVs_coords_int = UVs_coords_int.long()
+
+        self.fusion_mask[h - 1 - UVs_coords_int[:, 1], UVs_coords_int[:,
+                                                                      0]] = 1
+
+        # blur mask
+        self.fusion_mask = self.fusion_mask.cpu().numpy()
+        new_kernel1 = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
+        new_kernel2 = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (8, 8))
+        self.fusion_mask = cv2.dilate(self.fusion_mask, new_kernel1, 1)
+        self.fusion_mask = cv2.erode(self.fusion_mask, new_kernel2, 1)
+        self.fusion_mask = cv2.blur(self.fusion_mask, (17, 17))
+        self.fusion_mask = torch.from_numpy(self.fusion_mask).float().to(
+            self.device)
+
+    def get_edge_mask(self):
+
+        h, w = self.shape_offset_uv.shape[1:3]
+        self.edge_mask = torch.zeros((h, w)).to(self.device).float()
+        UVs_coords = self.nonlinear_UVs.clone()[self.edge_points_inds]
+        UVs_coords[:, 0] *= w
+        UVs_coords[:, 1] *= h
+        UVs_coords_int = torch.floor(UVs_coords)
+        UVs_coords_int = UVs_coords_int.long()
+
+        self.edge_mask[h - 1 - UVs_coords_int[:, 1], UVs_coords_int[:, 0]] = 1
+
+        # blur mask
+        self.edge_mask = self.edge_mask.cpu().numpy()
+        new_kernel1 = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (8, 8))
+        self.edge_mask = cv2.dilate(self.edge_mask, new_kernel1, 1)
+        self.edge_mask = cv2.blur(self.edge_mask, (5, 5))
+        self.edge_mask = torch.from_numpy(self.edge_mask).float().to(
+            self.device)
+
+    def fitting_nonlinear(self, coeff, debug=False, n_iters=100, out_dir=None):
+        output_coeff = coeff.detach().clone()
+
+        output_coeff = self.facemodel_front.split_coeff(output_coeff)
+        output_coeff['id'].requires_grad = True
+        output_coeff['exp'].requires_grad = True
+        output_coeff['tex'].requires_grad = True
+        output_coeff['angle'].requires_grad = True
+        output_coeff['gamma'].requires_grad = True
+        output_coeff['trans'].requires_grad = True
+
+        self.shape_offset_uv = torch.zeros(
+            (1, 300, 300, 3),
+            dtype=torch.float32).to(self.device)  # (1, 180, 256, 3)
+        self.shape_offset_uv.requires_grad = True
+
+        self.texture_offset_uv = torch.zeros(
+            (1, 300, 300, 3),
+            dtype=torch.float32).to(self.device)  # (1, 180, 256, 3)
+        self.texture_offset_uv.requires_grad = True
+
+        value_list = [
+            self.shape_offset_uv, self.texture_offset_uv, output_coeff['id'],
+            output_coeff['exp'], output_coeff['tex'], output_coeff['angle'],
+            output_coeff['gamma'], output_coeff['trans']
+        ]
+        optim = torch.optim.Adam(value_list, lr=1e-3)
+
+        self.get_edge_points_horizontal()
+        self.get_edge_points_vertical()
+
+        self.cur_iter = 0
+        for i in range(n_iters):  # 500
+            self.pred_vertex, _, self.pred_color, self.pred_lm, _, face_shape_offset, self.verts_proj = \
+                self.facemodel_front.compute_for_render_train_nonlinear(output_coeff, self.shape_offset_uv,
+                                                                        self.texture_offset_uv,
+                                                                        self.nonlinear_UVs[:35709, ...])
+            self.pred_mask, _, self.pred_face, self.occ = self.renderer_fitting(
+                self.pred_vertex,
+                self.facemodel_front.face_buf,
+                feat=self.pred_color)
+
+            self.pred_coeffs_dict = self.facemodel_front.split_coeff(
+                output_coeff)
+            self.compute_losses_fitting()
+            if debug and i % 10 == 0:
+                print('{}: total loss: {:.6f}'.format(i, self.loss_all.item()))
+
+            optim.zero_grad()
+            self.loss_all.backward()
+            optim.step()
+
+            self.cur_iter += 1
+
+        output_coeff = self.facemodel_front.merge_coeff(output_coeff)
+
+        self.get_edge_mask()
+        self.get_fusion_mask()
+        self.blur_shape_offset_uv()
+
+        self.pred_vertex, _, self.pred_color, self.pred_lm, _, face_shape_offset, self.verts_proj = \
+            self.facemodel_front.compute_for_render_train_nonlinear(output_coeff, self.shape_offset_uv,
+                                                                    self.texture_offset_uv,
+                                                                    self.nonlinear_UVs[:35709, ...])
+
+        if out_dir is not None:
+            input_img_numpy = 255. * (self.input_img).detach().cpu().permute(
+                0, 2, 3, 1).numpy()
+            input_img_numpy = np.squeeze(input_img_numpy)
+
+            output_vis = self.pred_face
+            output_vis_numpy_raw = 255. * output_vis.detach().cpu().permute(
+                0, 2, 3, 1).numpy()
+            output_vis_numpy_raw = np.squeeze(output_vis_numpy_raw)
+
+            output_vis_numpy = np.concatenate(
+                (input_img_numpy, output_vis_numpy_raw), axis=-2)
+
+            output_vis = np.squeeze(output_vis_numpy)
+            output_vis = output_vis[..., ::-1]  # rgb->bgr
+            output_face_mask = self.pred_mask.detach().cpu().permute(
+                0, 2, 3, 1).squeeze().numpy() * 255.0
+            output_vis = np.column_stack(
+                (output_vis, cv2.cvtColor(output_face_mask,
+                                          cv2.COLOR_GRAY2BGR)))
+            output_input_vis = output_vis[:, :224]
+            output_pred_vis = output_vis[:, 224:448]
+            output_mask_vis = output_vis[:, 448:]
+
+            face_mask_vis = 255. * self.face_mask.detach().cpu()[0, 0].numpy()
+
+            shape_offset_vis = self.shape_offset_uv.detach().cpu().numpy()[0]
+            shape_offset_vis = (shape_offset_vis - shape_offset_vis.min()) / (
+                shape_offset_vis.max() - shape_offset_vis.min()) * 255.0
+
+            cv2.imwrite(
+                os.path.join(out_dir, 'fitting_01_input.jpg'),
+                output_input_vis)
+            cv2.imwrite(
+                os.path.join(out_dir, 'fitting_02_pred.jpg'), output_pred_vis)
+            cv2.imwrite(
+                os.path.join(out_dir, 'fitting_03_mask.jpg'), output_mask_vis)
+            cv2.imwrite(
+                os.path.join(out_dir, 'fitting_04_facemask.jpg'),
+                face_mask_vis)
+            cv2.imwrite(
+                os.path.join(out_dir, 'fitting_05_shape_offset.jpg'),
+                shape_offset_vis)
+
+        recon_shape_offset = face_shape_offset
+        recon_shape_offset[..., -1] = 10 - recon_shape_offset[
+            ..., -1]  # from camera space to world space
+        recon_shape_offset = recon_shape_offset.detach().cpu().numpy()[0]
+
+        tri = self.facemodel_front.face_buf.cpu().numpy()
+        pred_color = self.pred_color.detach().cpu().numpy()[0].clip(0, 1)
+
+        output = {
+            'coeffs': output_coeff,
+            'face_vertices': recon_shape_offset,
+            'face_faces': tri + 1,
+            'face_colors': pred_color
+        }
+        return output
+
+    def forward(self, out_dir=None):
+        self.facemodel.to(self.device)
+        self.facemodel_front.to(self.device)
+        with torch.no_grad():
+
+            output_coeff = self.net_recon(self.input_img)
+
+        with torch.enable_grad():
+            output = self.fitting_nonlinear(
+                output_coeff, debug=True, out_dir=out_dir)
+
+        output_coeff = output['coeffs']
+        output_coeff = self.facemodel.split_coeff(output_coeff)
+        eye_coeffs = output_coeff['exp'][0, 16] + output_coeff['exp'][
+            0, 17] + output_coeff['exp'][0, 19]
+        if eye_coeffs > 1.0:
+            degree = 0.5
+        else:
+            degree = 1.0
+        output_coeff['exp'][0, 16] += 1 * degree
+        output_coeff['exp'][0, 17] += 1 * degree
+        output_coeff['exp'][0, 19] += 1.5 * degree
+        output_coeff = self.facemodel.merge_coeff(output_coeff)
+
+        self.pred_vertex, face_shape_ori, head_shape = \
+            self.facemodel.compute_for_render_nonlinear_full(output_coeff, self.shape_offset_uv.detach(),
+                                                             self.nonlinear_UVs, nose_coeff=0.1)
+
+        UVs_tensor = torch.tensor(
+            self.template_mesh['uvs'],
+            dtype=torch.float32)[None, ...].to(self.pred_vertex.device)
+        target_img = self.input_fat_img_hd.permute(0, 2, 3, 1)
+        with torch.enable_grad():
+            _, _, _, texture_map, _ = self.renderer.pred_shape_and_texture(
+                self.pred_vertex, self.facemodel.face_buf, UVs_tensor,
+                target_img)
+
+        recon_shape = head_shape
+        recon_shape[
+            ...,
+            -1] = 10 - recon_shape[..., -1]  # from camera space to world space
+        recon_shape = recon_shape.cpu().numpy()[0]
+        tri = self.facemodel.face_buf.cpu().numpy()
+        normals = utils.estimate_normals(recon_shape, tri)
+
+        output['head_vertices'] = recon_shape
+        output['head_faces'] = tri + 1
+        output['head_tex_map'] = texture_map
+        output['head_UVs'] = self.template_mesh['uvs']
+        output['head_faces_uv'] = self.template_mesh['faces_uv']
+        output['head_normals'] = normals
+
+        return output
+
+    def compute_losses_fitting(self):
+        face_mask = self.pred_mask
+
+        face_mask = face_mask.detach()
+        self.loss_color = self.w_color * self.comupte_color_loss(
+            self.pred_face, self.input_img, face_mask)  # 1.0
+
+        self.loss_color_nose = torch.tensor(0.0).float().to(self.device)
+
+        loss_reg, loss_gamma = self.compute_reg_loss(self.pred_coeffs_dict,
+                                                     self.w_id, self.w_exp,
+                                                     self.w_tex)
+        self.loss_reg = self.w_reg * loss_reg  # 1.0
+        self.loss_gamma = self.w_gamma * loss_gamma  # 1.0
+
+        self.loss_lm = self.w_lm * self.compute_lm_loss(
+            self.pred_lm, self.gt_lm) * 0.1  # 0.1
+
+        self.loss_smooth_offset = TVLoss()(self.shape_offset_uv.permute(
+            0, 3, 1, 2)) * 10000  # 10000
+
+        self.loss_reg_offset = torch.tensor(0.0).float().to(self.device)
+
+        self.loss_reg_textureOff = torch.mean(
+            torch.abs(self.texture_offset_uv)) * 10  # 10
+
+        self.loss_smooth_offset_std = TVLoss_std()(
+            self.shape_offset_uv.permute(0, 3, 1, 2)) * 50000  # 50000
+
+        self.loss_points_horizontal, self.edge_points_inds = points_loss_horizontal(
+            self.verts_proj, self.left_points, self.right_points)  # 20
+        self.loss_points_horizontal *= 20
+        self.loss_points_horizontal_jaw = torch.tensor(0.0).float().to(
+            self.device)
+        self.loss_points_vertical = torch.tensor(0.0).float().to(self.device)
+        self.loss_normals = torch.tensor(0.0).float().to(self.device)
+
+        self.loss_all = self.loss_color + self.loss_lm + self.loss_reg + self.loss_gamma + self.loss_smooth_offset
+        self.loss_all += self.loss_reg_offset + self.loss_smooth_offset_std + self.loss_points_horizontal
+        self.loss_all += self.loss_points_vertical + self.loss_reg_textureOff
+        self.loss_all += self.loss_color_nose + self.loss_normals + self.loss_points_horizontal_jaw
+
+        self.loss_mask = torch.tensor(0.0).float().to(self.device)
diff --git a/modelscope/models/cv/face_reconstruction/models/losses.py b/modelscope/models/cv/face_reconstruction/models/losses.py
new file mode 100644
index 00000000..7a73d61e
--- /dev/null
+++ b/modelscope/models/cv/face_reconstruction/models/losses.py
@@ -0,0 +1,413 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import clip
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from kornia.geometry import warp_affine
+
+
+def resize_n_crop(image, M, dsize=112):
+    # image: (b, c, h, w)
+    # M   :  (b, 2, 3)
+    return warp_affine(image, M, dsize=(dsize, dsize))
+
+
+class CLIPLoss(torch.nn.Module):
+
+    def __init__(self):
+        super(CLIPLoss, self).__init__()
+        self.model, self.preprocess = clip.load('ViT-B/32', device='cuda')
+
+    def forward(self, image, text):
+        similarity = 1 - self.model(image, text)[0] / 100
+        return similarity
+
+
+class CLIPLoss_relative(torch.nn.Module):
+
+    def __init__(self):
+        super(CLIPLoss_relative, self).__init__()
+        self.model, self.preprocess = clip.load('ViT-B/32', device='cuda')
+
+    def forward(self, image, text, image_ori, text_ori):
+
+        image_features = self.model.encode_image(image)
+        text_features = self.model.encode_text(text)
+
+        # normalized features
+        image_features = image_features / image_features.norm(
+            dim=1, keepdim=True)
+        text_features = text_features / text_features.norm(dim=1, keepdim=True)
+
+        image_features_ori = self.model.encode_image(image_ori)
+        text_features_ori = self.model.encode_text(text_ori)
+
+        # normalized features
+        image_features_ori = image_features_ori / image_features_ori.norm(
+            dim=1, keepdim=True)
+        text_features_ori = text_features_ori / text_features_ori.norm(
+            dim=1, keepdim=True)
+
+        delta_image = image_features - image_features_ori
+        delta_text = text_features - text_features_ori
+
+        loss = 1 - torch.sum(delta_image * delta_text) / (
+            torch.norm(delta_image) * torch.norm(delta_text))
+
+        return loss
+
+
+# perceptual level loss
+class PerceptualLoss(nn.Module):
+
+    def __init__(self, recog_net, input_size=112):
+        super(PerceptualLoss, self).__init__()
+        self.recog_net = recog_net
+        self.preprocess = lambda x: 2 * x - 1
+        self.input_size = input_size
+
+    def forward(self, imageA, imageB, M):
+        """
+        1 - cosine distance
+        Parameters:
+            imageA       --torch.tensor (B, 3, H, W), range (0, 1) , RGB order
+            imageB       --same as imageA
+        """
+
+        imageA = self.preprocess(resize_n_crop(imageA, M, self.input_size))
+        imageB = self.preprocess(resize_n_crop(imageB, M, self.input_size))
+
+        # freeze bn
+        self.recog_net.eval()
+
+        id_featureA = F.normalize(self.recog_net(imageA), dim=-1, p=2)
+        id_featureB = F.normalize(self.recog_net(imageB), dim=-1, p=2)
+        cosine_d = torch.sum(id_featureA * id_featureB, dim=-1)
+        return torch.sum(1 - cosine_d) / cosine_d.shape[0]
+
+
+def perceptual_loss(id_featureA, id_featureB):
+    cosine_d = torch.sum(id_featureA * id_featureB, dim=-1)
+    return torch.sum(1 - cosine_d) / cosine_d.shape[0]
+
+
+# image level loss
+def photo_loss(imageA, imageB, mask, eps=1e-6):
+    """
+    l2 norm (with sqrt, to ensure backward stabililty, use eps, otherwise Nan may occur)
+    Parameters:
+        imageA       --torch.tensor (B, 3, H, W), range (0, 1), RGB order
+        imageB       --same as imageA
+    """
+    loss = torch.sqrt(eps + torch.sum(
+        (imageA - imageB)**2, dim=1, keepdims=True)) * mask
+    loss = torch.sum(loss) / torch.max(
+        torch.sum(mask),
+        torch.tensor(1.0).to(mask.device))
+    return loss
+
+
+def landmark_loss(predict_lm, gt_lm, weight=None):
+    """
+    weighted mse loss
+    Parameters:
+        predict_lm    --torch.tensor (B, 68, 2)
+        gt_lm         --torch.tensor (B, 68, 2)
+        weight        --numpy.array (1, 68)
+    """
+    if not weight:
+        weight = np.ones([68])
+        weight[28:31] = 20
+        weight[-8:] = 20
+        weight = np.expand_dims(weight, 0)
+        weight = torch.tensor(weight).to(predict_lm.device)
+    loss = torch.sum((predict_lm - gt_lm)**2, dim=-1) * weight
+    loss = torch.sum(loss) / (predict_lm.shape[0] * predict_lm.shape[1])
+    return loss
+
+
+# regulization
+def reg_loss(coeffs_dict, w_id=1, w_exp=1, w_tex=1):
+    """
+    l2 norm without the sqrt, from yu's implementation (mse)
+    tf.nn.l2_loss https://www.tensorflow.org/api_docs/python/tf/nn/l2_loss
+    Parameters:
+        coeffs_dict     -- a  dict of torch.tensors , keys: id, exp, tex, angle, gamma, trans
+
+    """
+    # coefficient regularization to ensure plausible 3d faces
+    value_1 = w_id * torch.sum(coeffs_dict['id']**2)
+    value_2 = w_exp * torch.sum(coeffs_dict['exp']**2)
+    value_3 = w_tex * torch.sum(coeffs_dict['tex']**2)
+    creg_loss = value_1 + value_2 + value_3
+    creg_loss = creg_loss / coeffs_dict['id'].shape[0]
+
+    # gamma regularization to ensure a nearly-monochromatic light
+    gamma = coeffs_dict['gamma'].reshape([-1, 3, 9])
+    gamma_mean = torch.mean(gamma, dim=1, keepdims=True)
+    gamma_loss = torch.mean((gamma - gamma_mean)**2)
+
+    return creg_loss, gamma_loss
+
+
+def reflectance_loss(texture, mask):
+    """
+    minimize texture variance (mse), albedo regularization to ensure an uniform skin albedo
+    Parameters:
+        texture       --torch.tensor, (B, N, 3)
+        mask          --torch.tensor, (N), 1 or 0
+
+    """
+    mask = mask.reshape([1, mask.shape[0], 1])
+    texture_mean = torch.sum(
+        mask * texture, dim=1, keepdims=True) / torch.sum(mask)
+    loss = torch.sum(((texture - texture_mean) * mask)**2) / (
+        texture.shape[0] * torch.sum(mask))
+    return loss
+
+
+def lm_3d_loss(pred_lm_3d, gt_lm_3d, mask):
+    loss = torch.abs(pred_lm_3d - gt_lm_3d)[mask, :]
+    loss = torch.mean(loss)
+    return loss
+
+
+class TVLoss(nn.Module):
+
+    def __init__(self, TVLoss_weight=1):
+        super(TVLoss, self).__init__()
+        self.TVLoss_weight = TVLoss_weight
+
+    def forward(self, x):
+        batch_size = x.size()[0]
+        h_x = x.size()[2]
+        w_x = x.size()[3]
+        count_h = self._tensor_size(x[:, :, 1:, :])
+        count_w = self._tensor_size(x[:, :, :, 1:])
+        h_tv = torch.pow((x[:, :, 1:, :] - x[:, :, :h_x - 1, :]), 2).sum()
+        w_tv = torch.pow((x[:, :, :, 1:] - x[:, :, :, :w_x - 1]), 2).sum()
+        return self.TVLoss_weight * 2 * (h_tv / count_h
+                                         + w_tv / count_w) / batch_size
+
+    def _tensor_size(self, t):
+        return t.size()[1] * t.size()[2] * t.size()[3]
+
+
+class TVLoss_std(nn.Module):
+
+    def __init__(self, TVLoss_weight=1):
+        super(TVLoss_std, self).__init__()
+        self.TVLoss_weight = TVLoss_weight
+
+    def forward(self, x):
+        batch_size = x.size()[0]
+        h_x = x.size()[2]
+        w_x = x.size()[3]
+        h_tv = torch.pow((x[:, :, 1:, :] - x[:, :, :h_x - 1, :]), 2)
+        h_tv = ((h_tv - torch.mean(h_tv))**2).sum()
+        w_tv = torch.pow((x[:, :, :, 1:] - x[:, :, :, :w_x - 1]), 2)
+        w_tv = ((w_tv - torch.mean(w_tv))**2).sum()
+        return self.TVLoss_weight * 2 * (h_tv + w_tv) / batch_size
+
+    def _tensor_size(self, t):
+        return t.size()[1] * t.size()[2] * t.size()[3]
+
+
+def photo_loss_sum(imageA, imageB, mask, eps=1e-6):
+    """
+    l2 norm (with sqrt, to ensure backward stabililty, use eps, otherwise Nan may occur)
+    Parameters:
+        imageA       --torch.tensor (B, 3, H, W), range (0, 1), RGB order
+        imageB       --same as imageA
+    """
+    loss = torch.sqrt(eps + torch.sum(
+        (imageA - imageB)**2, dim=1, keepdims=True)) * mask
+    loss = torch.sum(loss) / (
+        imageA.shape[0] * imageA.shape[2] * imageA.shape[3])
+    return loss
+
+
+def points_loss_horizontal(verts, left_points, right_points, width=224):
+    verts_int = torch.ceil(verts[0]).long().clamp(0, width - 1)  # (n, 2)
+    verts_left = left_points[width - 1 - verts_int[:, 1]].float()
+    verts_right = right_points[width - 1 - verts_int[:, 1]].float()
+    verts_x = verts[0, :, 0]
+    dist = (verts_left - verts_x) / width * (verts_right - verts_x) / width
+    dist /= torch.max(
+        torch.abs((verts_left - verts_x) / width),
+        torch.abs((verts_right - verts_x) / width))
+    edge_inds = torch.where(dist > 0)[0]
+    dist += 0.01
+    dist = torch.nn.functional.relu(dist).clone()
+    dist -= 0.01
+    dist = torch.abs(dist)
+    loss = torch.mean(dist)
+    return loss, edge_inds
+
+
+class LaplacianLoss(nn.Module):
+
+    def __init__(self):
+        super(LaplacianLoss, self).__init__()
+
+    def forward(self, x):
+        batch_size, slice_num = x.size()[:2]
+        z_x = x.size()[2]
+        h_x = x.size()[3]
+        w_x = x.size()[4]
+        count_z = self._tensor_size(x[:, :, 1:, :, :])
+        count_h = self._tensor_size(x[:, :, :, 1:, :])
+        count_w = self._tensor_size(x[:, :, :, :, 1:])
+        z_tv = torch.pow((x[:, :, 1:, :, :] - x[:, :, :z_x - 1, :, :]),
+                         2).sum()
+        h_tv = torch.pow((x[:, :, :, 1:, :] - x[:, :, :, :h_x - 1, :]),
+                         2).sum()
+        w_tv = torch.pow((x[:, :, :, :, 1:] - x[:, :, :, :, :w_x - 1]),
+                         2).sum()
+        return 2 * (z_tv / count_z + h_tv / count_h + w_tv / count_w) / (
+            batch_size * slice_num)
+
+    def _tensor_size(self, t):
+        return t.size()[2] * t.size()[3] * t.size()[4]
+
+
+class LaplacianLoss_L1(nn.Module):
+
+    def __init__(self):
+        super(LaplacianLoss_L1, self).__init__()
+
+    def forward(self, x):
+        batch_size, slice_num = x.size()[:2]
+        z_x = x.size()[2]
+        h_x = x.size()[3]
+        w_x = x.size()[4]
+        count_z = self._tensor_size(x[:, :, 1:, :, :])
+        count_h = self._tensor_size(x[:, :, :, 1:, :])
+        count_w = self._tensor_size(x[:, :, :, :, 1:])
+        z_tv = torch.abs((x[:, :, 1:, :, :] - x[:, :, :z_x - 1, :, :])).sum()
+        h_tv = torch.abs((x[:, :, :, 1:, :] - x[:, :, :, :h_x - 1, :])).sum()
+        w_tv = torch.abs((x[:, :, :, :, 1:] - x[:, :, :, :, :w_x - 1])).sum()
+        return 2 * (z_tv / count_z + h_tv / count_h + w_tv / count_w) / (
+            batch_size * slice_num)
+
+    def _tensor_size(self, t):
+        return t.size()[2] * t.size()[3] * t.size()[4]
+
+
+class GANLoss(nn.Module):
+
+    def __init__(self,
+                 gan_mode,
+                 target_real_label=1.0,
+                 target_fake_label=0.0,
+                 tensor=torch.FloatTensor):
+        super(GANLoss, self).__init__()
+        self.real_label = target_real_label
+        self.fake_label = target_fake_label
+        self.real_label_tensor = None
+        self.fake_label_tensor = None
+        self.zero_tensor = None
+        self.Tensor = tensor
+        self.gan_mode = gan_mode
+        if gan_mode == 'ls':
+            pass
+        elif gan_mode == 'original':
+            pass
+        elif gan_mode == 'w':
+            pass
+        elif gan_mode == 'hinge':
+            pass
+        else:
+            raise ValueError('Unexpected gan_mode {}'.format(gan_mode))
+
+    def get_target_tensor(self, input, target_is_real):
+        if target_is_real:
+            if self.real_label_tensor is None:
+                self.real_label_tensor = self.Tensor(1).fill_(self.real_label)
+                self.real_label_tensor.requires_grad_(False)
+            return self.real_label_tensor.expand_as(input)
+        else:
+            if self.fake_label_tensor is None:
+                self.fake_label_tensor = self.Tensor(1).fill_(self.fake_label)
+                self.fake_label_tensor.requires_grad_(False)
+            return self.fake_label_tensor.expand_as(input)
+
+    def get_zero_tensor(self, input):
+        if self.zero_tensor is None:
+            self.zero_tensor = self.Tensor(1).fill_(0)
+            self.zero_tensor.requires_grad_(False)
+        return self.zero_tensor.expand_as(input)
+
+    def loss(self, input, target_is_real, for_discriminator=True):
+        if self.gan_mode == 'original':  # cross entropy loss
+            target_tensor = self.get_target_tensor(input, target_is_real)
+            loss = F.binary_cross_entropy_with_logits(input, target_tensor)
+            return loss
+        elif self.gan_mode == 'ls':
+            target_tensor = self.get_target_tensor(input, target_is_real)
+            return F.mse_loss(input, target_tensor)
+        elif self.gan_mode == 'hinge':
+            if for_discriminator:
+                if target_is_real:
+                    minval = torch.min(input - 1, self.get_zero_tensor(input))
+                    loss = -torch.mean(minval)
+                else:
+                    minval = torch.min(-input - 1, self.get_zero_tensor(input))
+                    loss = -torch.mean(minval)
+            else:
+                assert target_is_real, "The generator's hinge loss must be aiming for real"
+                loss = -torch.mean(input)
+            return loss
+        else:
+            # wgan
+            if target_is_real:
+                return -input.mean()
+            else:
+                return input.mean()
+
+    def __call__(self, input, target_is_real, for_discriminator=True):
+        # computing loss is a bit complicated because |input| may not be
+        # a tensor, but list of tensors in case of multiscale discriminator
+        if isinstance(input, list):
+            loss = 0
+            for pred_i in input:
+                if isinstance(pred_i, list):
+                    pred_i = pred_i[-1]
+                loss_tensor = self.loss(pred_i, target_is_real,
+                                        for_discriminator)
+                bs = 1 if len(loss_tensor.size()) == 0 else loss_tensor.size(0)
+                new_loss = torch.mean(loss_tensor.view(bs, -1), dim=1)
+                loss += new_loss
+            return loss / len(input)
+        else:
+            return self.loss(input, target_is_real, for_discriminator)
+
+
+class BinaryDiceLoss(nn.Module):
+
+    def __init__(self, smooth=1, p=1, reduction='mean'):
+        super(BinaryDiceLoss, self).__init__()
+        self.smooth = smooth
+        self.p = p
+        self.reduction = reduction
+
+    def forward(self, predict, target):
+        assert predict.shape[0] == target.shape[
+            0], "predict & target batch size don't match"
+        predict = predict.contiguous().view(predict.shape[0], -1)
+        target = target.contiguous().view(target.shape[0], -1)
+
+        num = torch.sum(torch.mul(predict, target), dim=1)
+        den = torch.sum(predict + target, dim=1)
+
+        loss = 1 - (2 * num + self.smooth) / (den + self.smooth)
+
+        if self.reduction == 'mean':
+            return loss.mean()
+        elif self.reduction == 'sum':
+            return loss.sum()
+        elif self.reduction == 'none':
+            return loss
+        else:
+            raise Exception('Unexpected reduction {}'.format(self.reduction))
diff --git a/modelscope/models/cv/face_reconstruction/models/networks.py b/modelscope/models/cv/face_reconstruction/models/networks.py
new file mode 100644
index 00000000..1eb5770b
--- /dev/null
+++ b/modelscope/models/cv/face_reconstruction/models/networks.py
@@ -0,0 +1,577 @@
+# Part of the implementation is borrowed and modified from Deep3DFaceRecon_pytorch,
+# publicly available at https://github.com/sicxu/Deep3DFaceRecon_pytorch
+import os
+from typing import Any, Callable, List, Optional, Type, Union
+
+import torch
+import torch.nn as nn
+from kornia.geometry import warp_affine
+from torch import Tensor
+from torch.optim import lr_scheduler
+
+try:
+    from torch.hub import load_state_dict_from_url
+except ImportError:
+    from torch.utils.model_zoo import load_url as load_state_dict_from_url
+
+
+def resize_n_crop(image, M, dsize=112):
+    # image: (b, c, h, w)
+    # M   :  (b, 2, 3)
+    return warp_affine(image, M, dsize=(dsize, dsize))
+
+
+def filter_state_dict(state_dict, remove_name='fc'):
+    new_state_dict = {}
+    for key in state_dict:
+        if remove_name in key:
+            continue
+        new_state_dict[key] = state_dict[key]
+    return new_state_dict
+
+
+def define_net_recon(net_recon, use_last_fc=False, init_path=None):
+    return ReconNetWrapper(
+        net_recon, use_last_fc=use_last_fc, init_path=init_path)
+
+
+def define_net_recon2(net_recon, use_last_fc=False, init_path=None):
+    return ReconNetWrapper2(
+        net_recon, use_last_fc=use_last_fc, init_path=init_path)
+
+
+class ReconNetWrapper(nn.Module):
+    fc_dim = 257
+
+    def __init__(self, net_recon, use_last_fc=False, init_path=None):
+        super(ReconNetWrapper, self).__init__()
+        self.use_last_fc = use_last_fc
+        if net_recon not in func_dict:
+            return NotImplementedError('network [%s] is not implemented',
+                                       net_recon)
+        func, last_dim = func_dict[net_recon]
+        backbone = func(use_last_fc=use_last_fc, num_classes=self.fc_dim)
+        if init_path and os.path.isfile(init_path):
+            state_dict = filter_state_dict(
+                torch.load(init_path, map_location='cpu'))
+            backbone.load_state_dict(state_dict)
+            print('loading init net_recon %s from %s' % (net_recon, init_path))
+        self.backbone = backbone
+        if not use_last_fc:
+            self.final_layers = nn.ModuleList([
+                conv1x1(last_dim, 80, bias=True),  # id layer
+                conv1x1(last_dim, 64, bias=True),  # exp layer
+                conv1x1(last_dim, 80, bias=True),  # tex layer
+                conv1x1(last_dim, 3, bias=True),  # angle layer
+                conv1x1(last_dim, 27, bias=True),  # gamma layer
+                conv1x1(last_dim, 2, bias=True),  # tx, ty
+                conv1x1(last_dim, 1, bias=True)  # tz
+            ])
+            for m in self.final_layers:
+                nn.init.constant_(m.weight, 0.)
+                nn.init.constant_(m.bias, 0.)
+
+    def forward(self, x):
+        x = self.backbone(x)
+        if not self.use_last_fc:
+            output = []
+            for layer in self.final_layers:
+                output.append(layer(x))
+            x = torch.flatten(torch.cat(output, dim=1), 1)
+        return x
+
+
+class ReconNetWrapper2(nn.Module):
+    fc_dim = 264
+
+    def __init__(self, net_recon, use_last_fc=False, init_path=None):
+        super(ReconNetWrapper2, self).__init__()
+        self.use_last_fc = use_last_fc
+        if net_recon not in func_dict:
+            return NotImplementedError('network [%s] is not implemented',
+                                       net_recon)
+        func, last_dim = func_dict[net_recon]
+        backbone = func(use_last_fc=use_last_fc, num_classes=self.fc_dim)
+        if init_path and os.path.isfile(init_path):
+            state_dict = filter_state_dict(
+                torch.load(init_path, map_location='cpu'))
+            backbone.load_state_dict(state_dict)
+            print('loading init net_recon %s from %s' % (net_recon, init_path))
+        self.backbone = backbone
+        if not use_last_fc:
+            self.final_layers2 = nn.ModuleList([
+                conv1x1(last_dim, 80, bias=True),  # id layer
+                conv1x1(last_dim, 51, bias=True),  # exp layer
+                conv1x1(last_dim, 100, bias=True),  # tex layer
+                conv1x1(last_dim, 3, bias=True),  # angle layer
+                conv1x1(last_dim, 27, bias=True),  # gamma layer
+                conv1x1(last_dim, 2, bias=True),  # tx, ty
+                conv1x1(last_dim, 1, bias=True)  # tz
+            ])
+            for m in self.final_layers2:
+                nn.init.constant_(m.weight, 0.)
+                nn.init.constant_(m.bias, 0.)
+
+    def forward(self, x):
+        x = self.backbone(x)
+        if not self.use_last_fc:
+            output = []
+            for layer in self.final_layers2:
+                output.append(layer(x))
+            x = torch.flatten(torch.cat(output, dim=1), 1)
+        return x
+
+
+# adapted from https://github.com/pytorch/vision/edit/master/torchvision/models/resnet.py
+__all__ = [
+    'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152',
+    'resnext50_32x4d', 'resnext101_32x8d', 'wide_resnet50_2',
+    'wide_resnet101_2'
+]
+
+model_urls = {
+    'resnet18':
+    'https://download.pytorch.org/models/resnet18-f37072fd.pth',
+    'resnet34':
+    'https://download.pytorch.org/models/resnet34-b627a593.pth',
+    'resnet50':
+    'https://download.pytorch.org/models/resnet50-0676ba61.pth',
+    'resnet101':
+    'https://download.pytorch.org/models/resnet101-63fe2227.pth',
+    'resnet152':
+    'https://download.pytorch.org/models/resnet152-394f9c45.pth',
+    'resnext50_32x4d':
+    'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
+    'resnext101_32x8d':
+    'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
+    'wide_resnet50_2':
+    'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth',
+    'wide_resnet101_2':
+    'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth',
+}
+
+
+def conv3x3(in_planes: int,
+            out_planes: int,
+            stride: int = 1,
+            groups: int = 1,
+            dilation: int = 1) -> nn.Conv2d:
+    """3x3 convolution with padding"""
+    return nn.Conv2d(
+        in_planes,
+        out_planes,
+        kernel_size=3,
+        stride=stride,
+        padding=dilation,
+        groups=groups,
+        bias=False,
+        dilation=dilation)
+
+
+def conv1x1(in_planes: int,
+            out_planes: int,
+            stride: int = 1,
+            bias: bool = False) -> nn.Conv2d:
+    """1x1 convolution"""
+    return nn.Conv2d(
+        in_planes, out_planes, kernel_size=1, stride=stride, bias=bias)
+
+
+class BasicBlock(nn.Module):
+    expansion: int = 1
+
+    def __init__(
+            self,
+            inplanes: int,
+            planes: int,
+            stride: int = 1,
+            downsample: Optional[nn.Module] = None,
+            groups: int = 1,
+            base_width: int = 64,
+            dilation: int = 1,
+            norm_layer: Optional[Callable[..., nn.Module]] = None) -> None:
+        super(BasicBlock, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        if groups != 1 or base_width != 64:
+            raise ValueError(
+                'BasicBlock only supports groups=1 and base_width=64')
+        if dilation > 1:
+            raise NotImplementedError(
+                'Dilation > 1 not supported in BasicBlock')
+        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = norm_layer(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = norm_layer(planes)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x: Tensor) -> Tensor:
+        identity = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu(out)
+
+        return out
+
+
+class Bottleneck(nn.Module):
+    # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
+    # while original implementation places the stride at the first 1x1 convolution(self.conv1)
+    # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
+    # This variant is also known as ResNet V1.5 and improves accuracy according to
+    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
+
+    expansion: int = 4
+
+    def __init__(
+            self,
+            inplanes: int,
+            planes: int,
+            stride: int = 1,
+            downsample: Optional[nn.Module] = None,
+            groups: int = 1,
+            base_width: int = 64,
+            dilation: int = 1,
+            norm_layer: Optional[Callable[..., nn.Module]] = None) -> None:
+        super(Bottleneck, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        width = int(planes * (base_width / 64.)) * groups
+        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
+        self.conv1 = conv1x1(inplanes, width)
+        self.bn1 = norm_layer(width)
+        self.conv2 = conv3x3(width, width, stride, groups, dilation)
+        self.bn2 = norm_layer(width)
+        self.conv3 = conv1x1(width, planes * self.expansion)
+        self.bn3 = norm_layer(planes * self.expansion)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x: Tensor) -> Tensor:
+        identity = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu(out)
+
+        return out
+
+
+class ResNet(nn.Module):
+
+    def __init__(
+            self,
+            block: Type[Union[BasicBlock, Bottleneck]],
+            layers: List[int],
+            num_classes: int = 1000,
+            zero_init_residual: bool = False,
+            use_last_fc: bool = False,
+            groups: int = 1,
+            width_per_group: int = 64,
+            replace_stride_with_dilation: Optional[List[bool]] = None,
+            norm_layer: Optional[Callable[..., nn.Module]] = None) -> None:
+        super(ResNet, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        self._norm_layer = norm_layer
+
+        self.inplanes = 64
+        self.dilation = 1
+        if replace_stride_with_dilation is None:
+            # each element in the tuple indicates if we should replace
+            # the 2x2 stride with a dilated convolution instead
+            replace_stride_with_dilation = [False, False, False]
+        if len(replace_stride_with_dilation) != 3:
+            raise ValueError('replace_stride_with_dilation should be None '
+                             'or a 3-element tuple, got {}'.format(
+                                 replace_stride_with_dilation))
+        self.use_last_fc = use_last_fc
+        self.groups = groups
+        self.base_width = width_per_group
+        self.conv1 = nn.Conv2d(
+            3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
+        self.bn1 = norm_layer(self.inplanes)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(
+            block,
+            128,
+            layers[1],
+            stride=2,
+            dilate=replace_stride_with_dilation[0])
+        self.layer3 = self._make_layer(
+            block,
+            256,
+            layers[2],
+            stride=2,
+            dilate=replace_stride_with_dilation[1])
+        self.layer4 = self._make_layer(
+            block,
+            512,
+            layers[3],
+            stride=2,
+            dilate=replace_stride_with_dilation[2])
+        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+
+        if self.use_last_fc:
+            self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(
+                    m.weight, mode='fan_out', nonlinearity='relu')
+            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+
+        # Zero-initialize the last BN in each residual branch,
+        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
+        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
+        if zero_init_residual:
+            for m in self.modules():
+                if isinstance(m, Bottleneck):
+                    nn.init.constant_(m.bn3.weight,
+                                      0)  # type: ignore[arg-type]
+                elif isinstance(m, BasicBlock):
+                    nn.init.constant_(m.bn2.weight,
+                                      0)  # type: ignore[arg-type]
+
+    def _make_layer(self,
+                    block: Type[Union[BasicBlock, Bottleneck]],
+                    planes: int,
+                    blocks: int,
+                    stride: int = 1,
+                    dilate: bool = False) -> nn.Sequential:
+        norm_layer = self._norm_layer
+        downsample = None
+        previous_dilation = self.dilation
+        if dilate:
+            self.dilation *= stride
+            stride = 1
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                conv1x1(self.inplanes, planes * block.expansion, stride),
+                norm_layer(planes * block.expansion),
+            )
+
+        layers = []
+        layers.append(
+            block(self.inplanes, planes, stride, downsample, self.groups,
+                  self.base_width, previous_dilation, norm_layer))
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(
+                block(
+                    self.inplanes,
+                    planes,
+                    groups=self.groups,
+                    base_width=self.base_width,
+                    dilation=self.dilation,
+                    norm_layer=norm_layer))
+
+        return nn.Sequential(*layers)
+
+    def _forward_impl(self, x: Tensor) -> Tensor:
+        # See note [TorchScript super()]
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+
+        x = self.avgpool(x)
+        if self.use_last_fc:
+            x = torch.flatten(x, 1)
+            x = self.fc(x)
+        return x
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self._forward_impl(x)
+
+
+def _resnet(arch: str, block: Type[Union[BasicBlock,
+                                         Bottleneck]], layers: List[int],
+            pretrained: bool, progress: bool, **kwargs: Any) -> ResNet:
+    model = ResNet(block, layers, **kwargs)
+    if pretrained:
+        state_dict = load_state_dict_from_url(
+            model_urls[arch], progress=progress)
+        model.load_state_dict(state_dict)
+    return model
+
+
+def resnet18(pretrained: bool = False,
+             progress: bool = True,
+             **kwargs: Any) -> ResNet:
+    r"""ResNet-18 model from
+    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
+                   **kwargs)
+
+
+def resnet34(pretrained: bool = False,
+             progress: bool = True,
+             **kwargs: Any) -> ResNet:
+    r"""ResNet-34 model from
+    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress,
+                   **kwargs)
+
+
+def resnet50(pretrained: bool = False,
+             progress: bool = True,
+             **kwargs: Any) -> ResNet:
+    r"""ResNet-50 model from
+    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress,
+                   **kwargs)
+
+
+def resnet101(pretrained: bool = False,
+              progress: bool = True,
+              **kwargs: Any) -> ResNet:
+    r"""ResNet-101 model from
+    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained,
+                   progress, **kwargs)
+
+
+def resnet152(pretrained: bool = False,
+              progress: bool = True,
+              **kwargs: Any) -> ResNet:
+    r"""ResNet-152 model from
+    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained,
+                   progress, **kwargs)
+
+
+def resnext50_32x4d(pretrained: bool = False,
+                    progress: bool = True,
+                    **kwargs: Any) -> ResNet:
+    r"""ResNeXt-50 32x4d model from
+    `"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_.
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    kwargs['groups'] = 32
+    kwargs['width_per_group'] = 4
+    return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], pretrained,
+                   progress, **kwargs)
+
+
+def resnext101_32x8d(pretrained: bool = False,
+                     progress: bool = True,
+                     **kwargs: Any) -> ResNet:
+    r"""ResNeXt-101 32x8d model from
+    `"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_.
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    kwargs['groups'] = 32
+    kwargs['width_per_group'] = 8
+    return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], pretrained,
+                   progress, **kwargs)
+
+
+def wide_resnet50_2(pretrained: bool = False,
+                    progress: bool = True,
+                    **kwargs: Any) -> ResNet:
+    r"""Wide ResNet-50-2 model from
+    `"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_.
+
+    The model is the same as ResNet except for the bottleneck number of channels
+    which is twice larger in every block. The number of channels in outer 1x1
+    convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
+    channels, and in Wide ResNet-50-2 has 2048-1024-2048.
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    kwargs['width_per_group'] = 64 * 2
+    return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], pretrained,
+                   progress, **kwargs)
+
+
+def wide_resnet101_2(pretrained: bool = False,
+                     progress: bool = True,
+                     **kwargs: Any) -> ResNet:
+    r"""Wide ResNet-101-2 model from
+    `"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_.
+
+    The model is the same as ResNet except for the bottleneck number of channels
+    which is twice larger in every block. The number of channels in outer 1x1
+    convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
+    channels, and in Wide ResNet-50-2 has 2048-1024-2048.
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    kwargs['width_per_group'] = 64 * 2
+    return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], pretrained,
+                   progress, **kwargs)
+
+
+func_dict = {'resnet18': (resnet18, 512), 'resnet50': (resnet50, 2048)}
diff --git a/modelscope/models/cv/face_reconstruction/models/nv_diffrast.py b/modelscope/models/cv/face_reconstruction/models/nv_diffrast.py
new file mode 100644
index 00000000..f17246e5
--- /dev/null
+++ b/modelscope/models/cv/face_reconstruction/models/nv_diffrast.py
@@ -0,0 +1,400 @@
+# Part of the implementation is borrowed and modified from Deep3DFaceRecon_pytorch,
+# publicly available at https://github.com/sicxu/Deep3DFaceRecon_pytorch
+import warnings
+from typing import List
+
+import numpy as np
+import nvdiffrast.torch as dr
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from .losses import TVLoss, TVLoss_std
+
+warnings.filterwarnings('ignore')
+
+
+def ndc_projection(x=0.1, n=1.0, f=50.0):
+    return np.array([[n / x, 0, 0, 0], [0, n / -x, 0, 0],
+                     [0, 0, -(f + n) / (f - n), -(2 * f * n) / (f - n)],
+                     [0, 0, -1, 0]]).astype(np.float32)
+
+
+def to_image(face_shape):
+    """
+    Return:
+        face_proj        -- torch.tensor, size (B, N, 2), y direction is opposite to v direction
+
+    Parameters:
+        face_shape       -- torch.tensor, size (B, N, 3)
+    """
+
+    focal = 1015.
+    center = 112.
+    persc_proj = np.array([focal, 0, center, 0, focal, center, 0, 0,
+                           1]).reshape([3, 3]).astype(np.float32).transpose()
+
+    persc_proj = torch.tensor(persc_proj).to(face_shape.device)
+
+    face_proj = face_shape @ persc_proj
+    face_proj = face_proj[..., :2] / face_proj[..., 2:]
+
+    return face_proj
+
+
+class MeshRenderer(nn.Module):
+
+    def __init__(self, rasterize_fov, znear=0.1, zfar=10, rasterize_size=224):
+        super(MeshRenderer, self).__init__()
+
+        x = np.tan(np.deg2rad(rasterize_fov * 0.5)) * znear
+        self.ndc_proj = torch.tensor(ndc_projection(
+            x=x, n=znear,
+            f=zfar)).matmul(torch.diag(torch.tensor([1., -1, -1, 1])))
+        self.rasterize_size = rasterize_size
+        self.glctx = None
+
+    def forward(self, vertex, tri, feat=None):
+        """
+        Return:
+            mask               -- torch.tensor, size (B, 1, H, W)
+            depth              -- torch.tensor, size (B, 1, H, W)
+            features(optional) -- torch.tensor, size (B, C, H, W) if feat is not None
+
+        Parameters:
+            vertex          -- torch.tensor, size (B, N, 3)
+            tri             -- torch.tensor, size (B, M, 3) or (M, 3), triangles
+            feat(optional)  -- torch.tensor, size (B, C), features
+        """
+        device = vertex.device
+        rsize = int(self.rasterize_size)
+        ndc_proj = self.ndc_proj.to(device)
+        verts_proj = to_image(vertex)
+        # trans to homogeneous coordinates of 3d vertices, the direction of y is the same as v
+        if vertex.shape[-1] == 3:
+            vertex = torch.cat(
+                [vertex, torch.ones([*vertex.shape[:2], 1]).to(device)],
+                dim=-1)
+            vertex[..., 1] = -vertex[..., 1]
+
+        vertex_ndc = vertex @ ndc_proj.t()
+        if self.glctx is None:
+            self.glctx = dr.RasterizeCudaContext(device=device)
+
+        ranges = None
+        if isinstance(tri, List) or len(tri.shape) == 3:
+            vum = vertex_ndc.shape[1]
+            fnum = torch.tensor([f.shape[0]
+                                 for f in tri]).unsqueeze(1).to(device)
+
+            print('fnum shape:{}'.format(fnum.shape))
+
+            fstartidx = torch.cumsum(fnum, dim=0) - fnum
+            ranges = torch.cat([fstartidx, fnum],
+                               axis=1).type(torch.int32).cpu()
+            for i in range(tri.shape[0]):
+                tri[i] = tri[i] + i * vum
+            vertex_ndc = torch.cat(vertex_ndc, dim=0)
+            tri = torch.cat(tri, dim=0)
+
+        # for range_mode vetex: [B*N, 4], tri: [B*M, 3], for instance_mode vetex: [B, N, 4], tri: [M, 3]
+        tri = tri.type(torch.int32).contiguous()
+        rast_out, _ = dr.rasterize(
+            self.glctx,
+            vertex_ndc.contiguous(),
+            tri,
+            resolution=[rsize, rsize],
+            ranges=ranges)
+
+        depth, _ = dr.interpolate(
+            vertex.reshape([-1, 4])[..., 2].unsqueeze(1).contiguous(),
+            rast_out, tri)
+        depth = depth.permute(0, 3, 1, 2)
+        mask = (rast_out[..., 3] > 0).float().unsqueeze(1)
+        depth = mask * depth
+
+        image = None
+
+        verts_x = verts_proj[0, :, 0]
+        verts_y = 224 - verts_proj[0, :, 1]
+        verts_int = torch.ceil(verts_proj[0]).long()  # (n, 2)
+        verts_xr_int = verts_int[:, 0].clamp(1, 224 - 1)
+        verts_yt_int = 224 - verts_int[:, 1].clamp(2, 224)
+        verts_right_float = verts_xr_int - verts_x
+        verts_left_float = 1 - verts_right_float
+        verts_top_float = verts_y - verts_yt_int
+        verts_bottom_float = 1 - verts_top_float
+
+        rast_lt = rast_out[0, verts_yt_int, verts_xr_int - 1, 3]
+        rast_lb = rast_out[0, verts_yt_int + 1, verts_xr_int - 1, 3]
+        rast_rt = rast_out[0, verts_yt_int, verts_xr_int, 3]
+        rast_rb = rast_out[0, verts_yt_int + 1, verts_xr_int, 3]
+
+        occ_feat = (rast_lt > 0) * 1.0 * (verts_left_float + verts_top_float) + \
+                   (rast_lb > 0) * 1.0 * (verts_left_float + verts_bottom_float) + \
+                   (rast_rt > 0) * 1.0 * (verts_right_float + verts_top_float) + \
+                   (rast_rb > 0) * 1.0 * (verts_right_float + verts_bottom_float)
+        occ_feat = occ_feat[None, :, None] / 4.0
+
+        occ, _ = dr.interpolate(occ_feat, rast_out, tri)
+        occ = occ.permute(0, 3, 1, 2)
+
+        if feat is not None:
+            image, _ = dr.interpolate(feat, rast_out, tri)
+            image = image.permute(0, 3, 1, 2)
+            image = mask * image
+
+        return mask, depth, image, occ
+
+    def render_uv_texture(self, vertex, tri, uv, uv_texture):
+        """
+        Return:
+            mask               -- torch.tensor, size (B, 1, H, W)
+            depth              -- torch.tensor, size (B, 1, H, W)
+            features(optional) -- torch.tensor, size (B, C, H, W) if feat is not None
+
+        Parameters:
+            vertex          -- torch.tensor, size (B, N, 3)
+            tri             -- torch.tensor, size (M, 3), triangles
+            uv                -- torch.tensor, size (B,N, 2),  uv mapping
+            base_tex   -- torch.tensor, size (B,H,W,C)
+        """
+        device = vertex.device
+        rsize = int(self.rasterize_size)
+        ndc_proj = self.ndc_proj.to(device)
+        # trans to homogeneous coordinates of 3d vertices, the direction of y is the same as v
+        if vertex.shape[-1] == 3:
+            vertex = torch.cat(
+                [vertex, torch.ones([*vertex.shape[:2], 1]).to(device)],
+                dim=-1)
+            vertex[..., 1] = -vertex[..., 1]
+
+        vertex_ndc = vertex @ ndc_proj.t()
+        if self.glctx is None:
+            self.glctx = dr.RasterizeCudaContext(device=device)
+
+        ranges = None
+        if isinstance(tri, List) or len(tri.shape) == 3:
+            vum = vertex_ndc.shape[1]
+            fnum = torch.tensor([f.shape[0]
+                                 for f in tri]).unsqueeze(1).to(device)
+
+            print('fnum shape:{}'.format(fnum.shape))
+
+            fstartidx = torch.cumsum(fnum, dim=0) - fnum
+            ranges = torch.cat([fstartidx, fnum],
+                               axis=1).type(torch.int32).cpu()
+            for i in range(tri.shape[0]):
+                tri[i] = tri[i] + i * vum
+            vertex_ndc = torch.cat(vertex_ndc, dim=0)
+            tri = torch.cat(tri, dim=0)
+
+        # for range_mode vetex: [B*N, 4], tri: [B*M, 3], for instance_mode vetex: [B, N, 4], tri: [M, 3]
+        tri = tri.type(torch.int32).contiguous()
+        rast_out, _ = dr.rasterize(
+            self.glctx,
+            vertex_ndc.contiguous(),
+            tri,
+            resolution=[rsize, rsize],
+            ranges=ranges)
+
+        depth, _ = dr.interpolate(
+            vertex.reshape([-1, 4])[..., 2].unsqueeze(1).contiguous(),
+            rast_out, tri)
+        depth = depth.permute(0, 3, 1, 2)
+        mask = (rast_out[..., 3] > 0).float().unsqueeze(1)
+        depth = mask * depth
+        uv[..., -1] = 1.0 - uv[..., -1]
+
+        rast_out, rast_db = dr.rasterize(
+            self.glctx,
+            vertex_ndc.contiguous(),
+            tri,
+            resolution=[rsize, rsize],
+            ranges=ranges)
+
+        interp_out, uv_da = dr.interpolate(
+            uv, rast_out, tri, rast_db, diff_attrs='all')
+
+        uv_texture = uv_texture.permute(0, 2, 3, 1).contiguous()
+        img = dr.texture(
+            uv_texture, interp_out, filter_mode='linear')  # , uv_da)
+        img = img * torch.clamp(rast_out[..., -1:], 0,
+                                1)  # Mask out background.
+
+        tex_map = uv_texture[0].detach().cpu().numpy()[..., ::-1] * 255.0
+
+        image = img.permute(0, 3, 1, 2)
+
+        return mask, depth, image, tex_map
+
+    def pred_shape_and_texture(self,
+                               vertex,
+                               tri,
+                               uv,
+                               target_img,
+                               base_tex=None):
+        """
+        Return:
+            mask               -- torch.tensor, size (B, 1, H, W)
+            depth              -- torch.tensor, size (B, 1, H, W)
+            features(optional) -- torch.tensor, size (B, C, H, W) if feat is not None
+
+        Parameters:
+            vertex          -- torch.tensor, size (B, N, 3)
+            tri             -- torch.tensor, size (B, M, 3) or (M, 3), triangles
+            uv                -- torch.tensor, size (B,N, 2),  uv mapping
+            base_tex   -- torch.tensor, size (B,H,W,C)
+        """
+        vertex = torch.cat([vertex[:, :35241, :], vertex[:, 37082:, :]],
+                           dim=1)  # BFM front
+        tri = torch.cat([tri[:69732, :], tri[73936:, ]], dim=0)
+        uv = torch.cat([uv[:, :35241, :], uv[:, 37082:, :]], dim=1)
+        tri[69732:, :] = tri[69732:, :] - (37082 - 35241)
+
+        device = vertex.device
+        rsize = int(self.rasterize_size)
+        ndc_proj = self.ndc_proj.to(device)
+        # trans to homogeneous coordinates of 3d vertices, the direction of y is the same as v
+        if vertex.shape[-1] == 3:
+            vertex = torch.cat(
+                [vertex, torch.ones([*vertex.shape[:2], 1]).to(device)],
+                dim=-1)
+            vertex[..., 1] = -vertex[..., 1]
+
+        vertex_ndc = vertex @ ndc_proj.t()
+        if self.glctx is None:
+            self.glctx = dr.RasterizeCudaContext(device=device)
+
+        ranges = None
+        if isinstance(tri, List) or len(tri.shape) == 3:
+            vum = vertex_ndc.shape[1]
+            fnum = torch.tensor([f.shape[0]
+                                 for f in tri]).unsqueeze(1).to(device)
+
+            fstartidx = torch.cumsum(fnum, dim=0) - fnum
+            ranges = torch.cat([fstartidx, fnum],
+                               axis=1).type(torch.int32).cpu()
+            for i in range(tri.shape[0]):
+                tri[i] = tri[i] + i * vum
+            vertex_ndc = torch.cat(vertex_ndc, dim=0)
+            tri = torch.cat(tri, dim=0)
+
+        # for range_mode vetex: [B*N, 4], tri: [B*M, 3], for instance_mode vetex: [B, N, 4], tri: [M, 3]
+        tri = tri.type(torch.int32).contiguous()
+        rast_out, _ = dr.rasterize(
+            self.glctx,
+            vertex_ndc.contiguous(),
+            tri,
+            resolution=[rsize, rsize],
+            ranges=ranges)
+
+        depth, _ = dr.interpolate(
+            vertex.reshape([-1, 4])[..., 2].unsqueeze(1).contiguous(),
+            rast_out, tri)
+        depth = depth.permute(0, 3, 1, 2)
+        mask = (rast_out[..., 3] > 0).float().unsqueeze(1)
+        depth = mask * depth
+        uv[..., -1] = 1.0 - uv[..., -1]
+
+        rast_out, rast_db = dr.rasterize(
+            self.glctx,
+            vertex_ndc.contiguous(),
+            tri,
+            resolution=[rsize, rsize],
+            ranges=ranges)
+
+        interp_out, uv_da = dr.interpolate(
+            uv, rast_out, tri, rast_db, diff_attrs='all')
+
+        mask_3c = mask.permute(0, 2, 3, 1)
+        mask_3c = torch.cat((mask_3c, mask_3c, mask_3c), dim=-1)
+        maskout_img = mask_3c * target_img
+        mean_color = torch.sum(maskout_img, dim=(1, 2))
+        valid_pixel_count = torch.sum(mask)
+
+        mean_color = mean_color / valid_pixel_count
+
+        tex = torch.zeros((1, 128 * 5 // 4, 128, 3), dtype=torch.float32)
+        tex[:, :, :, 0] = mean_color[0, 0]
+        tex[:, :, :, 1] = mean_color[0, 1]
+        tex[:, :, :, 2] = mean_color[0, 2]
+
+        tex = tex.cuda()
+
+        tex_mask = torch.zeros((1, 2048 * 5 // 4, 2048, 3),
+                               dtype=torch.float32)
+        tex_mask[:, :, :, 1] = 1.0
+        tex_mask = tex_mask.cuda()
+        tex_mask.requires_grad = True
+        tex_mask = tex_mask.contiguous()
+
+        criterionTV = TVLoss()
+
+        if base_tex is not None:
+            base_tex = base_tex.cuda()
+
+        for tex_resolution in [64, 128, 256, 512, 1024, 2048]:
+            tex = tex.detach()
+            tex = tex.permute(0, 3, 1, 2)
+            tex = F.interpolate(tex, (tex_resolution * 5 // 4, tex_resolution))
+            tex = tex.permute(0, 2, 3, 1).contiguous()
+
+            if base_tex is not None:
+                _base_tex = base_tex.permute(0, 3, 1, 2)
+                _base_tex = F.interpolate(
+                    _base_tex, (tex_resolution * 5 // 4, tex_resolution))
+                _base_tex = _base_tex.permute(0, 2, 3, 1).contiguous()
+                tex += _base_tex
+
+            tex.requires_grad = True
+            optim = torch.optim.Adam([tex], lr=1e-2)
+
+            texture_opt_iters = 100
+
+            if tex_resolution == 2048:
+                optim_mask = torch.optim.Adam([tex_mask], lr=1e-2)
+
+            for i in range(int(texture_opt_iters)):
+
+                if tex_resolution == 2048:
+                    optim_mask.zero_grad()
+                    rendered = dr.texture(
+                        tex_mask, interp_out, filter_mode='linear')  # , uv_da)
+                    rendered = rendered * torch.clamp(
+                        rast_out[..., -1:], 0, 1)  # Mask out background.
+                    tex_loss = torch.mean((target_img - rendered)**2)
+
+                    tex_loss.backward()
+                    optim_mask.step()
+
+                optim.zero_grad()
+
+                img = dr.texture(
+                    tex, interp_out, filter_mode='linear')  # , uv_da)
+                img = img * torch.clamp(rast_out[..., -1:], 0,
+                                        1)  # Mask out background.
+                recon_loss = torch.mean((target_img - img)**2)
+
+                if tex_resolution < 2048:
+                    tv_loss = criterionTV(tex.permute(0, 3, 1, 2))
+
+                    total_loss = recon_loss + tv_loss * 0.01
+                else:
+
+                    total_loss = recon_loss
+
+                total_loss.backward()
+                optim.step()
+
+        tex_map = tex[0].detach().cpu().numpy()[..., ::-1] * 255.0
+
+        image = img.permute(0, 3, 1, 2)
+
+        tex_mask = tex_mask[0].detach().cpu().numpy() * 255.0
+        tex_mask = np.where(tex_mask[..., 1] > 250, 1.0, 0.0) * np.where(
+            tex_mask[..., 0] < 10, 1.0, 0) * np.where(tex_mask[..., 2] < 10,
+                                                      1.0, 0)
+        tex_mask = 1.0 - tex_mask
+
+        return mask, depth, image, tex_map, tex_mask
diff --git a/modelscope/models/cv/face_reconstruction/models/opt.py b/modelscope/models/cv/face_reconstruction/models/opt.py
new file mode 100644
index 00000000..c979e64e
--- /dev/null
+++ b/modelscope/models/cv/face_reconstruction/models/opt.py
@@ -0,0 +1,13 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+bfm_folder = ''
+bfm_model = 'head_model_for_maas.mat'
+camera_d = 10.0
+center = 112.0
+focal = 1015.0
+isTrain = False
+net_recon = 'resnet50'
+phase = 'test'
+use_ddp = False
+use_last_fc = False
+z_far = 15.0
+z_near = 5.0
diff --git a/modelscope/models/cv/face_reconstruction/utils.py b/modelscope/models/cv/face_reconstruction/utils.py
new file mode 100644
index 00000000..9f2a25ed
--- /dev/null
+++ b/modelscope/models/cv/face_reconstruction/utils.py
@@ -0,0 +1,752 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import argparse
+import math
+import os
+import os.path as osp
+from array import array
+
+import cv2
+import numba
+import numpy as np
+import torch
+import torch.nn.functional as F
+from PIL import Image
+from scipy.io import loadmat, savemat
+
+
+def img_value_rescale(img, old_range: list, new_range: list):
+    assert len(old_range) == 2
+    assert len(new_range) == 2
+    img = (img - old_range[0]) / (old_range[1] - old_range[0]) * (
+        new_range[1] - new_range[0]) + new_range[0]
+    return img
+
+
+def resize_on_long_side(img, long_side=800):
+    src_height = img.shape[0]
+    src_width = img.shape[1]
+
+    if src_height > src_width:
+        scale = long_side * 1.0 / src_height
+        _img = cv2.resize(
+            img, (int(src_width * scale), long_side),
+            interpolation=cv2.INTER_CUBIC)
+
+    else:
+        scale = long_side * 1.0 / src_width
+        _img = cv2.resize(
+            img, (long_side, int(src_height * scale)),
+            interpolation=cv2.INTER_CUBIC)
+
+    return _img, scale
+
+
+def get_mg_layer(src, gt, skin_mask=None):
+    """
+    src, gt shape: [h, w, 3] value: [0, 1]
+    return: mg, shape: [h, w, 1] value: [0, 1]
+    """
+    mg = (src * src - gt + 1e-10) / (2 * src * src - 2 * src + 2e-10)
+    mg[mg < 0] = 0.5
+    mg[mg > 1] = 0.5
+
+    diff_abs = np.abs(gt - src)
+    mg[diff_abs < (1 / 255.0)] = 0.5
+
+    if skin_mask is not None:
+        mg[skin_mask == 0] = 0.5
+
+    return mg
+
+
+def str2bool(v):
+    if isinstance(v, bool):
+        return v
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
+def spread_flow(length, spread_ratio=2):
+    Flow = np.zeros(shape=(length, length, 2), dtype=np.float32)
+    mag = np.zeros(shape=(length, length), dtype=np.float32)
+
+    radius = length * 0.5
+    for h in range(Flow.shape[0]):
+        for w in range(Flow.shape[1]):
+
+            if (h - length // 2)**2 + (w - length // 2)**2 <= radius**2:
+                Flow[h, w, 0] = -(w - length // 2)
+                Flow[h, w, 1] = -(h - length // 2)
+
+                distance = np.sqrt((w - length // 2)**2 + (h - length // 2)**2)
+
+                if distance <= radius / 2.0:
+                    mag[h, w] = 2.0 / radius * distance
+                else:
+                    mag[h, w] = -2.0 / radius * distance + 2.0
+
+    _, ang = cv2.cartToPolar(Flow[..., 0] + 1e-8, Flow[..., 1] + 1e-8)
+
+    mag *= spread_ratio
+
+    x, y = cv2.polarToCart(mag, ang, angleInDegrees=False)
+    Flow = np.dstack((x, y))
+
+    return Flow
+
+
+@numba.jit(nopython=True, parallel=True)
+def bilinear_interp(x, y, v11, v12, v21, v22):
+    t = 0.2
+
+    if x < t and y < t:
+        return v11
+    elif x < t and y > 1 - t:
+        return v12
+    elif x > 1 - t and y < t:
+        return v21
+    elif x > 1 - t and y > 1 - t:
+        return v22
+    else:
+        result = (v11 * (1 - y) + v12 * y) * (1 - x) + \
+                 (v21 * (1 - y) + v22 * y) * x
+        if result < 0:
+            result = 0
+
+        if result > 255:
+            result = 255
+        return result
+
+
+@numba.jit(nopython=True, parallel=True)
+def image_warp_grid1(rDx, rDy, oriImg, transRatio, pads):
+    # assert oriImg.dtype == np.uint8
+    srcW = oriImg.shape[1]
+    srcH = oriImg.shape[0]
+
+    padTop, padBottom, padLeft, padRight = pads
+
+    left_bound = padLeft + 1
+    right_bound = srcW - padRight
+    bottom_bound = srcH - padBottom
+    top_bound = padTop + 1
+
+    newImg = oriImg.copy()
+
+    for i in range(srcH):
+        for j in range(srcW):
+            _i = i
+            _j = j
+
+            deltaX = rDx[_i, _j]
+            deltaY = rDy[_i, _j]
+
+            if abs(deltaX) < 0.2 and abs(deltaY) < 0.2:
+                continue
+
+            nx = _j + deltaX * transRatio
+            ny = _i + deltaY * transRatio
+
+            if nx >= srcW - padRight:
+                if nx > srcW - 1:
+                    nx = srcW - 1
+
+                if _j < right_bound:
+                    right_bound = _j
+
+            if ny >= srcH - padBottom:
+                if ny > srcH - 1:
+                    ny = srcH - 1
+
+                if _i < bottom_bound:
+                    bottom_bound = _i
+
+            if nx < padLeft:
+                if nx < 0:
+                    nx = 0
+
+                if _j + 1 > left_bound:
+                    left_bound = _j + 1
+
+            if ny < padTop:
+                if ny < 0:
+                    ny = 0
+
+                if _i + 1 > top_bound:
+                    top_bound = _i + 1
+
+            nxi = int(math.floor(nx))
+            nyi = int(math.floor(ny))
+            nxi1 = int(math.ceil(nx))
+            nyi1 = int(math.ceil(ny))
+
+            if nxi < 0:
+                nxi = 0
+            if nxi > oriImg.shape[1] - 1:
+                nxi = oriImg.shape[1] - 1
+
+            if nxi1 < 0:
+                nxi1 = 0
+            if nxi1 > oriImg.shape[1] - 1:
+                nxi1 = oriImg.shape[1] - 1
+
+            if nyi < 0:
+                nyi = 0
+            if nyi > oriImg.shape[0] - 1:
+                nyi = oriImg.shape[0] - 1
+
+            if nyi1 < 0:
+                nyi1 = 0
+            if nyi1 > oriImg.shape[0] - 1:
+                nyi1 = oriImg.shape[0] - 1
+
+            for ll in range(3):
+                newImg[_i, _j,
+                       ll] = bilinear_interp(ny - nyi, nx - nxi,
+                                             oriImg[nyi, nxi,
+                                                    ll], oriImg[nyi, nxi1, ll],
+                                             oriImg[nyi1, nxi,
+                                                    ll], oriImg[nyi1, nxi1,
+                                                                ll])
+
+    return newImg, top_bound, bottom_bound, left_bound, right_bound
+
+
+def warp(x, flow, mode='bilinear', padding_mode='zeros', coff=0.1):
+    """
+
+    Args:
+        x: [n, c, h, w]
+        flow: [n, h, w, 2]
+        mode:
+        padding_mode:
+        coff:
+
+    Returns:
+
+    """
+    n, c, h, w = x.size()
+    yv, xv = torch.meshgrid([torch.arange(h), torch.arange(w)])
+    xv = xv.float() / (w - 1) * 2.0 - 1
+    yv = yv.float() / (h - 1) * 2.0 - 1
+    '''
+    grid[0,:,:,0] =
+    -1, .....1
+    -1, .....1
+    -1, .....1
+
+    grid[0,:,:,1] =
+    -1,  -1, -1
+     ;        ;
+     1,   1,  1
+
+    '''
+
+    if torch.cuda.is_available():
+        grid = torch.cat((xv.unsqueeze(-1), yv.unsqueeze(-1)),
+                         -1).unsqueeze(0).cuda()
+    else:
+        grid = torch.cat((xv.unsqueeze(-1), yv.unsqueeze(-1)), -1).unsqueeze(0)
+    grid_x = grid + 2 * flow * coff
+    warp_x = F.grid_sample(x, grid_x, mode=mode, padding_mode=padding_mode)
+    return warp_x
+
+
+# load expression basis
+def LoadExpBasis(bfm_folder='asset/BFM'):
+    n_vertex = 53215
+    Expbin = open(osp.join(bfm_folder, 'Exp_Pca.bin'), 'rb')
+    exp_dim = array('i')
+    exp_dim.fromfile(Expbin, 1)
+    expMU = array('f')
+    expPC = array('f')
+    expMU.fromfile(Expbin, 3 * n_vertex)
+    expPC.fromfile(Expbin, 3 * exp_dim[0] * n_vertex)
+    Expbin.close()
+
+    expPC = np.array(expPC)
+    expPC = np.reshape(expPC, [exp_dim[0], -1])
+    expPC = np.transpose(expPC)
+
+    expEV = np.loadtxt(osp.join(bfm_folder, 'std_exp.txt'))
+
+    return expPC, expEV
+
+
+# transfer original BFM09 to our face model
+def transferBFM09(bfm_folder='BFM'):
+    print('Transfer BFM09 to BFM_model_front......')
+    original_BFM = loadmat(osp.join(bfm_folder, '01_MorphableModel.mat'))
+    shapePC = original_BFM['shapePC']  # shape basis, 160470*199
+    shapeEV = original_BFM['shapeEV']  # corresponding eigen value, 199*1
+    shapeMU = original_BFM['shapeMU']  # mean face, 160470*1
+    texPC = original_BFM['texPC']  # texture basis, 160470*199
+    texEV = original_BFM['texEV']  # eigen value, 199*1
+    texMU = original_BFM['texMU']  # mean texture, 160470*1
+
+    expPC, expEV = LoadExpBasis()
+
+    # transfer BFM09 to our face model
+
+    idBase = shapePC * np.reshape(shapeEV, [-1, 199])
+    idBase = idBase / 1e5  # unify the scale to decimeter
+    idBase = idBase[:, :80]  # use only first 80 basis
+
+    exBase = expPC * np.reshape(expEV, [-1, 79])
+    exBase = exBase / 1e5  # unify the scale to decimeter
+    exBase = exBase[:, :64]  # use only first 64 basis
+
+    texBase = texPC * np.reshape(texEV, [-1, 199])
+    texBase = texBase[:, :80]  # use only first 80 basis
+
+    # our face model is cropped along face landmarks and contains only 35709 vertex.
+    # original BFM09 contains 53490 vertex, and expression basis provided by Guo et al. contains 53215 vertex.
+    # thus we select corresponding vertex to get our face model.
+
+    index_exp = loadmat(osp.join(bfm_folder, 'BFM_front_idx.mat'))
+    index_exp = index_exp['idx'].astype(
+        np.int32) - 1  # starts from 0 (to 53215)
+
+    index_shape = loadmat(osp.join(bfm_folder, 'BFM_exp_idx.mat'))
+    index_shape = index_shape['trimIndex'].astype(
+        np.int32) - 1  # starts from 0 (to 53490)
+    index_shape = index_shape[index_exp]
+
+    idBase = np.reshape(idBase, [-1, 3, 80])
+    idBase = idBase[index_shape, :, :]
+    idBase = np.reshape(idBase, [-1, 80])
+
+    texBase = np.reshape(texBase, [-1, 3, 80])
+    texBase = texBase[index_shape, :, :]
+    texBase = np.reshape(texBase, [-1, 80])
+
+    exBase = np.reshape(exBase, [-1, 3, 64])
+    exBase = exBase[index_exp, :, :]
+    exBase = np.reshape(exBase, [-1, 64])
+
+    meanshape = np.reshape(shapeMU, [-1, 3]) / 1e5
+    meanshape = meanshape[index_shape, :]
+    meanshape = np.reshape(meanshape, [1, -1])
+
+    meantex = np.reshape(texMU, [-1, 3])
+    meantex = meantex[index_shape, :]
+    meantex = np.reshape(meantex, [1, -1])
+
+    # other info contains triangles, region used for computing photometric loss,
+    # region used for skin texture regularization, and 68 landmarks index etc.
+    other_info = loadmat(osp.join(bfm_folder, 'facemodel_info.mat'))
+    frontmask2_idx = other_info['frontmask2_idx']
+    skinmask = other_info['skinmask']
+    keypoints = other_info['keypoints']
+    point_buf = other_info['point_buf']
+    tri = other_info['tri']
+    tri_mask2 = other_info['tri_mask2']
+
+    # save our face model
+    savemat(
+        osp.join(bfm_folder, 'BFM_model_front.mat'), {
+            'meanshape': meanshape,
+            'meantex': meantex,
+            'idBase': idBase,
+            'exBase': exBase,
+            'texBase': texBase,
+            'tri': tri,
+            'point_buf': point_buf,
+            'tri_mask2': tri_mask2,
+            'keypoints': keypoints,
+            'frontmask2_idx': frontmask2_idx,
+            'skinmask': skinmask
+        })
+
+
+# load landmarks for standard face, which is used for image preprocessing
+def load_lm3d(bfm_folder):
+
+    Lm3D = loadmat(osp.join(bfm_folder, 'similarity_Lm3D_all.mat'))
+    Lm3D = Lm3D['lm']
+
+    # calculate 5 facial landmarks using 68 landmarks
+    lm_idx = np.array([31, 37, 40, 43, 46, 49, 55]) - 1
+    value_list = [
+        Lm3D[lm_idx[0], :],
+        np.mean(Lm3D[lm_idx[[1, 2]], :], 0),
+        np.mean(Lm3D[lm_idx[[3, 4]], :], 0), Lm3D[lm_idx[5], :],
+        Lm3D[lm_idx[6], :]
+    ]
+    Lm3D = np.stack(value_list, axis=0)
+    Lm3D = Lm3D[[1, 2, 0, 3, 4], :]
+
+    return Lm3D
+
+
+def write_obj(save_path, mesh):
+    save_dir = os.path.dirname(save_path)
+    save_name = os.path.splitext(os.path.basename(save_path))[0]
+
+    if 'texture_map' in mesh:
+        cv2.imwrite(
+            os.path.join(save_dir, save_name + '.jpg'), mesh['texture_map'])
+
+        with open(os.path.join(save_dir, save_name + '.mtl'), 'w') as wf:
+            wf.write('# Created by ModelScope\n')
+            wf.write('newmtl material_0\n')
+            wf.write('Ka 1.000000 0.000000 0.000000\n')
+            wf.write('Kd 1.000000 1.000000 1.000000\n')
+            wf.write('Ks 0.000000 0.000000 0.000000\n')
+            wf.write('Tr 0.000000\n')
+            wf.write('illum 0\n')
+            wf.write('Ns 0.000000\n')
+            wf.write('map_Kd {}\n'.format(save_name + '.jpg'))
+
+    with open(save_path, 'w') as wf:
+        if 'texture_map' in mesh:
+            wf.write('# Create by ModelScope\n')
+            wf.write('mtllib ./{}.mtl\n'.format(save_name))
+
+        if 'colors' in mesh:
+            for i, v in enumerate(mesh['vertices']):
+                wf.write('v {} {} {} {} {} {}\n'.format(
+                    v[0], v[1], v[2], mesh['colors'][i][0],
+                    mesh['colors'][i][1], mesh['colors'][i][2]))
+        else:
+            for v in mesh['vertices']:
+                wf.write('v {} {} {}\n'.format(v[0], v[1], v[2]))
+
+        if 'UVs' in mesh:
+            for uv in mesh['UVs']:
+                wf.write('vt {} {}\n'.format(uv[0], uv[1]))
+
+        if 'normals' in mesh:
+            for vn in mesh['normals']:
+                wf.write('vn {} {} {}\n'.format(vn[0], vn[1], vn[2]))
+
+        if 'faces' in mesh:
+            for ind, face in enumerate(mesh['faces']):
+                if 'faces_uv' in mesh or 'faces_normal' in mesh:
+                    if 'faces_uv' in mesh:
+                        face_uv = mesh['faces_uv'][ind]
+                    else:
+                        face_uv = face
+                    if 'faces_normal' in mesh:
+                        face_normal = mesh['faces_normal'][ind]
+                    else:
+                        face_normal = face
+                    row = 'f ' + ' '.join([
+                        '{}/{}/{}'.format(face[i], face_uv[i], face_normal[i])
+                        for i in range(len(face))
+                    ]) + '\n'
+                else:
+                    row = 'f ' + ' '.join(
+                        ['{}'.format(face[i])
+                         for i in range(len(face))]) + '\n'
+                wf.write(row)
+
+
+def read_obj(obj_path, print_shape=True):
+    with open(obj_path, 'r') as f:
+        bfm_lines = f.readlines()
+
+    vertices = []
+    faces = []
+    uvs = []
+    vns = []
+    faces_uv = []
+    faces_normal = []
+    max_face_length = 0
+    for line in bfm_lines:
+        if line[:2] == 'v ':
+            vertex = [
+                float(a) for a in line.strip().split(' ')[1:] if len(a) > 0
+            ]
+            vertices.append(vertex)
+
+        if line[:2] == 'f ':
+            items = line.strip().split(' ')[1:]
+            face = [int(a.split('/')[0]) for a in items if len(a) > 0]
+            max_face_length = max(max_face_length, len(face))
+            if len(faces) > 0 and len(face) != len(faces[0]):
+                continue
+            faces.append(face)
+
+            if '/' in items[0] and len(items[0].split('/')[1]) > 0:
+                face_uv = [int(a.split('/')[1]) for a in items if len(a) > 0]
+                faces_uv.append(face_uv)
+
+            if '/' in items[0] and len(items[0].split('/')) >= 3 and len(
+                    items[0].split('/')[2]) > 0:
+                face_normal = [
+                    int(a.split('/')[2]) for a in items if len(a) > 0
+                ]
+                faces_normal.append(face_normal)
+
+        if line[:3] == 'vt ':
+            items = line.strip().split(' ')[1:]
+            uv = [float(a) for a in items if len(a) > 0]
+            uvs.append(uv)
+
+        if line[:3] == 'vn ':
+            items = line.strip().split(' ')[1:]
+            vn = [float(a) for a in items if len(a) > 0]
+            vns.append(vn)
+
+    vertices = np.array(vertices).astype(np.float32)
+    if max_face_length <= 3:
+        faces = np.array(faces).astype(np.int32)
+
+    if vertices.shape[1] == 3:
+        mesh = {
+            'vertices': vertices,
+            'faces': faces,
+        }
+    else:
+        mesh = {
+            'vertices': vertices[:, :3],
+            'colors': vertices[:, 3:],
+            'faces': faces,
+        }
+
+    if len(uvs) > 0:
+        uvs = np.array(uvs).astype(np.float32)
+        mesh['uvs'] = uvs
+
+    if len(vns) > 0:
+        vns = np.array(vns).astype(np.float32)
+        mesh['vns'] = vns
+
+    if len(faces_uv) > 0:
+        if max_face_length <= 3:
+            faces_uv = np.array(faces_uv).astype(np.int32)
+        mesh['faces_uv'] = faces_uv
+
+    if len(faces_normal) > 0:
+        if max_face_length <= 3:
+            faces_normal = np.array(faces_normal).astype(np.int32)
+        mesh['faces_normal'] = faces_normal
+
+    return mesh
+
+
+# calculating least square problem for image alignment
+def POS(xp, x):
+    npts = xp.shape[1]
+
+    A = np.zeros([2 * npts, 8])
+
+    A[0:2 * npts - 1:2, 0:3] = x.transpose()
+    A[0:2 * npts - 1:2, 3] = 1
+
+    A[1:2 * npts:2, 4:7] = x.transpose()
+    A[1:2 * npts:2, 7] = 1
+
+    b = np.reshape(xp.transpose(), [2 * npts, 1])
+
+    k, _, _, _ = np.linalg.lstsq(A, b)
+
+    R1 = k[0:3]
+    R2 = k[4:7]
+    sTx = k[3]
+    sTy = k[7]
+    s = (np.linalg.norm(R1) + np.linalg.norm(R2)) / 2
+    t = np.stack([sTx, sTy], axis=0)
+
+    return t, s
+
+
+# bounding box for 68 landmark detection
+def BBRegression(points, params):
+    w1 = params['W1']
+    b1 = params['B1']
+    w2 = params['W2']
+    b2 = params['B2']
+    data = points.copy()
+    data = data.reshape([5, 2])
+    data_mean = np.mean(data, axis=0)
+    x_mean = data_mean[0]
+    y_mean = data_mean[1]
+    data[:, 0] = data[:, 0] - x_mean
+    data[:, 1] = data[:, 1] - y_mean
+
+    rms = np.sqrt(np.sum(data**2) / 5)
+    data = data / rms
+    data = data.reshape([1, 10])
+    data = np.transpose(data)
+    inputs = np.matmul(w1, data) + b1
+    inputs = 2 / (1 + np.exp(-2 * inputs)) - 1
+    inputs = np.matmul(w2, inputs) + b2
+    inputs = np.transpose(inputs)
+    x = inputs[:, 0] * rms + x_mean
+    y = inputs[:, 1] * rms + y_mean
+    w = 224 / inputs[:, 2] * rms
+    rects = [x, y, w, w]
+    return np.array(rects).reshape([4])
+
+
+# utils for landmark detection
+def img_padding(img, box):
+    success = True
+    bbox = box.copy()
+    res = np.zeros([2 * img.shape[0], 2 * img.shape[1], 3])
+    res[img.shape[0] // 2:img.shape[0] + img.shape[0] // 2,
+        img.shape[1] // 2:img.shape[1] + img.shape[1] // 2] = img
+
+    bbox[0] = bbox[0] + img.shape[1] // 2
+    bbox[1] = bbox[1] + img.shape[0] // 2
+    if bbox[0] < 0 or bbox[1] < 0:
+        success = False
+    return res, bbox, success
+
+
+# utils for landmark detection
+def crop(img, bbox):
+    padded_img, padded_bbox, flag = img_padding(img, bbox)
+    if flag:
+        crop_img = padded_img[padded_bbox[1]:padded_bbox[1] + padded_bbox[3],
+                              padded_bbox[0]:padded_bbox[0] + padded_bbox[2]]
+        crop_img = cv2.resize(
+            crop_img.astype(np.uint8), (224, 224),
+            interpolation=cv2.INTER_CUBIC)
+        scale = 224 / padded_bbox[3]
+        return crop_img, scale
+    else:
+        return padded_img, 0
+
+
+# utils for landmark detection
+def scale_trans(img, lm, t, s):
+    imgw = img.shape[1]
+    imgh = img.shape[0]
+    M_s = np.array(
+        [[1, 0, -t[0] + imgw // 2 + 0.5], [0, 1, -imgh // 2 + t[1]]],
+        dtype=np.float32)
+    img = cv2.warpAffine(img, M_s, (imgw, imgh))
+    w = int(imgw / s * 100)
+    h = int(imgh / s * 100)
+    img = cv2.resize(img, (w, h))
+    lm = np.stack([lm[:, 0] - t[0] + imgw // 2, lm[:, 1] - t[1] + imgh // 2],
+                  axis=1) / s * 100
+
+    left = w // 2 - 112
+    up = h // 2 - 112
+    bbox = [left, up, 224, 224]
+    cropped_img, scale2 = crop(img, bbox)
+    assert (scale2 != 0)
+    t1 = np.array([bbox[0], bbox[1]])
+
+    # back to raw img s * crop + s * t1 + t2
+    t1 = np.array([w // 2 - 112, h // 2 - 112])
+    scale = s / 100
+    t2 = np.array([t[0] - imgw / 2, t[1] - imgh / 2])
+    inv = (scale / scale2, scale * t1 + t2.reshape([2]))
+    return cropped_img, inv
+
+
+# utils for landmark detection
+def align_for_lm(img, five_points, params):
+    five_points = np.array(five_points).reshape([1, 10])
+    bbox = BBRegression(five_points, params)
+    assert (bbox[2] != 0)
+    bbox = np.round(bbox).astype(np.int32)
+    crop_img, scale = crop(img, bbox)
+    return crop_img, scale, bbox
+
+
+# resize and crop images for face reconstruction
+def resize_n_crop_img(img, lm, t, s, target_size=224., mask=None):
+    w0, h0 = img.size
+    w = (w0 * s).astype(np.int32)
+    h = (h0 * s).astype(np.int32)
+    left = (w / 2 - target_size / 2 + float(
+        (t[0] - w0 / 2) * s)).astype(np.int32)
+    right = left + target_size
+    up = (h / 2 - target_size / 2 + float(
+        (h0 / 2 - t[1]) * s)).astype(np.int32)
+    below = up + target_size
+
+    new_img = img.resize((w, h), resample=Image.BICUBIC)
+    new_img = new_img.crop((left, up, right, below))
+
+    if mask is not None:
+        mask = mask.resize((w, h), resample=Image.BICUBIC)
+        mask = mask.crop((left, up, right, below))
+
+    new_lm = np.stack([lm[:, 0] - t[0] + w0 / 2, lm[:, 1] - t[1] + h0 / 2],
+                      axis=1) * s
+    new_lm = new_lm - np.reshape(
+        np.array([(w / 2 - target_size / 2),
+                  (h / 2 - target_size / 2)]), [1, 2])
+
+    return new_img, new_lm, mask
+
+
+# utils for face reconstruction
+def extract_5p(lm):
+    lm_idx = np.array([31, 37, 40, 43, 46, 49, 55]) - 1
+    value_list = [
+        lm[lm_idx[0], :],
+        np.mean(lm[lm_idx[[1, 2]], :], 0),
+        np.mean(lm[lm_idx[[3, 4]], :], 0), lm[lm_idx[5], :], lm[lm_idx[6], :]
+    ]
+    lm5p = np.stack(value_list, axis=0)
+    lm5p = lm5p[[1, 2, 0, 3, 4], :]
+    return lm5p
+
+
+# utils for face reconstruction
+def align_img(img, lm, lm3D, mask=None, target_size=224., rescale_factor=102.):
+    """
+    Return:
+        transparams        --numpy.array  (raw_W, raw_H, scale, tx, ty)
+        img_new            --PIL.Image  (target_size, target_size, 3)
+        lm_new             --numpy.array  (68, 2), y direction is opposite to v direction
+        mask_new           --PIL.Image  (target_size, target_size)
+
+    Parameters:
+        img                --PIL.Image  (raw_H, raw_W, 3)
+        lm                 --numpy.array  (68, 2), y direction is opposite to v direction
+        lm3D               --numpy.array  (5, 3)
+        mask               --PIL.Image  (raw_H, raw_W, 3)
+    """
+
+    w0, h0 = img.size
+    if lm.shape[0] != 5:
+        lm5p = extract_5p(lm)
+    else:
+        lm5p = lm
+
+    # calculate translation and scale factors using 5 facial landmarks and standard landmarks of a 3D face
+    t, s = POS(lm5p.transpose(), lm3D.transpose())
+    s = rescale_factor / s
+
+    # processing the image
+    img_new, lm_new, mask_new = resize_n_crop_img(
+        img, lm, t, s, target_size=target_size, mask=mask)
+    trans_params = np.array([w0, h0, s, t[0], t[1]])
+
+    return trans_params, img_new, lm_new, mask_new
+
+
+def normalize_v3(arr):
+    ''' Normalize a numpy array of 3 component vectors shape=(n,3) '''
+    lens = np.sqrt(arr[:, 0]**2 + arr[:, 1]**2 + arr[:, 2]**2)[:, None]
+    arr /= lens
+    return arr
+
+
+def estimate_normals(vertices, faces):
+    norm = np.zeros(vertices.shape, dtype=vertices.dtype)
+    tris = vertices[faces]
+    n = np.cross(tris[::, 1] - tris[::, 0], tris[::, 2] - tris[::, 0])
+    n[(n[:, 0] == 0) * (n[:, 1] == 0) * (n[:, 2] == 0)] = [0, 0, 1.0]
+    n = normalize_v3(n)
+    for i in range(3):
+        for j in range(faces.shape[0]):
+            norm[faces[j, i]] += n[j]
+
+    inds = (norm[:, 0] == 0) * (norm[:, 1] == 0) * (norm[:, 2] == 0)
+    norm[inds] = [0, 0, 1.0]
+    result = normalize_v3(norm)
+    return result
diff --git a/modelscope/models/cv/facial_landmark_confidence/flc/facial_landmark_confidence.py b/modelscope/models/cv/facial_landmark_confidence/flc/facial_landmark_confidence.py
index 27474d14..152903c1 100644
--- a/modelscope/models/cv/facial_landmark_confidence/flc/facial_landmark_confidence.py
+++ b/modelscope/models/cv/facial_landmark_confidence/flc/facial_landmark_confidence.py
@@ -16,8 +16,7 @@ from modelscope.utils.constant import ModelFile, Tasks
 from .manual_landmark_net import LandmarkConfidence
 
 
-@MODELS.register_module(
-    Tasks.facial_landmark_confidence, module_name=Models.flc)
+@MODELS.register_module(Tasks.face_2d_keypoints, module_name=Models.flc)
 class FacialLandmarkConfidence(TorchModel):
 
     def __init__(self, model_path, device='cuda'):
diff --git a/modelscope/models/cv/image_classification/__init__.py b/modelscope/models/cv/image_classification/__init__.py
index 7afe44bb..5cf14863 100644
--- a/modelscope/models/cv/image_classification/__init__.py
+++ b/modelscope/models/cv/image_classification/__init__.py
@@ -5,10 +5,12 @@ from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
     from .mmcls_model import ClassificationModel
+    from .resnet50_cc import ContentCheckBackbone
 
 else:
     _import_structure = {
         'mmcls_model': ['ClassificationModel'],
+        'resnet50_cc': ['ContentCheckBackbone'],
     }
 
     import sys
diff --git a/modelscope/models/cv/image_classification/resnet50_cc.py b/modelscope/models/cv/image_classification/resnet50_cc.py
new file mode 100644
index 00000000..607d4deb
--- /dev/null
+++ b/modelscope/models/cv/image_classification/resnet50_cc.py
@@ -0,0 +1,50 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from collections import namedtuple
+from math import lgamma
+
+import torch
+import torch.nn as nn
+from torchvision import models
+
+from modelscope.metainfo import Models
+from modelscope.models import MODELS
+from modelscope.models.base import TorchModel
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@MODELS.register_module(Tasks.image_classification, Models.content_check)
+class ContentCheckBackbone(TorchModel):
+
+    def __init__(self, *args, **kwargs):
+        super(ContentCheckBackbone, self).__init__()
+        cc_model = models.resnet50()
+        cc_model.fc = nn.Sequential(
+            nn.Linear(2048, 512),
+            nn.ReLU(),
+            nn.Dropout(0.2),
+            nn.Linear(512, 10),
+        )
+        self.model = cc_model
+
+    def forward(self, img):
+        x = self.model(img)
+        return x
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        model_file = kwargs.get('model_name', ModelFile.TORCH_MODEL_FILE)
+        ckpt_path = os.path.join(kwargs['model_dir'], model_file)
+        logger.info(f'loading model from {ckpt_path}')
+        model_dir = kwargs.pop('model_dir')
+        model = cls(**kwargs)
+        ckpt_path = os.path.join(model_dir, model_file)
+        load_dict = torch.load(ckpt_path, map_location='cpu')
+        new_dict = {}
+        for k, v in load_dict.items():
+            new_dict['model.' + k] = v
+        model.load_state_dict(new_dict)
+        return model
diff --git a/modelscope/models/cv/image_color_enhance/__init__.py b/modelscope/models/cv/image_color_enhance/__init__.py
index 72f26b52..67027d51 100644
--- a/modelscope/models/cv/image_color_enhance/__init__.py
+++ b/modelscope/models/cv/image_color_enhance/__init__.py
@@ -5,10 +5,14 @@ from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
     from .image_color_enhance import ImageColorEnhance
+    from .adaint import AdaIntImageColorEnhance
+    from .deeplpf import DeepLPFImageColorEnhance
 
 else:
     _import_structure = {
         'image_color_enhance': ['ImageColorEnhance'],
+        'adaint': ['AdaIntImageColorEnhance'],
+        'deeplpf': ['DeepLPFImageColorEnhance']
     }
 
     import sys
diff --git a/modelscope/models/cv/image_color_enhance/adaint/__init__.py b/modelscope/models/cv/image_color_enhance/adaint/__init__.py
new file mode 100644
index 00000000..2ac677b7
--- /dev/null
+++ b/modelscope/models/cv/image_color_enhance/adaint/__init__.py
@@ -0,0 +1 @@
+from .adaint import AdaIntImageColorEnhance
diff --git a/modelscope/models/cv/image_color_enhance/adaint/adaint.py b/modelscope/models/cv/image_color_enhance/adaint/adaint.py
new file mode 100644
index 00000000..8839f03a
--- /dev/null
+++ b/modelscope/models/cv/image_color_enhance/adaint/adaint.py
@@ -0,0 +1,396 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import numbers
+import os.path as osp
+from typing import Dict, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['AdaIntImageColorEnhance']
+
+try:
+    from modelscope.ops.ailut import ailut_transform
+except ImportError:
+    raise ImportError(
+        'The model [AdaInt] requires cuda extension to be installed.')
+
+
+class BasicBlock(nn.Sequential):
+    r"""The basic block module (Conv+LeakyReLU[+InstanceNorm]).
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=3,
+                 stride=1,
+                 norm=False):
+        body = [
+            nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride=stride,
+                padding=1),
+            nn.LeakyReLU(0.2)
+        ]
+        if norm:
+            body.append(nn.InstanceNorm2d(out_channels, affine=True))
+        super(BasicBlock, self).__init__(*body)
+
+
+class TPAMIBackbone(nn.Sequential):
+    r"""The 5-layer CNN backbone module in [TPAMI 3D-LUT]
+        (https://github.com/HuiZeng/Image-Adaptive-3DLUT).
+
+    Args:
+        pretrained (bool, optional): [ignored].
+        input_resolution (int, optional): Resolution for pre-downsampling. Default: 256.
+        extra_pooling (bool, optional): Whether to insert an extra pooling layer
+            at the very end of the module to reduce the number of parameters of
+            the subsequent module. Default: False.
+    """
+
+    def __init__(self,
+                 pretrained=False,
+                 input_resolution=256,
+                 extra_pooling=False):
+        body = [
+            BasicBlock(3, 16, stride=2, norm=True),
+            BasicBlock(16, 32, stride=2, norm=True),
+            BasicBlock(32, 64, stride=2, norm=True),
+            BasicBlock(64, 128, stride=2, norm=True),
+            BasicBlock(128, 128, stride=2),
+            nn.Dropout(p=0.5),
+        ]
+        if extra_pooling:
+            body.append(nn.AdaptiveAvgPool2d(2))
+        super().__init__(*body)
+        self.input_resolution = input_resolution
+        self.out_channels = 128 * (4 if extra_pooling else 64)
+
+    def forward(self, imgs):
+        imgs = F.interpolate(
+            imgs,
+            size=(self.input_resolution, ) * 2,
+            mode='bilinear',
+            align_corners=False)
+        return super().forward(imgs).view(imgs.shape[0], -1)
+
+
+class Res18Backbone(nn.Module):
+    r"""The ResNet-18 backbone.
+
+    Args:
+        pretrained (bool, optional): Whether to use the torchvison pretrained weights.
+            Default: True.
+        input_resolution (int, optional): Resolution for pre-downsampling. Default: 224.
+        extra_pooling (bool, optional): [ignore].
+    """
+
+    def __init__(self,
+                 pretrained=True,
+                 input_resolution=224,
+                 extra_pooling=False):
+        super().__init__()
+        net = torchvision.models.resnet18(pretrained=pretrained)
+        net.fc = nn.Identity()
+        self.net = net
+        self.input_resolution = input_resolution
+        self.out_channels = 512
+
+    def forward(self, imgs):
+        imgs = F.interpolate(
+            imgs,
+            size=(self.input_resolution, ) * 2,
+            mode='bilinear',
+            align_corners=False)
+        return self.net(imgs).view(imgs.shape[0], -1)
+
+
+class LUTGenerator(nn.Module):
+    r"""The LUT generator module (mapping h).
+
+    Args:
+        n_colors (int): Number of input color channels.
+        n_vertices (int): Number of sampling points along each lattice dimension.
+        n_feats (int): Dimension of the input image representation vector.
+        n_ranks (int): Number of ranks in the mapping h (or the number of basis LUTs).
+    """
+
+    def __init__(self, n_colors, n_vertices, n_feats, n_ranks) -> None:
+        super().__init__()
+
+        # h0
+        self.weights_generator = nn.Linear(n_feats, n_ranks)
+        # h1
+        self.basis_luts_bank = nn.Linear(
+            n_ranks, n_colors * (n_vertices**n_colors), bias=False)
+
+        self.n_colors = n_colors
+        self.n_vertices = n_vertices
+        self.n_feats = n_feats
+        self.n_ranks = n_ranks
+
+    def init_weights(self):
+        r"""Init weights for models.
+
+        For the mapping f (`backbone`) and h (`lut_generator`), we follow the initialization in
+            [TPAMI 3D-LUT](https://github.com/HuiZeng/Image-Adaptive-3DLUT).
+
+        """
+        nn.init.ones_(self.weights_generator.bias)
+        tmp1 = torch.meshgrid(
+            *[torch.arange(self.n_vertices) for _ in range(self.n_colors)])
+        tmp2 = [
+            torch.zeros(self.n_colors, *((self.n_vertices, ) * self.n_colors))
+            for _ in range(self.n_ranks - 1)
+        ]
+        identity_lut = torch.stack(
+            [torch.stack(tmp1, dim=0).div(self.n_vertices - 1).flip(0), *tmp2],
+            dim=0).view(self.n_ranks, -1)
+        self.basis_luts_bank.weight.data.copy_(identity_lut.t())
+
+    def forward(self, x):
+        weights = self.weights_generator(x)
+        luts = self.basis_luts_bank(weights)
+        luts = luts.view(x.shape[0], -1,
+                         *((self.n_vertices, ) * self.n_colors))
+        return weights, luts
+
+    def regularizations(self, smoothness, monotonicity):
+        basis_luts = self.basis_luts_bank.weight.t().view(
+            self.n_ranks, self.n_colors,
+            *((self.n_vertices, ) * self.n_colors))
+        tv, mn = 0, 0
+        diff = basis_luts[:, :, :-1, ...] - basis_luts[:, :, 1:, ...]
+        tv += torch.square(diff).sum(0).mean()
+        mn += F.relu(diff).sum(0).mean()
+        diff = basis_luts[:, :, :, :-1, :] - basis_luts[:, :, :, 1:, :]
+        tv += torch.square(diff).sum(0).mean()
+        mn += F.relu(diff).sum(0).mean()
+        diff = basis_luts[:, :, :, :, :-1] - basis_luts[:, :, :, :, 1:]
+        tv += torch.square(diff).sum(0).mean()
+        mn += F.relu(diff).sum(0).mean()
+        reg_smoothness = smoothness * tv
+        reg_monotonicity = monotonicity * mn
+        return reg_smoothness, reg_monotonicity
+
+
+class AdaInt(nn.Module):
+    r"""The Adaptive Interval Learning (AdaInt) module (mapping g).
+
+    It consists of a single fully-connected layer and some post-process operations.
+
+    Args:
+        n_colors (int): Number of input color channels.
+        n_vertices (int): Number of sampling points along each lattice dimension.
+        n_feats (int): Dimension of the input image representation vector.
+        adaint_share (bool, optional): Whether to enable Share-AdaInt. Default: False.
+    """
+
+    def __init__(self,
+                 n_colors,
+                 n_vertices,
+                 n_feats,
+                 adaint_share=False) -> None:
+        super().__init__()
+        repeat_factor = n_colors if not adaint_share else 1
+        self.intervals_generator = nn.Linear(n_feats,
+                                             (n_vertices - 1) * repeat_factor)
+
+        self.n_colors = n_colors
+        self.n_vertices = n_vertices
+        self.adaint_share = adaint_share
+
+    def init_weights(self):
+        r"""Init weights for models.
+
+        We use all-zero and all-one initializations for its weights and bias, respectively.
+        """
+        nn.init.zeros_(self.intervals_generator.weight)
+        nn.init.ones_(self.intervals_generator.bias)
+
+    def forward(self, x):
+        r"""Forward function for AdaInt module.
+
+        Args:
+            x (tensor): Input image representation, shape (b, f).
+        Returns:
+            Tensor: Sampling coordinates along each lattice dimension, shape (b, c, d).
+        """
+        x = x.view(x.shape[0], -1)
+        intervals = self.intervals_generator(x).view(x.shape[0], -1,
+                                                     self.n_vertices - 1)
+        if self.adaint_share:
+            intervals = intervals.repeat_interleave(self.n_colors, dim=1)
+        intervals = intervals.softmax(-1)
+        vertices = F.pad(intervals.cumsum(-1), (1, 0), 'constant', 0)
+        return vertices
+
+
+@MODELS.register_module(
+    Tasks.image_color_enhancement, module_name=Models.adaint)
+class AdaIntImageColorEnhance(TorchModel):
+    r"""Adaptive-Interval 3D Lookup Table for real-time image enhancement.
+
+    Args:
+        n_ranks (int, optional): Number of ranks in the mapping h
+            (or the number of basis LUTs). Default: 3.
+        n_vertices (int, optional): Number of sampling points along
+            each lattice dimension. Default: 33.
+        en_adaint (bool, optional): Whether to enable AdaInt. Default: True.
+        en_adaint_share (bool, optional): Whether to enable Share-AdaInt.
+            Only used when `en_adaint` is True. Default: False.
+        backbone (str, optional): Backbone architecture to use. Can be either 'tpami'
+            or 'res18'. Default: 'tpami'.
+        pretrained (bool, optional): Whether to use ImageNet-pretrained weights.
+            Only used when `backbone` is 'res18'. Default: None.
+        n_colors (int, optional): Number of input color channels. Default: 3.
+        recons_loss (dict, optional): Config for pixel-wise reconstruction loss.
+        train_cfg (dict, optional): Config for training. Default: None.
+        test_cfg (dict, optional): Config for testing. Default: None.
+    """
+
+    def __init__(self,
+                 n_ranks=3,
+                 n_vertices=33,
+                 en_adaint=True,
+                 en_adaint_share=False,
+                 backbone='tpami',
+                 pretrained=False,
+                 n_colors=3,
+                 *args,
+                 **kwargs):
+        super(AdaIntImageColorEnhance, self).__init__()
+
+        assert backbone.lower() in ['tpami', 'res18']
+
+        # mapping f
+        self.backbone = dict(
+            tpami=TPAMIBackbone, res18=Res18Backbone)[backbone.lower()](
+                pretrained, extra_pooling=en_adaint)
+
+        # mapping h
+        self.lut_generator = LUTGenerator(n_colors, n_vertices,
+                                          self.backbone.out_channels, n_ranks)
+
+        # mapping g
+        if en_adaint:
+            self.adaint = AdaInt(n_colors, n_vertices,
+                                 self.backbone.out_channels, en_adaint_share)
+        else:
+            uniform_vertices = torch.arange(n_vertices).div(n_vertices - 1) \
+                                    .repeat(n_colors, 1)
+            self.register_buffer('uniform_vertices',
+                                 uniform_vertices.unsqueeze(0))
+
+        self.n_ranks = n_ranks
+        self.n_colors = n_colors
+        self.n_vertices = n_vertices
+        self.en_adaint = en_adaint
+        self.backbone_name = backbone.lower()
+
+        self.init_weights()
+
+    def init_weights(self):
+        r"""Init weights for models.
+
+        For the mapping f (`backbone`) and h (`lut_generator`), we follow the initialization in
+            [TPAMI 3D-LUT](https://github.com/HuiZeng/Image-Adaptive-3DLUT).
+        For the mapping g (`adaint`), we use all-zero and all-one initializations for its weights
+        and bias, respectively.
+        """
+
+        def special_initilization(m):
+            classname = m.__class__.__name__
+            if 'Conv' in classname:
+                nn.init.xavier_normal_(m.weight.data)
+            elif 'InstanceNorm' in classname:
+                nn.init.normal_(m.weight.data, 1.0, 0.02)
+                nn.init.constant_(m.bias.data, 0.0)
+
+        if self.backbone_name not in ['res18']:
+            self.apply(special_initilization)
+        self.lut_generator.init_weights()
+        if self.en_adaint:
+            self.adaint.init_weights()
+
+    def __forward(self, imgs):
+        r"""The real implementation of model forward.
+
+        Args:
+            img (Tensor): Input image, shape (b, c, h, w).
+        Returns:
+            tuple(Tensor, Tensor, Tensor):
+                Output image, LUT weights, Sampling Coordinates.
+        """
+        # E: (b, f)
+        codes = self.backbone(imgs)
+        # (b, m), T: (b, c, d, d, d)
+        weights, luts = self.lut_generator(codes)
+        # \hat{P}: (b, c, d)
+        if self.en_adaint:
+            vertices = self.adaint(codes)
+        else:
+            vertices = self.uniform_vertices
+
+        outs = ailut_transform(imgs, luts, vertices)
+
+        return outs, weights, vertices
+
+    def _evaluate_postprocess(self, src: Tensor,
+                              target: Tensor) -> Dict[str, list]:
+        preds, _, _ = self.__forward(src)
+        preds = list(torch.split(preds, 1, 0))
+        targets = list(torch.split(target, 1, 0))
+
+        preds = [(pred.data * 255.).squeeze(0).type(torch.uint8).permute(
+            1, 2, 0).cpu().numpy() for pred in preds]
+        targets = [(target.data * 255.).squeeze(0).type(torch.uint8).permute(
+            1, 2, 0).cpu().numpy() for target in targets]
+
+        return {'pred': preds, 'target': targets}
+
+    def _inference_forward(self, src: Tensor) -> Dict[str, Tensor]:
+        return {'outputs': self.__forward(src)[0].clamp(0, 1)}
+
+    def forward(self, input: Dict[str,
+                                  Tensor]) -> Dict[str, Union[list, Tensor]]:
+        """return the result by the model
+
+        Args:
+            input (Dict[str, Tensor]): the preprocessed data
+
+        Returns:
+            Dict[str, Union[list, Tensor]]: results
+        """
+        if 'target' in input:
+            return self._evaluate_postprocess(**input)
+        else:
+            return self._inference_forward(**input)
+
+    def regularizations(self, smoothness, monotonicity):
+        return self.lut_generator.regularizations(smoothness, monotonicity)
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        model_path = osp.join(kwargs['model_dir'], ModelFile.TORCH_MODEL_FILE)
+        model = cls(**kwargs)
+        model = model._load_pretrained(
+            model, model_path, strict=False, param_key='state_dict')
+        if model.training:
+            model.train()
+        else:
+            model.eval()
+        return model
diff --git a/modelscope/models/cv/image_color_enhance/deeplpf/__init__.py b/modelscope/models/cv/image_color_enhance/deeplpf/__init__.py
new file mode 100644
index 00000000..916735e4
--- /dev/null
+++ b/modelscope/models/cv/image_color_enhance/deeplpf/__init__.py
@@ -0,0 +1 @@
+from .deeplpf_image_color_enhance import DeepLPFImageColorEnhance
diff --git a/modelscope/models/cv/image_color_enhance/deeplpf/deeplpf_image_color_enhance.py b/modelscope/models/cv/image_color_enhance/deeplpf/deeplpf_image_color_enhance.py
new file mode 100644
index 00000000..400e9b80
--- /dev/null
+++ b/modelscope/models/cv/image_color_enhance/deeplpf/deeplpf_image_color_enhance.py
@@ -0,0 +1,78 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+from typing import Dict, Union
+
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .deeplpfnet import DeepLPFNet
+
+logger = get_logger()
+
+__all__ = ['DeepLPFImageColorEnhance']
+
+
+@MODELS.register_module(
+    Tasks.image_color_enhancement, module_name=Models.deeplpfnet)
+class DeepLPFImageColorEnhance(TorchModel):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the image color enhance model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+
+        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+
+        self.model = DeepLPFNet()
+        if torch.cuda.is_available():
+            self._device = torch.device('cuda')
+        else:
+            self._device = torch.device('cpu')
+        self.model = self.model.to(self._device)
+
+        self.model = self._load_pretrained(self.model, model_path)
+
+        if self.training:
+            self.model.train()
+        else:
+            self.model.eval()
+
+    def _evaluate_postprocess(self, src: Tensor,
+                              target: Tensor) -> Dict[str, list]:
+        preds = self.model(src)
+        preds = list(torch.split(preds, 1, 0))
+        targets = list(torch.split(target, 1, 0))
+
+        preds = [(pred.data * 255.).squeeze(0).type(torch.uint8).permute(
+            1, 2, 0).cpu().numpy() for pred in preds]
+        targets = [(target.data * 255.).squeeze(0).type(torch.uint8).permute(
+            1, 2, 0).cpu().numpy() for target in targets]
+
+        return {'pred': preds, 'target': targets}
+
+    def _inference_forward(self, src: Tensor) -> Dict[str, Tensor]:
+        return {'outputs': self.model(src).clamp(0, 1)}
+
+    def forward(self, input: Dict[str,
+                                  Tensor]) -> Dict[str, Union[list, Tensor]]:
+        """return the result by the model
+
+        Args:
+            input (Dict[str, Tensor]): the preprocessed data
+
+        Returns:
+            Dict[str, Union[list, Tensor]]: results
+        """
+        for key, value in input.items():
+            input[key] = input[key].to(self._device)
+        if 'target' in input:
+            return self._evaluate_postprocess(**input)
+        else:
+            return self._inference_forward(**input)
diff --git a/modelscope/models/cv/image_color_enhance/deeplpf/deeplpfnet.py b/modelscope/models/cv/image_color_enhance/deeplpf/deeplpfnet.py
new file mode 100644
index 00000000..e7760de9
--- /dev/null
+++ b/modelscope/models/cv/image_color_enhance/deeplpf/deeplpfnet.py
@@ -0,0 +1,854 @@
+# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+
+# This program is free software; you can redistribute it and/or modify it under the terms of the BSD 0-Clause License.
+
+# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the BSD 0-Clause License for more details.
+'''
+This is a PyTorch implementation of the CVPR 2020 paper:
+"Deep Local Parametric Filters for Image Enhancement": https://arxiv.org/abs/2003.13985
+
+DeeLPF is a method for automatic estimation of parametric filters for
+local image enhancement, which is instantiated using Elliptical, Graduated, Polynomial filters.
+
+Please cite the paper if you use this code
+
+Tested with Pytorch 1.7.1, Python 3.7.9
+
+Authors: Sean Moran (sean.j.moran@gmail.com),
+         Pierre Marza (pierre.marza@gmail.com)
+
+'''
+import math
+from math import exp
+
+import matplotlib
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+matplotlib.use('agg')
+
+
+class BinaryLayer(nn.Module):
+
+    def forward(self, input):
+        return torch.sign(input)
+
+    def backward(self, grad_output):
+        input = self.saved_tensors
+        grad_output[input > 1] = 0
+        grad_output[input < -1] = 0
+        return grad_output
+
+
+class CubicFilter(nn.Module):
+
+    def __init__(self, num_in_channels=64, num_out_channels=64, batch_size=1):
+        super(CubicFilter, self).__init__()
+
+        self.cubic_layer1 = ConvBlock(num_in_channels, num_out_channels)
+        self.cubic_layer2 = MaxPoolBlock()
+        self.cubic_layer3 = ConvBlock(num_out_channels, num_out_channels)
+        self.cubic_layer4 = MaxPoolBlock()
+        self.cubic_layer5 = ConvBlock(num_out_channels, num_out_channels)
+        self.cubic_layer6 = MaxPoolBlock()
+        self.cubic_layer7 = ConvBlock(num_out_channels, num_out_channels)
+        self.cubic_layer8 = GlobalPoolingBlock(2)
+        self.fc_cubic = torch.nn.Linear(num_out_channels, 60)  # cubic
+        self.upsample = torch.nn.Upsample(
+            size=(300, 300), mode='bilinear', align_corners=False)
+        self.dropout = nn.Dropout(0.5)
+
+    def get_cubic_mask(self, feat, img):
+        feat_cubic = torch.cat((feat, img), 1)
+        feat_cubic = self.upsample(feat_cubic)
+
+        x = self.cubic_layer1(feat_cubic)
+        x = self.cubic_layer2(x)
+        x = self.cubic_layer3(x)
+        x = self.cubic_layer4(x)
+        x = self.cubic_layer5(x)
+        x = self.cubic_layer6(x)
+        x = self.cubic_layer7(x)
+        x = self.cubic_layer8(x)
+        x = x.view(x.size()[0], -1)
+        x = self.dropout(x)
+
+        R = self.fc_cubic(x)
+
+        cubic_mask = torch.zeros_like(img)
+
+        x_axis = Variable(
+            torch.arange(img.shape[2]).view(-1, 1).repeat(
+                1, img.shape[3]).cuda()) / img.shape[2]
+        y_axis = Variable(
+            torch.arange(img.shape[3]).repeat(img.shape[2],
+                                              1).cuda()) / img.shape[3]
+        '''
+        Cubic for R channel
+        '''
+        cubic_mask[0, 0, :, :] = R[0, 0] * (x_axis ** 3) + R[0, 1] * (x_axis ** 2) * y_axis + R[0, 2] * (
+            x_axis ** 2) * img[0, 0, :, :] + R[0, 3] * (x_axis ** 2) + R[0, 4] * x_axis * (y_axis ** 2) + R[
+            0, 5] * x_axis * y_axis * img[0, 0, :, :] \
+            + R[0, 6] * x_axis * y_axis + R[0, 7] * x_axis * (img[0, 0, :, :] ** 2) + R[
+            0, 8] * x_axis * img[0, 0, :, :] + R[0, 9] * x_axis + R[0, 10] * (
+            y_axis ** 3) + R[0, 11] * (y_axis ** 2) * img[0, 0, :, :] \
+            + R[0, 12] * (y_axis ** 2) + R[0, 13] * y_axis * (img[0, 0, :, :] ** 2) + R[
+            0, 14] * y_axis * img[0, 0, :, :] + R[0, 15] * y_axis + R[0, 16] * (
+            img[0, 0, :, :] ** 3) + R[0, 17] * (img[0, 0, :, :] ** 2) \
+            + R[0, 18] * \
+            img[0, 0, :, :] + R[0, 19]
+        '''
+        Cubic for G channel
+        '''
+        cubic_mask[0, 1, :, :] = R[0, 20] * (x_axis ** 3) + R[0, 21] * (x_axis ** 2) * y_axis + R[0, 22] * (
+            x_axis ** 2) * img[0, 1, :, :] + R[0, 23] * (x_axis ** 2) + R[0, 24] * x_axis * (y_axis ** 2) + R[
+            0, 25] * x_axis * y_axis * img[0, 1, :, :] \
+            + R[0, 26] * x_axis * y_axis + R[0, 27] * x_axis * (img[0, 1, :, :] ** 2) + R[
+            0, 28] * x_axis * img[0, 1, :, :] + R[0, 29] * x_axis + R[0, 30] * (
+            y_axis ** 3) + R[0, 31] * (y_axis ** 2) * img[0, 1, :, :] \
+            + R[0, 32] * (y_axis ** 2) + R[0, 33] * y_axis * (img[0, 1, :, :] ** 2) + R[
+            0, 34] * y_axis * img[0, 1, :, :] + R[0, 35] * y_axis + R[0, 36] * (
+            img[0, 1, :, :] ** 3) + R[0, 37] * (img[0, 1, :, :] ** 2) \
+            + R[0, 38] * \
+            img[0, 1, :, :] + R[0, 39]
+        '''
+        Cubic for B channel
+        '''
+        cubic_mask[0, 2, :, :] = R[0, 40] * (x_axis ** 3) + R[0, 41] * (x_axis ** 2) * y_axis + R[0, 42] * (
+            x_axis ** 2) * img[0, 2, :, :] + R[0, 43] * (x_axis ** 2) + R[0, 44] * x_axis * (y_axis ** 2) + R[
+            0, 45] * x_axis * y_axis * img[0, 2, :, :] \
+            + R[0, 46] * x_axis * y_axis + R[0, 47] * x_axis * (img[0, 2, :, :] ** 2) + R[
+            0, 48] * x_axis * img[0, 2, :, :] + R[0, 49] * x_axis + R[0, 50] * (
+            y_axis ** 3) + R[0, 51] * (y_axis ** 2) * img[0, 2, :, :] \
+            + R[0, 52] * (y_axis ** 2) + R[0, 53] * y_axis * (img[0, 2, :, :] ** 2) + R[
+            0, 54] * y_axis * img[0, 2, :, :] + R[0, 55] * y_axis + R[0, 56] * (
+            img[0, 2, :, :] ** 3) + R[0, 57] * (img[0, 2, :, :] ** 2) \
+            + R[0, 58] * \
+            img[0, 2, :, :] + R[0, 59]
+
+        img_cubic = torch.clamp(img + cubic_mask, 0, 1)
+        return img_cubic
+
+
+class GraduatedFilter(nn.Module):
+
+    def __init__(self, num_in_channels=64, num_out_channels=64):
+        super(GraduatedFilter, self).__init__()
+
+        self.graduated_layer1 = ConvBlock(num_in_channels, num_out_channels)
+        self.graduated_layer2 = MaxPoolBlock()
+        self.graduated_layer3 = ConvBlock(num_out_channels, num_out_channels)
+        self.graduated_layer4 = MaxPoolBlock()
+        self.graduated_layer5 = ConvBlock(num_out_channels, num_out_channels)
+        self.graduated_layer6 = MaxPoolBlock()
+        self.graduated_layer7 = ConvBlock(num_out_channels, num_out_channels)
+        self.graduated_layer8 = GlobalPoolingBlock(2)
+        self.fc_graduated = torch.nn.Linear(num_out_channels, 24)
+        self.upsample = torch.nn.Upsample(
+            size=(300, 300), mode='bilinear', align_corners=False)
+        self.dropout = nn.Dropout(0.5)
+        self.bin_layer = BinaryLayer()
+
+    def tanh01(self, x):
+        tanh = nn.Tanh()
+        return 0.5 * (tanh(x) + 1)
+
+    def where(self, cond, x_1, x_2):
+        cond = cond.float()
+        return (cond * x_1) + ((1 - cond) * x_2)
+
+    def get_inverted_mask(self, factor, invert, d1, d2, max_scale, top_line):
+        if (invert == 1).all():
+
+            if (factor >= 1).all():
+                diff = ((factor - 1)) / 2 + 1
+                grad1 = (diff - factor) / d1
+                grad2 = (1 - diff) / d2
+                mask_scale = torch.clamp(
+                    factor + grad1 * top_line + grad2 * top_line,
+                    min=1,
+                    max=max_scale)
+            else:
+                diff = ((1 - factor)) / 2 + factor
+                grad1 = (diff - factor) / d1
+                grad2 = (1 - diff) / d2
+                mask_scale = torch.clamp(
+                    factor + grad1 * top_line + grad2 * top_line, min=0, max=1)
+        else:
+
+            if (factor >= 1).all():
+                diff = ((factor - 1)) / 2 + 1
+                grad1 = (diff - factor) / d1
+                grad2 = (factor - diff) / d2
+                mask_scale = torch.clamp(
+                    1 + grad1 * top_line + grad2 * top_line,
+                    min=1,
+                    max=max_scale)
+            else:
+                diff = ((1 - factor)) / 2 + factor
+                grad1 = (diff - 1) / d1
+                grad2 = (factor - diff) / d2
+                mask_scale = torch.clamp(
+                    1 + grad1 * top_line + grad2 * top_line, min=0, max=1)
+
+        mask_scale = torch.clamp(mask_scale.unsqueeze(0), 0, max_scale)
+        return mask_scale
+
+    def get_graduated_mask(self, feat, img):
+        eps = 1e-10
+
+        x_axis = Variable(
+            torch.arange(img.shape[2]).view(-1, 1).repeat(
+                1, img.shape[3]).cuda()) / img.shape[2]
+        y_axis = Variable(
+            torch.arange(img.shape[3]).repeat(img.shape[2],
+                                              1).cuda()) / img.shape[3]
+
+        feat_graduated = torch.cat((feat, img), 1)
+        feat_graduated = self.upsample(feat_graduated)
+
+        # The following layers calculate the parameters of the graduated filters that we use for image enhancement
+        x = self.graduated_layer1(feat_graduated)
+        x = self.graduated_layer2(x)
+        x = self.graduated_layer3(x)
+        x = self.graduated_layer4(x)
+        x = self.graduated_layer5(x)
+        x = self.graduated_layer6(x)
+        x = self.graduated_layer7(x)
+        x = self.graduated_layer8(x)
+        x = x.view(x.size()[0], -1)
+        x = self.dropout(x)
+        G = self.fc_graduated(x)
+
+        # Classification values (above or below the line)
+        above_or_below_line1 = ((self.bin_layer(G[0, 0])) + 1) / 2
+        above_or_below_line2 = ((self.bin_layer(G[0, 1])) + 1) / 2
+        above_or_below_line3 = ((self.bin_layer(G[0, 2])) + 1) / 2
+
+        slope1 = G[0, 3].clone()
+        slope2 = G[0, 4].clone()
+        slope3 = G[0, 5].clone()
+
+        y_axis_dist1 = self.tanh01(G[0, 6]) + eps
+        y_axis_dist2 = self.tanh01(G[0, 7]) + eps
+        y_axis_dist3 = self.tanh01(G[0, 8]) + eps
+
+        y_axis_dist1 = torch.clamp(
+            self.tanh01(G[0, 9]), y_axis_dist1.data, 1.0)
+        y_axis_dist2 = torch.clamp(
+            self.tanh01(G[0, 10]), y_axis_dist2.data, 1.0)
+        y_axis_dist3 = torch.clamp(
+            self.tanh01(G[0, 11]), y_axis_dist3.data, 1.0)
+
+        y_axis_dist4 = torch.clamp(self.tanh01(G[0, 12]), 0, y_axis_dist1.data)
+        y_axis_dist5 = torch.clamp(self.tanh01(G[0, 13]), 0, y_axis_dist2.data)
+        y_axis_dist6 = torch.clamp(self.tanh01(G[0, 14]), 0, y_axis_dist3.data)
+
+        # Scales
+        max_scale = 2
+
+        scale_factor1 = self.tanh01(G[0, 15]) * max_scale
+        scale_factor2 = self.tanh01(G[0, 16]) * max_scale
+        scale_factor3 = self.tanh01(G[0, 17]) * max_scale
+
+        scale_factor4 = self.tanh01(G[0, 18]) * max_scale
+        scale_factor5 = self.tanh01(G[0, 19]) * max_scale
+        scale_factor6 = self.tanh01(G[0, 20]) * max_scale
+
+        scale_factor7 = self.tanh01(G[0, 21]) * max_scale
+        scale_factor8 = self.tanh01(G[0, 22]) * max_scale
+        scale_factor9 = self.tanh01(G[0, 23]) * max_scale
+
+        slope1_angle = torch.atan(slope1)
+        slope2_angle = torch.atan(slope2)
+        slope3_angle = torch.atan(slope3)
+
+        # Distances between central line and two outer lines
+        d1 = self.tanh01(y_axis_dist1 * torch.cos(slope1_angle))
+        d2 = self.tanh01(y_axis_dist4 * torch.cos(slope1_angle))
+        d3 = self.tanh01(y_axis_dist2 * torch.cos(slope2_angle))
+        d4 = self.tanh01(y_axis_dist5 * torch.cos(slope2_angle))
+        d5 = self.tanh01(y_axis_dist3 * torch.cos(slope3_angle))
+        d6 = self.tanh01(y_axis_dist6 * torch.cos(slope3_angle))
+
+        top_line1 = self.tanh01(y_axis - (slope1 * x_axis + y_axis_dist1 + d1))
+        top_line2 = self.tanh01(y_axis - (slope2 * x_axis + y_axis_dist2 + d3))
+        top_line3 = self.tanh01(y_axis - (slope3 * x_axis + y_axis_dist3 + d5))
+        '''
+        The following are the scale factors for each of the 9 graduated filters
+        '''
+        mask_scale1 = self.get_inverted_mask(scale_factor1,
+                                             above_or_below_line1, d1, d2,
+                                             max_scale, top_line1)
+        mask_scale2 = self.get_inverted_mask(scale_factor2,
+                                             above_or_below_line1, d1, d2,
+                                             max_scale, top_line1)
+        mask_scale3 = self.get_inverted_mask(scale_factor3,
+                                             above_or_below_line1, d1, d2,
+                                             max_scale, top_line1)
+
+        mask_scale_1 = torch.cat((mask_scale1, mask_scale2, mask_scale3),
+                                 dim=0)
+        mask_scale_1 = torch.clamp(mask_scale_1.unsqueeze(0), 0, max_scale)
+
+        mask_scale4 = self.get_inverted_mask(scale_factor4,
+                                             above_or_below_line2, d3, d4,
+                                             max_scale, top_line2)
+        mask_scale5 = self.get_inverted_mask(scale_factor5,
+                                             above_or_below_line2, d3, d4,
+                                             max_scale, top_line2)
+        mask_scale6 = self.get_inverted_mask(scale_factor6,
+                                             above_or_below_line2, d3, d4,
+                                             max_scale, top_line2)
+
+        mask_scale_4 = torch.cat((mask_scale4, mask_scale5, mask_scale6),
+                                 dim=0)
+        mask_scale_4 = torch.clamp(mask_scale_4.unsqueeze(0), 0, max_scale)
+
+        mask_scale7 = self.get_inverted_mask(scale_factor7,
+                                             above_or_below_line3, d5, d6,
+                                             max_scale, top_line3)
+        mask_scale8 = self.get_inverted_mask(scale_factor8,
+                                             above_or_below_line3, d5, d6,
+                                             max_scale, top_line3)
+        mask_scale9 = self.get_inverted_mask(scale_factor9,
+                                             above_or_below_line3, d5, d6,
+                                             max_scale, top_line3)
+
+        mask_scale_7 = torch.cat((mask_scale7, mask_scale8, mask_scale9),
+                                 dim=0)
+        mask_scale_7 = torch.clamp(mask_scale_7.unsqueeze(0), 0, max_scale)
+
+        mask_scale = torch.clamp(mask_scale_1 * mask_scale_4 * mask_scale_7, 0,
+                                 max_scale)
+
+        return mask_scale
+
+
+class EllipticalFilter(nn.Module):
+
+    def __init__(self, num_in_channels=64, num_out_channels=64):
+        super(EllipticalFilter, self).__init__()
+
+        self.elliptical_layer1 = ConvBlock(num_in_channels, num_out_channels)
+        self.elliptical_layer2 = MaxPoolBlock()
+        self.elliptical_layer3 = ConvBlock(num_out_channels, num_out_channels)
+        self.elliptical_layer4 = MaxPoolBlock()
+        self.elliptical_layer5 = ConvBlock(num_out_channels, num_out_channels)
+        self.elliptical_layer6 = MaxPoolBlock()
+        self.elliptical_layer7 = ConvBlock(num_out_channels, num_out_channels)
+        self.elliptical_layer8 = GlobalPoolingBlock(2)
+        self.fc_elliptical = torch.nn.Linear(num_out_channels,
+                                             24)  # elliptical
+        self.upsample = torch.nn.Upsample(
+            size=(300, 300), mode='bilinear', align_corners=False)
+        self.dropout = nn.Dropout(0.5)
+
+    def tanh01(self, x):
+        tanh = nn.Tanh()
+        return 0.5 * (tanh(x) + 1)
+
+    def where(self, cond, x_1, x_2):
+        cond = cond.float()
+        return (cond * x_1) + ((1 - cond) * x_2)
+
+    def get_mask(self,
+                 x_axis,
+                 y_axis,
+                 shift_x=0,
+                 shift_y=0,
+                 semi_axis_x=1,
+                 semi_axis_y=1,
+                 alpha=0,
+                 scale_factor=2,
+                 max_scale=2,
+                 eps=1e-7,
+                 radius=1):
+        # Check whether a point is inside our outside of the ellipse and set the scaling factor accordingly
+        ellipse_equation_part1 = \
+            (((x_axis - shift_x) * torch.cos(alpha) + (y_axis - shift_y) * torch.sin(alpha))**2)
+        ellipse_equation_part1 /= ((semi_axis_x)**2)
+        ellipse_equation_part2 = \
+            (((x_axis - shift_x) * torch.sin(alpha) - (y_axis - shift_y) * torch.cos(alpha))**2)
+        ellipse_equation_part2 /= ((semi_axis_y)**2)
+
+        # Set the scaling factors to decay with radius inside the ellipse
+        tmp = torch.sqrt((x_axis - shift_x)**2 + (y_axis - shift_y)**2 + eps)
+        tmp *= (1 - scale_factor)
+        tmp = tmp / radius + scale_factor
+        mask_scale = self.where(
+            ellipse_equation_part1 + ellipse_equation_part2 < 1, tmp, 1)
+
+        mask_scale = torch.clamp(mask_scale.unsqueeze(0), 0, max_scale)
+
+        return mask_scale
+
+    def get_elliptical_mask(self, feat, img):
+        # The two eps parameters are used to avoid numerical issues in the learning
+        eps2 = 1e-7
+        eps1 = 1e-10
+
+        # max_scale is the maximum an ellipse can scale the image R,G,B values by
+        max_scale = 2
+
+        feat_elliptical = torch.cat((feat, img), 1)
+        feat_elliptical = self.upsample(feat_elliptical)
+
+        # The following layers calculate the parameters of the ellipses that we use for image enhancement
+        x = self.elliptical_layer1(feat_elliptical)
+        x = self.elliptical_layer2(x)
+        x = self.elliptical_layer3(x)
+        x = self.elliptical_layer4(x)
+        x = self.elliptical_layer5(x)
+        x = self.elliptical_layer6(x)
+        x = self.elliptical_layer7(x)
+        x = self.elliptical_layer8(x)
+        x = x.view(x.size()[0], -1)
+        x = self.dropout(x)
+        G = self.fc_elliptical(x)
+
+        # The next code implements a rotated ellipse according to:
+        # https://math.stackexchange.com/questions/426150/what-is-the-general-equation-of-the-ellipse-that-is-not-in-the-origin-and-rotate
+
+        # Normalised coordinates for x and y-axes, we instantiate the ellipses in these coordinates
+        x_axis = Variable(
+            torch.arange(img.shape[2]).view(-1, 1).repeat(
+                1, img.shape[3]).cuda()) / img.shape[2]
+        y_axis = Variable(
+            torch.arange(img.shape[3]).repeat(img.shape[2],
+                                              1).cuda()) / img.shape[3]
+
+        # Centre of ellipse, x-coordinate
+        x_coord1 = self.tanh01(G[0, 0]) + eps1
+        x_coord2 = self.tanh01(G[0, 1]) + eps1
+        x_coord3 = self.tanh01(G[0, 2]) + eps1
+
+        # Centre of ellipse, y-coordinate
+        y_coord1 = self.tanh01(G[0, 3]) + eps1
+        y_coord2 = self.tanh01(G[0, 4]) + eps1
+        y_coord3 = self.tanh01(G[0, 5]) + eps1
+
+        # a value of ellipse
+        a1 = self.tanh01(G[0, 6]) + eps1
+        a2 = self.tanh01(G[0, 7]) + eps1
+        a3 = self.tanh01(G[0, 8]) + eps1
+
+        # b value
+        b1 = self.tanh01(G[0, 9]) + eps1
+        b2 = self.tanh01(G[0, 10]) + eps1
+        b3 = self.tanh01(G[0, 11]) + eps1
+
+        # A value is angle to the x-axis
+        A1 = self.tanh01(G[0, 12]) * math.pi + eps1
+        A2 = self.tanh01(G[0, 13]) * math.pi + eps1
+        A3 = self.tanh01(G[0, 14]) * math.pi + eps1
+        '''
+        The following are the scale factors for each of the 9 ellipses
+        '''
+        scale1 = self.tanh01(G[0, 15]) * max_scale + eps1
+        scale2 = self.tanh01(G[0, 16]) * max_scale + eps1
+        scale3 = self.tanh01(G[0, 17]) * max_scale + eps1
+
+        scale4 = self.tanh01(G[0, 18]) * max_scale + eps1
+        scale5 = self.tanh01(G[0, 19]) * max_scale + eps1
+        scale6 = self.tanh01(G[0, 20]) * max_scale + eps1
+
+        scale7 = self.tanh01(G[0, 21]) * max_scale + eps1
+        scale8 = self.tanh01(G[0, 22]) * max_scale + eps1
+        scale9 = self.tanh01(G[0, 23]) * max_scale + eps1
+
+        # Angle of orientation of the ellipses with respect to the y semi-axis
+        tmp = torch.sqrt((x_axis - x_coord1)**2 + (y_axis - y_coord1)**2
+                         + eps1)
+        angle_1 = torch.acos(
+            torch.clamp((y_axis - y_coord1) / tmp, -1 + eps2, 1 - eps2)) - A1
+
+        tmp = torch.sqrt((x_axis - x_coord2)**2 + (y_axis - y_coord2)**2
+                         + eps1)
+        angle_2 = torch.acos(
+            torch.clamp((y_axis - y_coord2) / tmp, -1 + eps2, 1 - eps2)) - A2
+
+        tmp = torch.sqrt((x_axis - x_coord3)**2 + (y_axis - y_coord3)**2
+                         + eps1)
+        angle_3 = torch.acos(
+            torch.clamp((y_axis - y_coord3) / tmp, -1 + eps2, 1 - eps2)) - A3
+
+        # Radius of the ellipses
+        # https://math.stackexchange.com/questions/432902/how-to-get-the-radius-of-an-ellipse-at-a-specific-angle-by-knowing-its-semi-majo
+        radius_1 = (a1 * b1) / torch.sqrt((a1**2) * (torch.sin(angle_1)**2)
+                                          + (b1**2) * (torch.cos(angle_1)**2)
+                                          + eps1)
+
+        radius_2 = (a2 * b2) / torch.sqrt((a2**2) * (torch.sin(angle_2)**2)
+                                          + (b2**2) * (torch.cos(angle_2)**2)
+                                          + eps1)
+
+        radius_3 = (a3 * b3) / torch.sqrt((a3**2) * (torch.sin(angle_3)**2)
+                                          + (b3**2) * (torch.cos(angle_3)**2)
+                                          + eps1)
+
+        # Scaling factors for the R,G,B channels, here we learn three ellipses
+        mask_scale1 = self.get_mask(
+            x_axis,
+            y_axis,
+            shift_x=x_coord1,
+            shift_y=y_coord1,
+            semi_axis_x=a1,
+            semi_axis_y=b1,
+            alpha=angle_1,
+            scale_factor=scale1,
+            radius=radius_1)
+
+        mask_scale2 = self.get_mask(
+            x_axis,
+            y_axis,
+            shift_x=x_coord1,
+            shift_y=y_coord1,
+            semi_axis_x=a1,
+            semi_axis_y=b1,
+            alpha=angle_1,
+            scale_factor=scale2,
+            radius=radius_1)
+
+        mask_scale3 = self.get_mask(
+            x_axis,
+            y_axis,
+            shift_x=x_coord1,
+            shift_y=y_coord1,
+            semi_axis_x=a1,
+            semi_axis_y=b1,
+            alpha=angle_1,
+            scale_factor=scale3,
+            radius=radius_1)
+
+        mask_scale_1 = torch.cat((mask_scale1, mask_scale2, mask_scale3),
+                                 dim=0)
+        mask_scale_1_rad = torch.clamp(mask_scale_1.unsqueeze(0), 0, max_scale)
+
+        # Scaling factors for the R,G,B channels, here we learn three ellipses
+        mask_scale4 = self.get_mask(
+            x_axis,
+            y_axis,
+            shift_x=x_coord2,
+            shift_y=y_coord2,
+            semi_axis_x=a2,
+            semi_axis_y=b2,
+            alpha=angle_2,
+            scale_factor=scale4,
+            radius=radius_2)
+
+        mask_scale5 = self.get_mask(
+            x_axis,
+            y_axis,
+            shift_x=x_coord2,
+            shift_y=y_coord2,
+            semi_axis_x=a2,
+            semi_axis_y=b2,
+            alpha=angle_2,
+            scale_factor=scale5,
+            radius=radius_2)
+
+        mask_scale6 = self.get_mask(
+            x_axis,
+            y_axis,
+            shift_x=x_coord2,
+            shift_y=y_coord2,
+            semi_axis_x=a2,
+            semi_axis_y=b3,
+            alpha=angle_2,
+            scale_factor=scale6,
+            radius=radius_2)
+
+        mask_scale_4 = torch.cat((mask_scale4, mask_scale5, mask_scale6),
+                                 dim=0)
+        mask_scale_4_rad = torch.clamp(mask_scale_4.unsqueeze(0), 0, max_scale)
+
+        # Scaling factors for the R,G,B channels, here we learn three ellipses
+        mask_scale7 = self.get_mask(
+            x_axis,
+            y_axis,
+            shift_x=x_coord3,
+            shift_y=y_coord3,
+            semi_axis_x=a3,
+            semi_axis_y=b3,
+            alpha=angle_3,
+            scale_factor=scale7,
+            radius=radius_3)
+
+        mask_scale8 = self.get_mask(
+            x_axis,
+            y_axis,
+            shift_x=x_coord3,
+            shift_y=y_coord3,
+            semi_axis_x=a3,
+            semi_axis_y=b3,
+            alpha=angle_3,
+            scale_factor=scale8,
+            radius=radius_3)
+
+        mask_scale9 = self.get_mask(
+            x_axis,
+            y_axis,
+            shift_x=x_coord3,
+            shift_y=y_coord3,
+            semi_axis_x=a3,
+            semi_axis_y=b3,
+            alpha=angle_3,
+            scale_factor=scale9,
+            radius=radius_3)
+
+        mask_scale_7 = torch.cat((mask_scale7, mask_scale8, mask_scale9),
+                                 dim=0)
+        mask_scale_7_rad = torch.clamp(mask_scale_7.unsqueeze(0), 0, max_scale)
+
+        # Mix the ellipses together by multiplication
+        mask_scale_elliptical = torch.clamp(
+            mask_scale_1_rad * mask_scale_4_rad * mask_scale_7_rad, 0,
+            max_scale)
+
+        return mask_scale_elliptical
+
+
+class Block(nn.Module):
+
+    def __init__(self):
+        super(Block, self).__init__()
+
+    def conv3x3(self, in_channels, out_channels, stride=1):
+        return nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            bias=True)
+
+
+class ConvBlock(Block, nn.Module):
+
+    def __init__(self, num_in_channels, num_out_channels, stride=1):
+        super(Block, self).__init__()
+        self.conv = self.conv3x3(num_in_channels, num_out_channels, stride=2)
+        self.lrelu = nn.LeakyReLU()
+
+    def forward(self, x):
+        img_out = self.lrelu(self.conv(x))
+        return img_out
+
+
+class MaxPoolBlock(Block, nn.Module):
+
+    def __init__(self):
+        super(Block, self).__init__()
+
+        self.max_pool = nn.MaxPool2d(kernel_size=2, stride=2)
+
+    def forward(self, x):
+        img_out = self.max_pool(x)
+        return img_out
+
+
+class GlobalPoolingBlock(Block, nn.Module):
+
+    def __init__(self, receptive_field):
+        super(Block, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+
+    def forward(self, x):
+        out = self.avg_pool(x)
+        return out
+
+
+class DeepLPFParameterPrediction(nn.Module):
+
+    def __init__(self, num_in_channels=64, num_out_channels=64, batch_size=1):
+        super(DeepLPFParameterPrediction, self).__init__()
+        self.num_in_channels = num_in_channels
+        self.num_out_channels = num_out_channels
+        self.cubic_filter = CubicFilter()
+        self.graduated_filter = GraduatedFilter()
+        self.elliptical_filter = EllipticalFilter()
+
+    def forward(self, x):
+        x.contiguous()  # remove memory holes
+        x.cuda()
+
+        feat = x[:, 3:64, :, :]
+        img = x[:, 0:3, :, :]
+
+        torch.cuda.empty_cache()
+
+        img_cubic = self.cubic_filter.get_cubic_mask(feat, img)
+
+        mask_scale_graduated = self.graduated_filter.get_graduated_mask(
+            feat, img_cubic)
+        mask_scale_elliptical = self.elliptical_filter.get_elliptical_mask(
+            feat, img_cubic)
+
+        mask_scale_fuse = torch.clamp(
+            mask_scale_graduated + mask_scale_elliptical, 0, 2)
+
+        img_fuse = torch.clamp(img_cubic * mask_scale_fuse, 0, 1)
+
+        img = torch.clamp(img_fuse + img, 0, 1)
+
+        return img
+
+
+class UNet(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+
+        self.conv1 = nn.Conv2d(16, 64, 1)
+        self.conv2 = nn.Conv2d(32, 64, 1)
+        self.conv3 = nn.Conv2d(64, 64, 1)
+
+        self.local_net = LocalNet(16)
+
+        self.dconv_down1 = LocalNet(3, 16)
+        self.dconv_down2 = LocalNet(16, 32)
+        self.dconv_down3 = LocalNet(32, 64)
+        self.dconv_down4 = LocalNet(64, 128)
+        self.dconv_down5 = LocalNet(128, 128)
+
+        self.maxpool = nn.MaxPool2d(2, padding=0)
+
+        self.upsample = nn.UpsamplingNearest2d(scale_factor=2)
+        self.up_conv1x1_1 = nn.Conv2d(128, 128, 1)
+        self.up_conv1x1_2 = nn.Conv2d(128, 128, 1)
+        self.up_conv1x1_3 = nn.Conv2d(64, 64, 1)
+        self.up_conv1x1_4 = nn.Conv2d(32, 32, 1)
+
+        self.dconv_up4 = LocalNet(256, 128)
+        self.dconv_up3 = LocalNet(192, 64)
+        self.dconv_up2 = LocalNet(96, 32)
+        self.dconv_up1 = LocalNet(48, 16)
+
+        self.conv_last = LocalNet(16, 3)
+
+    def forward(self, x):
+        x_in_tile = x.clone()
+
+        conv1 = self.dconv_down1(x)
+        x = self.maxpool(conv1)
+
+        conv2 = self.dconv_down2(x)
+        x = self.maxpool(conv2)
+
+        conv3 = self.dconv_down3(x)
+        x = self.maxpool(conv3)
+
+        conv4 = self.dconv_down4(x)
+        x = self.maxpool(conv4)
+
+        x = self.dconv_down5(x)
+
+        x = self.up_conv1x1_1(self.upsample(x))
+
+        if x.shape[3] != conv4.shape[3] and x.shape[2] != conv4.shape[2]:
+            x = torch.nn.functional.pad(x, (1, 0, 0, 1))
+        elif x.shape[2] != conv4.shape[2]:
+            x = torch.nn.functional.pad(x, (0, 0, 0, 1))
+        elif x.shape[3] != conv4.shape[3]:
+            x = torch.nn.functional.pad(x, (1, 0, 0, 0))
+
+        x = torch.cat([x, conv4], dim=1)
+
+        x = self.dconv_up4(x)
+        x = self.up_conv1x1_2(self.upsample(x))
+
+        if x.shape[3] != conv3.shape[3] and x.shape[2] != conv3.shape[2]:
+            x = torch.nn.functional.pad(x, (1, 0, 0, 1))
+        elif x.shape[2] != conv3.shape[2]:
+            x = torch.nn.functional.pad(x, (0, 0, 0, 1))
+        elif x.shape[3] != conv3.shape[3]:
+            x = torch.nn.functional.pad(x, (1, 0, 0, 0))
+
+        x = torch.cat([x, conv3], dim=1)
+
+        x = self.dconv_up3(x)
+        x = self.up_conv1x1_3(self.upsample(x))
+
+        del conv3
+
+        if x.shape[3] != conv2.shape[3] and x.shape[2] != conv2.shape[2]:
+            x = torch.nn.functional.pad(x, (1, 0, 0, 1))
+        elif x.shape[2] != conv2.shape[2]:
+            x = torch.nn.functional.pad(x, (0, 0, 0, 1))
+        elif x.shape[3] != conv2.shape[3]:
+            x = torch.nn.functional.pad(x, (1, 0, 0, 0))
+
+        x = torch.cat([x, conv2], dim=1)
+
+        x = self.dconv_up2(x)
+        x = self.up_conv1x1_4(self.upsample(x))
+
+        del conv2
+
+        if x.shape[3] != conv1.shape[3] and x.shape[2] != conv1.shape[2]:
+            x = torch.nn.functional.pad(x, (1, 0, 0, 1))
+        elif x.shape[2] != conv1.shape[2]:
+            x = torch.nn.functional.pad(x, (0, 0, 0, 1))
+        elif x.shape[3] != conv1.shape[3]:
+            x = torch.nn.functional.pad(x, (1, 0, 0, 0))
+
+        x = torch.cat([x, conv1], dim=1)
+        del conv1
+
+        x = self.dconv_up1(x)
+
+        out = self.conv_last(x)
+        out = out + x_in_tile
+
+        return out
+
+
+class LocalNet(nn.Module):
+
+    def forward(self, x_in):
+        x = self.lrelu(self.conv1(self.refpad(x_in)))
+        x = self.lrelu(self.conv2(self.refpad(x)))
+
+        return x
+
+    def __init__(self, in_channels=16, out_channels=64):
+        super(LocalNet, self).__init__()
+        self.conv1 = nn.Conv2d(in_channels, out_channels, 3, 1, 0, 1)
+        self.conv2 = nn.Conv2d(out_channels, out_channels, 3, 1, 0, 1)
+        self.lrelu = nn.LeakyReLU()
+        self.refpad = nn.ReflectionPad2d(1)
+
+
+# Model definition
+class UNetModel(nn.Module):
+
+    def __init__(self):
+        super(UNetModel, self).__init__()
+
+        self.unet = UNet()
+        self.final_conv = nn.Conv2d(3, 64, 3, 1, 0, 1)
+        self.refpad = nn.ReflectionPad2d(1)
+
+    def forward(self, img):
+        output_img = self.unet(img)
+        return self.final_conv(self.refpad(output_img))
+
+
+class DeepLPFNet(nn.Module):
+
+    def __init__(self):
+        super(DeepLPFNet, self).__init__()
+        self.backbonenet = UNetModel()
+        self.deeplpfnet = DeepLPFParameterPrediction()
+
+    def forward(self, img):
+        feat = self.backbonenet(img)
+        img = self.deeplpfnet(feat)
+        img = torch.clamp(img, 0.0, 1.0)
+
+        return img
diff --git a/modelscope/models/cv/image_debanding/__init__.py b/modelscope/models/cv/image_debanding/__init__.py
new file mode 100644
index 00000000..850a03ec
--- /dev/null
+++ b/modelscope/models/cv/image_debanding/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .rrdb import RRDBImageDebanding
+
+else:
+    _import_structure = {
+        'rrdb': ['RRDBImageDebanding'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/image_debanding/rrdb/__init__.py b/modelscope/models/cv/image_debanding/rrdb/__init__.py
new file mode 100644
index 00000000..a063db5a
--- /dev/null
+++ b/modelscope/models/cv/image_debanding/rrdb/__init__.py
@@ -0,0 +1 @@
+from .rrdb_image_debanding import RRDBImageDebanding
diff --git a/modelscope/models/cv/image_debanding/rrdb/rrdb_image_debanding.py b/modelscope/models/cv/image_debanding/rrdb/rrdb_image_debanding.py
new file mode 100644
index 00000000..c21965f4
--- /dev/null
+++ b/modelscope/models/cv/image_debanding/rrdb/rrdb_image_debanding.py
@@ -0,0 +1,91 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+'''RRDB debanding network
+This model use rrdbnet to achieve image debanding task.
+Training data is obtained from:
+https://github.com/akshay-kap/Meng-699-Image-Banding-detection
+'''
+import os.path as osp
+from typing import Dict, Union
+
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.super_resolution import RRDBNet
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['RRDBImageDebanding']
+
+
+@MODELS.register_module(Tasks.image_debanding, module_name=Models.rrdb)
+class RRDBImageDebanding(TorchModel):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the image color enhance model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+
+        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+
+        self.num_feat = 64
+        self.num_block = 23
+        self.scale = 1
+        self.model = RRDBNet(
+            num_in_ch=3,
+            num_out_ch=3,
+            num_feat=self.num_feat,
+            num_block=self.num_block,
+            num_grow_ch=32,
+            scale=self.scale)
+        if torch.cuda.is_available():
+            self._device = torch.device('cuda')
+        else:
+            self._device = torch.device('cpu')
+        self.model = self.model.to(self._device)
+
+        self.model = self._load_pretrained(self.model, model_path)
+
+        if self.training:
+            self.model.train()
+        else:
+            self.model.eval()
+
+    def _evaluate_postprocess(self, src: Tensor,
+                              target: Tensor) -> Dict[str, list]:
+        preds = self.model(src)
+        preds = list(torch.split(preds, 1, 0))
+        targets = list(torch.split(target, 1, 0))
+
+        preds = [(pred.data * 255.).squeeze(0).type(torch.uint8).permute(
+            1, 2, 0).cpu().numpy() for pred in preds]
+        targets = [(target.data * 255.).squeeze(0).type(torch.uint8).permute(
+            1, 2, 0).cpu().numpy() for target in targets]
+
+        return {'pred': preds, 'target': targets}
+
+    def _inference_forward(self, src: Tensor) -> Dict[str, Tensor]:
+        return {'outputs': self.model(src).clamp(0, 1)}
+
+    def forward(self, input: Dict[str,
+                                  Tensor]) -> Dict[str, Union[list, Tensor]]:
+        """return the result by the model
+
+        Args:
+            input (Dict[str, Tensor]): the preprocessed data
+
+        Returns:
+            Dict[str, Union[list, Tensor]]: results
+        """
+        for key, value in input.items():
+            input[key] = input[key].to(self._device)
+        if 'target' in input:
+            return self._evaluate_postprocess(**input)
+        else:
+            return self._inference_forward(**input)
diff --git a/modelscope/models/cv/image_defrcn_fewshot/defrcn_for_fewshot.py b/modelscope/models/cv/image_defrcn_fewshot/defrcn_for_fewshot.py
index d42e59b2..77b71888 100644
--- a/modelscope/models/cv/image_defrcn_fewshot/defrcn_for_fewshot.py
+++ b/modelscope/models/cv/image_defrcn_fewshot/defrcn_for_fewshot.py
@@ -7,11 +7,14 @@ import torch
 from modelscope.metainfo import Models
 from modelscope.models.base.base_torch_model import TorchModel
 from modelscope.models.builder import MODELS
+from modelscope.msdatasets import MsDataset
 from modelscope.utils.config import Config
-from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.constant import DownloadMode, ModelFile, Tasks
 from modelscope.utils.logger import get_logger
-from .models.defaults_config import _C
+from .models.calibration_layer import PrototypicalCalibrationBlock
 from .models.defrcn import DeFRCN
+from .utils.configuration_mapper import CfgMapper
+from .utils.register_data import register_data
 from .utils.requirements_check import requires_version
 
 logger = get_logger()
@@ -26,37 +29,64 @@ class DeFRCNForFewShot(TorchModel):
         Detail configs can be visited on detectron2.config.defaults and .models.defaults_config.
     """
 
-    def __init__(self, model_dir: str, *args, **kwargs):
+    def __init__(self,
+                 model_dir: str,
+                 _cfg_dict: Config = None,
+                 *args,
+                 **kwargs):
         """initialize the few-shot defrcn model from the `model_dir` path.
 
         Args:
             model_dir (str): the model path.
-
+            _cfg_dict (Config): An optional model config. If provided, it will replace
+                the config read out of the `model_name_or_path`
         """
         requires_version()
 
         super().__init__(model_dir, *args, **kwargs)
 
-        self.model_dir = model_dir
-        self.config = Config.from_file(
-            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
+        if _cfg_dict is None:
+            self.config = Config.from_file(
+                os.path.join(model_dir, ModelFile.CONFIGURATION))
+        else:
+            self.config = _cfg_dict
 
-        if 'config_path' in kwargs:
-            self.config.merge_from_dict(
-                {'model.config_path': kwargs['config_path']})
+        self.model_cfg = CfgMapper(self.config).__call__()
 
-        self.model_cfg = _C.clone()
-        self.model_cfg.merge_from_file(
-            os.path.join(model_dir, self.config.model.config_path))
+        data_dir = self.config.safe_get('datasets.root', None)
+        data_type = self.config.safe_get('datasets.type', 'pascal_voc')
 
-        if 'model_weights' in kwargs:
-            self.model_cfg.merge_from_list(
-                ['MODEL.WEIGHTS', kwargs['model_weights']])
-
-        self.model_cfg.freeze()
+        if self.training or self.model_cfg.TEST.PCB_ENABLE:
+            if data_dir is None:  # use default datasets
+                dataset_name = 'VOC_fewshot' if data_type == 'pascal_voc' else 'coco2014_fewshot'
+                logger.warning('data_dir is none, use default {} data.'.format(
+                    dataset_name))
+                data_voc = MsDataset.load(
+                    dataset_name=dataset_name,
+                    namespace='shimin2023',
+                    split='train',
+                    download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)
+                data_dir = os.path.join(
+                    data_voc.config_kwargs['split_config']['train'], 'data')
+                logger.info('{} datasets download dir is {}'.format(
+                    dataset_name, data_dir))
+            register_data(data_type, data_dir)
 
         self.model = DeFRCN(self.model_cfg)
 
+        if self.model_cfg.TEST.PCB_ENABLE:
+            if not os.path.exists(self.model_cfg.TEST.PCB_MODELPATH):
+                logger.warning('{} no model.'.format(
+                    self.model_cfg.TEST.PCB_MODELPATH))
+                self.model_cfg.TEST.PCB_MODELPATH = os.path.join(
+                    model_dir,
+                    'ImageNetPretrained/torchvision/resnet101-5d3b4d8f.pth')
+                logger.info('PCB use default model {}'.format(
+                    self.model_cfg.TEST.PCB_MODELPATH))
+            self.pcb = PrototypicalCalibrationBlock(self.model_cfg)
+
+        self.model_cfg.freeze()
+
     def forward(self, inputs) -> Any:
         """return the result by the model
 
@@ -74,6 +104,8 @@ class DeFRCNForFewShot(TorchModel):
     def inference(self, input: Dict[str, Any]) -> Any:
         with torch.no_grad():
             results = self.model([input])
+            if self.model_cfg.TEST.PCB_ENABLE:
+                results = self.pcb.execute_calibration([input], results)
         return results[0] if len(results) > 0 else None
 
     def get_model_cfg(self):
diff --git a/modelscope/models/cv/image_defrcn_fewshot/evaluation/__init__.py b/modelscope/models/cv/image_defrcn_fewshot/evaluation/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/image_defrcn_fewshot/evaluation/coco_evaluation.py b/modelscope/models/cv/image_defrcn_fewshot/evaluation/coco_evaluation.py
new file mode 100644
index 00000000..9dd08cc2
--- /dev/null
+++ b/modelscope/models/cv/image_defrcn_fewshot/evaluation/coco_evaluation.py
@@ -0,0 +1,368 @@
+# The implementation is adopted from er-muyue/DeFRCN
+# made publicly available under the MIT License at
+# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/evaluation/coco_evaluation.py
+
+import contextlib
+import copy
+import io
+import itertools
+import logging
+import os
+from collections import OrderedDict
+
+import json
+import numpy as np
+import torch
+from detectron2.data import MetadataCatalog
+from detectron2.data.datasets.coco import convert_to_coco_json
+from detectron2.evaluation import DatasetEvaluator
+from detectron2.structures import BoxMode
+from detectron2.utils import comm as comm
+from detectron2.utils.logger import create_small_table
+from fvcore.common.file_io import PathManager
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from tabulate import tabulate
+
+
+class COCOEvaluator(DatasetEvaluator):
+
+    def __init__(self, dataset_name, distributed, output_dir=None):
+
+        self._distributed = distributed
+        self._output_dir = output_dir
+        self._dataset_name = dataset_name
+        self._cpu_device = torch.device('cpu')
+        self._logger = logging.getLogger(__name__)
+
+        self._metadata = MetadataCatalog.get(dataset_name)
+        if not hasattr(self._metadata, 'json_file'):
+            self._logger.warning(
+                f"json_file was not found in MetaDataCatalog for '{dataset_name}'"
+            )
+            cache_path = convert_to_coco_json(dataset_name, output_dir)
+            self._metadata.json_file = cache_path
+        self._is_splits = 'all' in dataset_name or 'base' in dataset_name \
+            or 'novel' in dataset_name
+        self._base_classes = [
+            8,
+            10,
+            11,
+            13,
+            14,
+            15,
+            22,
+            23,
+            24,
+            25,
+            27,
+            28,
+            31,
+            32,
+            33,
+            34,
+            35,
+            36,
+            37,
+            38,
+            39,
+            40,
+            41,
+            42,
+            43,
+            46,
+            47,
+            48,
+            49,
+            50,
+            51,
+            52,
+            53,
+            54,
+            55,
+            56,
+            57,
+            58,
+            59,
+            60,
+            61,
+            65,
+            70,
+            73,
+            74,
+            75,
+            76,
+            77,
+            78,
+            79,
+            80,
+            81,
+            82,
+            84,
+            85,
+            86,
+            87,
+            88,
+            89,
+            90,
+        ]
+        self._novel_classes = [
+            1, 2, 3, 4, 5, 6, 7, 9, 16, 17, 18, 19, 20, 21, 44, 62, 63, 64, 67,
+            72
+        ]
+
+        json_file = PathManager.get_local_path(self._metadata.json_file)
+        with contextlib.redirect_stdout(io.StringIO()):
+            self._coco_api = COCO(json_file)
+        self._do_evaluation = 'annotations' in self._coco_api.dataset
+
+    def reset(self):
+        self._predictions = []
+        self._coco_results = []
+
+    def process(self, inputs, outputs):
+        """
+        Args:
+            inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
+                It is a list of dict. Each dict corresponds to an image and
+                contains keys like "height", "width", "file_name", "image_id".
+            outputs: the outputs of a COCO model. It is a list of dicts with key
+                "instances" that contains :class:`Instances`.
+        """
+        for input, output in zip(inputs, outputs):
+            prediction = {'image_id': input['image_id']}
+            # TODO this is ugly
+            if 'instances' in output:
+                instances = output['instances'].to(self._cpu_device)
+                prediction['instances'] = instances_to_coco_json(
+                    instances, input['image_id'])
+            self._predictions.append(prediction)
+
+    def evaluate(self):
+        if self._distributed:
+            comm.synchronize()
+            self._predictions = comm.gather(self._predictions, dst=0)
+            self._predictions = list(itertools.chain(*self._predictions))
+            if not comm.is_main_process():
+                return {}
+
+        if len(self._predictions) == 0:
+            self._logger.warning(
+                '[COCOEvaluator] Did not receive valid predictions.')
+            return {}
+
+        if self._output_dir:
+            PathManager.mkdirs(self._output_dir)
+            file_path = os.path.join(self._output_dir,
+                                     'instances_predictions.pth')
+            with PathManager.open(file_path, 'wb') as f:
+                torch.save(self._predictions, f)
+
+        self._results = OrderedDict()
+        if 'instances' in self._predictions[0]:
+            self._eval_predictions()
+        # Copy so the caller can do whatever with results
+        return copy.deepcopy(self._results)
+
+    def _eval_predictions(self):
+        """
+        Evaluate self._predictions on the instance detection task.
+        Fill self._results with the metrics of the instance detection task.
+        """
+        self._logger.info('Preparing results for COCO format ...')
+        self._coco_results = list(
+            itertools.chain(*[x['instances'] for x in self._predictions]))
+
+        # unmap the category ids for COCO
+        if hasattr(self._metadata, 'thing_dataset_id_to_contiguous_id'):
+            reverse_id_mapping = {
+                v: k
+                for k, v in
+                self._metadata.thing_dataset_id_to_contiguous_id.items()
+            }
+            for result in self._coco_results:
+                result['category_id'] = reverse_id_mapping[
+                    result['category_id']]
+
+        if self._output_dir:
+            file_path = os.path.join(self._output_dir,
+                                     'coco_instances_results.json')
+            self._logger.info('Saving results to {}'.format(file_path))
+            with PathManager.open(file_path, 'w') as f:
+                f.write(json.dumps(self._coco_results))
+                f.flush()
+
+        if not self._do_evaluation:
+            self._logger.info('Annotations are not available for evaluation.')
+            return
+
+        self._logger.info('Evaluating predictions ...')
+        if self._is_splits:
+            self._results['bbox'] = {}
+            for split, classes, names in [
+                ('all', None, self._metadata.get('thing_classes')),
+                ('base', self._base_classes,
+                 self._metadata.get('base_classes')),
+                ('novel', self._novel_classes,
+                 self._metadata.get('novel_classes'))
+            ]:
+                if 'all' not in self._dataset_name and \
+                        split not in self._dataset_name:
+                    continue
+                coco_eval = (
+                    _evaluate_predictions_on_coco(
+                        self._coco_api,
+                        self._coco_results,
+                        'bbox',
+                        classes,
+                    ) if len(self._coco_results) > 0 else
+                    None  # cocoapi does not handle empty results very well
+                )
+                res_ = self._derive_coco_results(
+                    coco_eval, 'bbox', class_names=names)
+                res = {}
+                for metric in res_.keys():
+                    if len(metric) <= 4:
+                        if split == 'all':
+                            res[metric] = res_[metric]
+                        elif split == 'base':
+                            res['b' + metric] = res_[metric]
+                        elif split == 'novel':
+                            res['n' + metric] = res_[metric]
+                self._results['bbox'].update(res)
+
+            # add "AP" if not already in
+            if 'AP' not in self._results['bbox']:
+                if 'nAP' in self._results['bbox']:
+                    self._results['bbox']['AP'] = self._results['bbox']['nAP']
+                else:
+                    self._results['bbox']['AP'] = self._results['bbox']['bAP']
+        else:
+            coco_eval = (
+                _evaluate_predictions_on_coco(
+                    self._coco_api,
+                    self._coco_results,
+                    'bbox',
+                ) if len(self._coco_results) > 0 else
+                None  # cocoapi does not handle empty results very well
+            )
+            res = self._derive_coco_results(
+                coco_eval,
+                'bbox',
+                class_names=self._metadata.get('thing_classes'))
+            self._results['bbox'] = res
+
+    def _derive_coco_results(self, coco_eval, iou_type, class_names=None):
+        """
+        Derive the desired score numbers from summarized COCOeval.
+
+        Args:
+            coco_eval (None or COCOEval): None represents no predictions from model.
+            iou_type (str):
+            class_names (None or list[str]): if provided, will use it to predict
+                per-category AP.
+
+        Returns:
+            a dict of {metric name: score}
+        """
+
+        metrics = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl']
+
+        if coco_eval is None:
+            self._logger.warn(
+                'No predictions from the model! Set scores to -1')
+            return {metric: -1 for metric in metrics}
+
+        # the standard metrics
+        results = {
+            metric: float(coco_eval.stats[idx] * 100)
+            for idx, metric in enumerate(metrics)
+        }
+        self._logger.info('Evaluation results for {}: \n'.format(iou_type)
+                          + create_small_table(results))
+
+        if class_names is None or len(class_names) <= 1:
+            return results
+        # Compute per-category AP
+        precisions = coco_eval.eval['precision']
+        # precision has dims (iou, recall, cls, area range, max dets)
+        assert len(class_names) == precisions.shape[2]
+
+        results_per_category = []
+        for idx, name in enumerate(class_names):
+            # area range index 0: all area ranges
+            # max dets index -1: typically 100 per image
+            precision = precisions[:, :, idx, 0, -1]
+            precision = precision[precision > -1]
+            ap = np.mean(precision) if precision.size else float('nan')
+            results_per_category.append(('{}'.format(name), float(ap * 100)))
+
+        # tabulate it
+        N_COLS = min(6, len(results_per_category) * 2)
+        results_flatten = list(itertools.chain(*results_per_category))
+        results_2d = itertools.zip_longest(
+            *[results_flatten[i::N_COLS] for i in range(N_COLS)])
+        table = tabulate(
+            results_2d,
+            tablefmt='pipe',
+            floatfmt='.3f',
+            headers=['category', 'AP'] * (N_COLS // 2),
+            numalign='left',
+        )
+        self._logger.info('Per-category {} AP: \n'.format(iou_type) + table)
+
+        results.update({'AP-' + name: ap for name, ap in results_per_category})
+        return results
+
+
+def instances_to_coco_json(instances, img_id):
+    """
+    Dump an "Instances" object to a COCO-format json that's used for evaluation.
+
+    Args:
+        instances (Instances):
+        img_id (int): the image id
+
+    Returns:
+        list[dict]: list of json annotations in COCO format.
+    """
+    num_instance = len(instances)
+    if num_instance == 0:
+        return []
+
+    boxes = instances.pred_boxes.tensor.numpy()
+    boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+    boxes = boxes.tolist()
+    scores = instances.scores.tolist()
+    classes = instances.pred_classes.tolist()
+
+    results = []
+    for k in range(num_instance):
+        result = {
+            'image_id': img_id,
+            'category_id': classes[k],
+            'bbox': boxes[k],
+            'score': scores[k],
+        }
+        results.append(result)
+    return results
+
+
+def _evaluate_predictions_on_coco(coco_gt,
+                                  coco_results,
+                                  iou_type,
+                                  catIds=None):
+    """
+    Evaluate the coco results using COCOEval API.
+    """
+    assert len(coco_results) > 0
+
+    coco_dt = coco_gt.loadRes(coco_results)
+    coco_eval = COCOeval(coco_gt, coco_dt, iou_type)
+    if catIds is not None:
+        coco_eval.params.catIds = catIds
+    coco_eval.evaluate()
+    coco_eval.accumulate()
+    coco_eval.summarize()
+
+    return coco_eval
diff --git a/modelscope/models/cv/image_defrcn_fewshot/evaluation/evaluator.py b/modelscope/models/cv/image_defrcn_fewshot/evaluation/evaluator.py
new file mode 100644
index 00000000..13f73acf
--- /dev/null
+++ b/modelscope/models/cv/image_defrcn_fewshot/evaluation/evaluator.py
@@ -0,0 +1,77 @@
+# The implementation is adopted from er-muyue/DeFRCN
+# made publicly available under the MIT License at
+# https://github.com/facebookresearch/detectron2/blob/v0.3/detectron2/evaluation/evaluator.py
+
+import datetime
+import logging
+import time
+
+import torch
+from detectron2.evaluation.evaluator import inference_context
+
+from ..models.calibration_layer import PrototypicalCalibrationBlock
+
+
+def inference_on_dataset(model, data_loader, evaluator, cfg=None):
+
+    num_devices = torch.distributed.get_world_size(
+    ) if torch.distributed.is_initialized() else 1
+    logger = logging.getLogger(__name__)
+
+    pcb = None
+    if cfg.TEST.PCB_ENABLE:
+        logger.info('Start initializing PCB module, please wait a seconds...')
+        pcb = PrototypicalCalibrationBlock(cfg)
+
+    logger.info('Start inference on {} images'.format(len(data_loader)))
+    total = len(data_loader)  # inference data loader must have a fixed length
+    evaluator.reset()
+
+    logging_interval = 50
+    num_warmup = min(5, logging_interval - 1, total - 1)
+    start_time = time.time()
+    total_compute_time = 0
+    with inference_context(model), torch.no_grad():
+        for idx, inputs in enumerate(data_loader):
+            if idx == num_warmup:
+                start_time = time.time()
+                total_compute_time = 0
+
+            start_compute_time = time.time()
+            outputs = model(inputs)
+            if cfg.TEST.PCB_ENABLE:
+                outputs = pcb.execute_calibration(inputs, outputs)
+            torch.cuda.synchronize()
+            total_compute_time += time.time() - start_compute_time
+            evaluator.process(inputs, outputs)
+
+            if (idx + 1) % logging_interval == 0:
+                duration = time.time() - start_time
+                seconds_per_img = duration / (idx + 1 - num_warmup)
+                eta = datetime.timedelta(
+                    seconds=int(seconds_per_img * (total - num_warmup)
+                                - duration))
+                logger.info(
+                    'Inference done {}/{}. {:.4f} s / img. ETA={}'.format(
+                        idx + 1, total, seconds_per_img, str(eta)))
+
+    # Measure the time only for this worker (before the synchronization barrier)
+    total_time = int(time.time() - start_time)
+    total_time_str = str(datetime.timedelta(seconds=total_time))
+    # NOTE this format is parsed by grep
+    logger.info(
+        'Total inference time: {} ({:.6f} s / img per device, on {} devices)'.
+        format(total_time_str, total_time / (total - num_warmup), num_devices))
+    total_compute_time_str = str(
+        datetime.timedelta(seconds=int(total_compute_time)))
+    time_per_device = total_compute_time / (total - num_warmup)
+    logger.info(
+        'Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)'
+        .format(total_compute_time_str, time_per_device, num_devices))
+
+    results = evaluator.evaluate()
+    # An evaluator may return None when not in main process.
+    # Replace it by an empty dict instead to make it easier for downstream code to handle
+    if results is None:
+        results = {}
+    return results
diff --git a/modelscope/models/cv/image_defrcn_fewshot/evaluation/pascal_voc_evaluation.py b/modelscope/models/cv/image_defrcn_fewshot/evaluation/pascal_voc_evaluation.py
new file mode 100644
index 00000000..eff7fecf
--- /dev/null
+++ b/modelscope/models/cv/image_defrcn_fewshot/evaluation/pascal_voc_evaluation.py
@@ -0,0 +1,118 @@
+# The implementation is adopted from er-muyue/DeFRCN
+# made publicly available under the MIT License at
+# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/evaluation/pascal_voc_evaluation.py
+
+import os
+import tempfile
+from collections import OrderedDict, defaultdict
+
+import numpy as np
+from detectron2.data import MetadataCatalog
+from detectron2.evaluation.pascal_voc_evaluation import (
+    PascalVOCDetectionEvaluator, voc_eval)
+from detectron2.utils import comm
+from detectron2.utils.logger import create_small_table
+
+
+class PascalVOCEvaluator(PascalVOCDetectionEvaluator):
+    """
+    Evaluate Pascal VOC AP.
+    """
+
+    def __init__(self, dataset_name):
+        """
+        Args:
+            dataset_name (str): name of the dataset, e.g., "voc_2007_test"
+        """
+        super().__init__(dataset_name)
+        meta = MetadataCatalog.get(dataset_name)
+        self._base_classes = meta.base_classes
+        self._novel_classes = meta.novel_classes
+
+    def evaluate(self):
+        """
+        Returns:
+            dict: has a key "segm", whose value is a dict of "AP", "AP50", and "AP75".
+        """
+        all_predictions = comm.gather(self._predictions, dst=0)
+        if not comm.is_main_process():
+            return
+        predictions = defaultdict(list)
+        for predictions_per_rank in all_predictions:
+            for clsid, lines in predictions_per_rank.items():
+                predictions[clsid].extend(lines)
+        del all_predictions
+
+        self._logger.info(
+            'Evaluating {} using {} metric. '
+            'Note that results do not use the official Matlab API.'.format(
+                self._dataset_name, 2007 if self._is_2007 else 2012))
+
+        with tempfile.TemporaryDirectory(prefix='pascal_voc_eval_') as dirname:
+            res_file_template = os.path.join(dirname, '{}.txt')
+
+            aps = defaultdict(list)  # iou -> ap per class
+            aps_base = defaultdict(list)
+            aps_novel = defaultdict(list)
+            exist_base, exist_novel = False, False
+            for cls_id, cls_name in enumerate(self._class_names):
+                lines = predictions.get(cls_id, [''])
+
+                with open(res_file_template.format(cls_name), 'w') as f:
+                    f.write('\n'.join(lines))
+
+                for thresh in range(50, 100, 5):
+                    rec, prec, ap = voc_eval(
+                        res_file_template,
+                        self._anno_file_template,
+                        self._image_set_path,
+                        cls_name,
+                        ovthresh=thresh / 100.0,
+                        use_07_metric=self._is_2007,
+                    )
+                    aps[thresh].append(ap * 100)
+
+                    if self._base_classes is not None and cls_name in self._base_classes:
+                        aps_base[thresh].append(ap * 100)
+                        exist_base = True
+
+                    if self._novel_classes is not None and cls_name in self._novel_classes:
+                        aps_novel[thresh].append(ap * 100)
+                        exist_novel = True
+
+        ret = OrderedDict()
+        mAP = {iou: np.mean(x) for iou, x in aps.items()}
+        ret['bbox'] = {
+            'AP': np.mean(list(mAP.values())),
+            'AP50': mAP[50],
+            'AP75': mAP[75]
+        }
+
+        # adding evaluation of the base and novel classes
+        if exist_base:
+            mAP_base = {iou: np.mean(x) for iou, x in aps_base.items()}
+            ret['bbox'].update({
+                'bAP': np.mean(list(mAP_base.values())),
+                'bAP50': mAP_base[50],
+                'bAP75': mAP_base[75]
+            })
+
+        if exist_novel:
+            mAP_novel = {iou: np.mean(x) for iou, x in aps_novel.items()}
+            ret['bbox'].update({
+                'nAP': np.mean(list(mAP_novel.values())),
+                'nAP50': mAP_novel[50],
+                'nAP75': mAP_novel[75]
+            })
+
+        # write per class AP to logger
+        per_class_res = {
+            self._class_names[idx]: ap
+            for idx, ap in enumerate(aps[50])
+        }
+
+        self._logger.info('Evaluate per-class mAP50:\n'
+                          + create_small_table(per_class_res))
+        self._logger.info('Evaluate overall bbox:\n'
+                          + create_small_table(ret['bbox']))
+        return ret
diff --git a/modelscope/models/cv/image_defrcn_fewshot/models/calibration_layer.py b/modelscope/models/cv/image_defrcn_fewshot/models/calibration_layer.py
new file mode 100644
index 00000000..c6d7674d
--- /dev/null
+++ b/modelscope/models/cv/image_defrcn_fewshot/models/calibration_layer.py
@@ -0,0 +1,188 @@
+# The implementation is adopted from er-muyue/DeFRCN
+# made publicly available under the MIT License at
+# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/evaluation/calibration_layer.py
+
+import cv2
+import torch
+from detectron2.data import DatasetMapper, build_detection_test_loader
+from detectron2.modeling.poolers import ROIPooler
+from detectron2.structures import ImageList
+from sklearn.metrics.pairwise import cosine_similarity
+
+from modelscope.utils.logger import get_logger
+from .resnet import resnet101
+
+
+class DatasetMapperIns(DatasetMapper):
+
+    def __init__(self, cfg, is_train: bool):
+        super(DatasetMapperIns, self).__init__(cfg, is_train)
+
+    def __call__(self, dataset_dict):
+        is_train = self.is_train
+        self.is_train = True
+        dataset_dict = super(DatasetMapperIns, self).__call__(dataset_dict)
+        self.is_train = is_train
+        return dataset_dict
+
+
+class PrototypicalCalibrationBlock:
+
+    def __init__(self, cfg):
+        super().__init__()
+
+        self.logger = get_logger()
+
+        self.cfg = cfg
+        self.device = torch.device(cfg.MODEL.DEVICE)
+        self.alpha = self.cfg.TEST.PCB_ALPHA
+
+        self.imagenet_model = self.build_model()
+        self.dataloader = build_detection_test_loader(
+            self.cfg,
+            self.cfg.DATASETS.TRAIN[0],
+            mapper=DatasetMapperIns(cfg, False))
+        self.roi_pooler = ROIPooler(
+            output_size=(1, 1),
+            scales=(1 / 32, ),
+            sampling_ratio=(0),
+            pooler_type='ROIAlignV2')
+        self.prototypes = self.build_prototypes()
+
+        self.exclude_cls = self.clsid_filter()
+
+    def build_model(self):
+        self.logger.info('Loading ImageNet Pre-train Model from {}'.format(
+            self.cfg.TEST.PCB_MODELPATH))
+        if self.cfg.TEST.PCB_MODELTYPE == 'resnet':
+            imagenet_model = resnet101()
+        else:
+            raise NotImplementedError
+        state_dict = torch.load(self.cfg.TEST.PCB_MODELPATH)
+        imagenet_model.load_state_dict(state_dict)
+        imagenet_model = imagenet_model.to(self.device)
+        imagenet_model.eval()
+        return imagenet_model
+
+    def build_prototypes(self):
+
+        all_features, all_labels = [], []
+        for index in range(len(self.dataloader.dataset)):
+            inputs = [self.dataloader.dataset[index]]
+            assert len(inputs) == 1
+            # load support images and gt-boxes
+            img = cv2.imread(inputs[0]['file_name'])  # BGR
+            img_h, _ = img.shape[0], img.shape[1]
+            ratio = img_h / inputs[0]['instances'].image_size[0]
+            inputs[0]['instances'].gt_boxes.tensor = inputs[0][
+                'instances'].gt_boxes.tensor * ratio
+            boxes = [x['instances'].gt_boxes.to(self.device) for x in inputs]
+
+            # extract roi features
+            features = self.extract_roi_features(img, boxes)
+            all_features.append(features.cpu().data)
+
+            gt_classes = [x['instances'].gt_classes for x in inputs]
+            all_labels.append(gt_classes[0].cpu().data)
+
+        # concat
+        all_features = torch.cat(all_features, dim=0)
+        all_labels = torch.cat(all_labels, dim=0)
+        assert all_features.shape[0] == all_labels.shape[0]
+
+        # calculate prototype
+        features_dict = {}
+        for i, label in enumerate(all_labels):
+            label = int(label)
+            if label not in features_dict:
+                features_dict[label] = []
+            features_dict[label].append(all_features[i].unsqueeze(0))
+
+        prototypes_dict = {}
+        for label in features_dict:
+            features = torch.cat(features_dict[label], dim=0)
+            prototypes_dict[label] = torch.mean(features, dim=0, keepdim=True)
+
+        return prototypes_dict
+
+    def extract_roi_features(self, img, boxes):
+        """
+        :param img:
+        :param boxes:
+        :return:
+        """
+
+        mean = torch.tensor([0.406, 0.456, 0.485]).reshape(
+            (3, 1, 1)).to(self.device)
+        std = torch.tensor([[0.225, 0.224, 0.229]]).reshape(
+            (3, 1, 1)).to(self.device)
+
+        img = img.transpose((2, 0, 1))
+        img = torch.from_numpy(img).to(self.device)
+        images = [(img / 255. - mean) / std]
+        images = ImageList.from_tensors(images, 0)
+        conv_feature = self.imagenet_model(
+            images.tensor[:, [2, 1, 0]])[1]  # size: BxCxHxW
+
+        box_features = self.roi_pooler([conv_feature],
+                                       boxes).squeeze(2).squeeze(2)
+
+        activation_vectors = self.imagenet_model.fc(box_features)
+
+        return activation_vectors
+
+    def execute_calibration(self, inputs, dts):
+        if 'file_name' in inputs[0]:
+            img = cv2.imread(inputs[0]['file_name'])
+        elif 'image_numpy' in inputs[0]:
+            img = inputs[0]['image_numpy']
+
+        ileft = (dts[0]['instances'].scores > self.cfg.TEST.PCB_UPPER).sum()
+        iright = (dts[0]['instances'].scores > self.cfg.TEST.PCB_LOWER).sum()
+        assert ileft <= iright
+        boxes = [dts[0]['instances'].pred_boxes[ileft:iright]]
+
+        features = self.extract_roi_features(img, boxes)
+
+        for i in range(ileft, iright):
+            tmp_class = int(dts[0]['instances'].pred_classes[i])
+            if tmp_class in self.exclude_cls:
+                continue
+            tmp_cos = cosine_similarity(
+                features[i - ileft].cpu().data.numpy().reshape((1, -1)),
+                self.prototypes[tmp_class].cpu().data.numpy())[0][0]
+            dts[0]['instances'].scores[i] = dts[0]['instances'].scores[
+                i] * self.alpha + tmp_cos * (1 - self.alpha)
+        return dts
+
+    def clsid_filter(self):
+        dsname = self.cfg.DATASETS.TEST[0]
+        exclude_ids = []
+        if 'test_all' in dsname:
+            if 'coco' in dsname:
+                exclude_ids = [
+                    7, 9, 10, 11, 12, 13, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+                    29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44,
+                    45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 59, 61, 63, 64,
+                    65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
+                ]
+            elif 'voc' in dsname:
+                exclude_ids = list(range(0, 15))
+            else:
+                raise NotImplementedError
+        return exclude_ids
+
+
+@torch.no_grad()
+def concat_all_gather(tensor):
+    """
+    Performs all_gather operation on the provided tensors.
+    *** Warning ***: torch.distributed.all_gather has no gradient.
+    """
+    tensors_gather = [
+        torch.ones_like(tensor)
+        for _ in range(torch.distributed.get_world_size())
+    ]
+    torch.distributed.all_gather(tensors_gather, tensor, async_op=False)
+    output = torch.cat(tensors_gather, dim=0)
+    return output
diff --git a/modelscope/models/cv/image_defrcn_fewshot/models/defaults_config.py b/modelscope/models/cv/image_defrcn_fewshot/models/defaults_config.py
deleted file mode 100644
index 55fcc43b..00000000
--- a/modelscope/models/cv/image_defrcn_fewshot/models/defaults_config.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# The implementation is adopted from er-muyue/DeFRCN
-# made publicly available under the MIT License at
-# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/config/defaults.py
-
-from detectron2.config.defaults import _C
-
-_CC = _C
-
-# ----------- Backbone ----------- #
-_CC.MODEL.BACKBONE.FREEZE = False
-_CC.MODEL.BACKBONE.FREEZE_AT = 3
-
-# ------------- RPN -------------- #
-_CC.MODEL.RPN.FREEZE = False
-_CC.MODEL.RPN.ENABLE_DECOUPLE = False
-_CC.MODEL.RPN.BACKWARD_SCALE = 1.0
-
-# ------------- ROI -------------- #
-_CC.MODEL.ROI_HEADS.NAME = 'Res5ROIHeads'
-_CC.MODEL.ROI_HEADS.FREEZE_FEAT = False
-_CC.MODEL.ROI_HEADS.ENABLE_DECOUPLE = False
-_CC.MODEL.ROI_HEADS.BACKWARD_SCALE = 1.0
-_CC.MODEL.ROI_HEADS.OUTPUT_LAYER = 'FastRCNNOutputLayers'
-_CC.MODEL.ROI_HEADS.CLS_DROPOUT = False
-_CC.MODEL.ROI_HEADS.DROPOUT_RATIO = 0.8
-_CC.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 7  # for faster
-
-# ------------- TEST ------------- #
-_CC.TEST.PCB_ENABLE = False
-_CC.TEST.PCB_MODELTYPE = 'resnet'  # res-like
-_CC.TEST.PCB_MODELPATH = ''
-_CC.TEST.PCB_ALPHA = 0.50
-_CC.TEST.PCB_UPPER = 1.0
-_CC.TEST.PCB_LOWER = 0.05
-
-# ------------ Other ------------- #
-_CC.SOLVER.WEIGHT_DECAY = 5e-5
-_CC.MUTE_HEADER = True
diff --git a/modelscope/models/cv/image_defrcn_fewshot/models/fast_rcnn.py b/modelscope/models/cv/image_defrcn_fewshot/models/fast_rcnn.py
index 9415b5a6..88311774 100644
--- a/modelscope/models/cv/image_defrcn_fewshot/models/fast_rcnn.py
+++ b/modelscope/models/cv/image_defrcn_fewshot/models/fast_rcnn.py
@@ -4,7 +4,7 @@
 
 import numpy as np
 import torch
-from detectron2.layers import batched_nms, cat
+from detectron2.layers import cat
 from detectron2.modeling.roi_heads.fast_rcnn import \
     fast_rcnn_inference_single_image
 from detectron2.utils.events import get_event_storage
diff --git a/modelscope/models/cv/image_defrcn_fewshot/models/resnet.py b/modelscope/models/cv/image_defrcn_fewshot/models/resnet.py
new file mode 100644
index 00000000..0b0abcfe
--- /dev/null
+++ b/modelscope/models/cv/image_defrcn_fewshot/models/resnet.py
@@ -0,0 +1,66 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import torch
+from torchvision.models.resnet import BasicBlock, Bottleneck, ResNet
+
+__all__ = [
+    'ResNetFeatures', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
+    'resnet152'
+]
+
+
+class ResNetFeatures(ResNet):
+    """
+    Modified from torchvision.models.resnet, the only one difference is outputing layer4 feature in forward.
+    """
+
+    def __init__(self,
+                 block,
+                 layers,
+                 num_classes=1000,
+                 zero_init_residual=False,
+                 groups=1,
+                 width_per_group=64,
+                 replace_stride_with_dilation=None,
+                 norm_layer=None):
+        super(ResNetFeatures,
+              self).__init__(block, layers, num_classes, zero_init_residual,
+                             groups, width_per_group,
+                             replace_stride_with_dilation, norm_layer)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        feature = self.layer4(x)
+
+        x = self.avgpool(feature)
+        x = torch.flatten(x, 1)
+        x = self.fc(x)
+
+        return x, feature
+
+
+def resnet18(**kwargs):
+    return ResNetFeatures(BasicBlock, [2, 2, 2, 2], **kwargs)
+
+
+def resnet34(**kwargs):
+    return ResNetFeatures(BasicBlock, [3, 4, 6, 3], **kwargs)
+
+
+def resnet50(**kwargs):
+    return ResNetFeatures(Bottleneck, [3, 4, 6, 3], **kwargs)
+
+
+def resnet101(**kwargs):
+    return ResNetFeatures(Bottleneck, [3, 4, 23, 3], **kwargs)
+
+
+def resnet152(**kwargs):
+    return ResNetFeatures(Bottleneck, [3, 8, 36, 3], **kwargs)
diff --git a/modelscope/models/cv/image_defrcn_fewshot/models/roi_heads.py b/modelscope/models/cv/image_defrcn_fewshot/models/roi_heads.py
index 9ac78119..9de00ad5 100644
--- a/modelscope/models/cv/image_defrcn_fewshot/models/roi_heads.py
+++ b/modelscope/models/cv/image_defrcn_fewshot/models/roi_heads.py
@@ -2,207 +2,15 @@
 # made publicly available under the MIT License at
 # https://github.com/er-muyue/DeFRCN/blob/main/defrcn/modeling/roi_heads/roi_heads.py
 
-from typing import Dict
-
-import numpy as np
-import torch
-from detectron2.layers import ShapeSpec
 from detectron2.modeling.backbone.resnet import BottleneckBlock, make_stage
 from detectron2.modeling.box_regression import Box2BoxTransform
-from detectron2.modeling.matcher import Matcher
 from detectron2.modeling.poolers import ROIPooler
-from detectron2.modeling.proposal_generator.proposal_utils import \
-    add_ground_truth_to_proposals
-from detectron2.modeling.roi_heads import select_foreground_proposals
-from detectron2.modeling.sampling import subsample_labels
-from detectron2.structures import Boxes, Instances, pairwise_iou
-from detectron2.utils.events import get_event_storage
+from detectron2.modeling.roi_heads import ROIHeads
 from torch import nn
 
 from .fast_rcnn import FastRCNNOutputLayers, FastRCNNOutputs
 
 
-class ROIHeads(torch.nn.Module):
-    """
-    ROIHeads perform all per-region computation in an R-CNN.
-
-    It contains logic of cropping the regions, extract per-region features,
-    and make per-region predictions.
-
-    It can have many variants, implemented as subclasses of this class.
-    """
-
-    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
-        super(ROIHeads, self).__init__()
-
-        # fmt: off
-        self.batch_size_per_image = cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE
-        self.positive_sample_fraction = cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION
-        self.test_score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
-        self.test_nms_thresh = cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST
-        self.test_detections_per_img = cfg.TEST.DETECTIONS_PER_IMAGE
-        self.in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES
-        self.num_classes = cfg.MODEL.ROI_HEADS.NUM_CLASSES
-        self.proposal_append_gt = cfg.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT
-        self.feature_strides = {k: v.stride for k, v in input_shape.items()}
-        self.feature_channels = {k: v.channels for k, v in input_shape.items()}
-        self.cls_agnostic_bbox_reg = cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG
-        self.smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA
-        # fmt: on
-
-        # Matcher to assign box proposals to gt boxes
-        self.proposal_matcher = Matcher(
-            cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS,
-            cfg.MODEL.ROI_HEADS.IOU_LABELS,
-            allow_low_quality_matches=False,
-        )
-
-        # Box2BoxTransform for bounding box regression
-        self.box2box_transform = Box2BoxTransform(
-            weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
-
-    def _sample_proposals(self, matched_idxs, matched_labels, gt_classes):
-        """
-        Based on the matching between N proposals and M groundtruth,
-        sample the proposals and set their classification labels.
-
-        Args:
-            matched_idxs (Tensor): a vector of length N, each is the best-matched
-                gt index in [0, M) for each proposal.
-            matched_labels (Tensor): a vector of length N, the matcher's label
-                (one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal.
-            gt_classes (Tensor): a vector of length M.
-
-        Returns:
-            Tensor: a vector of indices of sampled proposals. Each is in [0, N).
-            Tensor: a vector of the same length, the classification label for
-                each sampled proposal. Each sample is labeled as either a category in
-                [0, num_classes) or the background (num_classes).
-        """
-        has_gt = gt_classes.numel() > 0
-        # Get the corresponding GT for each proposal
-        if has_gt:
-            gt_classes = gt_classes[matched_idxs]
-            # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
-            gt_classes[matched_labels == 0] = self.num_classes
-            # Label ignore proposals (-1 label)
-            gt_classes[matched_labels == -1] = -1
-        else:
-            gt_classes = torch.zeros_like(matched_idxs) + self.num_classes
-
-        sampled_fg_idxs, sampled_bg_idxs = subsample_labels(
-            gt_classes,
-            self.batch_size_per_image,
-            self.positive_sample_fraction,
-            self.num_classes,
-        )
-
-        sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0)
-        return sampled_idxs, gt_classes[sampled_idxs]
-
-    @torch.no_grad()
-    def label_and_sample_proposals(self, proposals, targets):
-        """
-        Prepare some proposals to be used to train the ROI heads.
-        It performs box matching between `proposals` and `targets`, and assigns
-        training labels to the proposals.
-        It returns `self.batch_size_per_image` random samples from proposals and groundtruth boxes,
-        with a fraction of positives that is no larger than `self.positive_sample_fraction.
-
-        Args:
-            See :meth:`ROIHeads.forward`
-
-        Returns:
-            list[Instances]:
-                length `N` list of `Instances`s containing the proposals
-                sampled for training. Each `Instances` has the following fields:
-                - proposal_boxes: the proposal boxes
-                - gt_boxes: the ground-truth box that the proposal is assigned to
-                  (this is only meaningful if the proposal has a label > 0; if label = 0
-                   then the ground-truth box is random)
-                Other fields such as "gt_classes" that's included in `targets`.
-        """
-        gt_boxes = [x.gt_boxes for x in targets]
-
-        if self.proposal_append_gt:
-            proposals = add_ground_truth_to_proposals(gt_boxes, proposals)
-
-        proposals_with_gt = []
-
-        num_fg_samples = []
-        num_bg_samples = []
-        for proposals_per_image, targets_per_image in zip(proposals, targets):
-            has_gt = len(targets_per_image) > 0
-            match_quality_matrix = pairwise_iou(
-                targets_per_image.gt_boxes, proposals_per_image.proposal_boxes)
-            matched_idxs, matched_labels = self.proposal_matcher(
-                match_quality_matrix)
-            sampled_idxs, gt_classes = self._sample_proposals(
-                matched_idxs, matched_labels, targets_per_image.gt_classes)
-
-            # Set target attributes of the sampled proposals:
-            proposals_per_image = proposals_per_image[sampled_idxs]
-            proposals_per_image.gt_classes = gt_classes
-
-            # We index all the attributes of targets that start with "gt_"
-            # and have not been added to proposals yet (="gt_classes").
-            if has_gt:
-                sampled_targets = matched_idxs[sampled_idxs]
-
-                for (
-                        trg_name,
-                        trg_value,
-                ) in targets_per_image.get_fields().items():
-                    if trg_name.startswith(
-                            'gt_') and not proposals_per_image.has(trg_name):
-                        proposals_per_image.set(trg_name,
-                                                trg_value[sampled_targets])
-            else:
-                gt_boxes = Boxes(
-                    targets_per_image.gt_boxes.tensor.new_zeros(
-                        (len(sampled_idxs), 4)))
-                proposals_per_image.gt_boxes = gt_boxes
-
-            num_bg_samples.append(
-                (gt_classes == self.num_classes).sum().item())
-            num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
-            proposals_with_gt.append(proposals_per_image)
-
-        # Log the number of fg/bg samples that are selected for training ROI heads
-        storage = get_event_storage()
-        storage.put_scalar('roi_head/num_fg_samples', np.mean(num_fg_samples))
-        storage.put_scalar('roi_head/num_bg_samples', np.mean(num_bg_samples))
-
-        return proposals_with_gt
-
-    def forward(self, images, features, proposals, targets=None):
-        """
-        Args:
-            images (ImageList):
-            features (dict[str: Tensor]): input data as a mapping from feature
-                map name to tensor. Axis 0 represents the number of images `N` in
-                the input data; axes 1-3 are channels, height, and width, which may
-                vary between feature maps (e.g., if a feature pyramid is used).
-            proposals (list[Instances]): length `N` list of `Instances`s. The i-th
-                `Instances` contains object proposals for the i-th input image,
-                with fields "proposal_boxes" and "objectness_logits".
-            targets (list[Instances], optional): length `N` list of `Instances`s. The i-th
-                `Instances` contains the ground-truth per-instance annotations
-                for the i-th input image.  Specify `targets` during training only.
-                It may have the following fields:
-                - gt_boxes: the bounding box of each instance.
-                - gt_classes: the label for each instance with a category ranging in [0, #class].
-
-        Returns:
-            results (list[Instances]): length `N` list of `Instances`s containing the
-                detected instances. Returned during inference only; may be []
-                during training.
-            losses (dict[str: Tensor]): mapping from a named loss to a tensor
-                storing the loss. Used during training only.
-        """
-        raise NotImplementedError()
-
-
 class Res5ROIHeads(ROIHeads):
     """
     The ROIHeads in a typical "C4" R-CNN model, where the heads share the
@@ -210,7 +18,23 @@ class Res5ROIHeads(ROIHeads):
     """
 
     def __init__(self, cfg, input_shape):
-        super().__init__(cfg, input_shape)
+
+        cfg_dict = ROIHeads.from_config(cfg)
+        super().__init__(**cfg_dict)
+
+        self.test_score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
+        self.test_nms_thresh = cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST
+        self.test_detections_per_img = cfg.TEST.DETECTIONS_PER_IMAGE
+        self.in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES
+
+        self.feature_strides = {k: v.stride for k, v in input_shape.items()}
+        self.feature_channels = {k: v.channels for k, v in input_shape.items()}
+        self.cls_agnostic_bbox_reg = cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG
+        self.smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA
+
+        # Box2BoxTransform for bounding box regression
+        self.box2box_transform = Box2BoxTransform(
+            weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
 
         assert len(self.in_features) == 1
 
diff --git a/modelscope/models/cv/image_defrcn_fewshot/utils/coco_register.py b/modelscope/models/cv/image_defrcn_fewshot/utils/coco_register.py
new file mode 100644
index 00000000..c71be0eb
--- /dev/null
+++ b/modelscope/models/cv/image_defrcn_fewshot/utils/coco_register.py
@@ -0,0 +1,1086 @@
+# The implementation is adopted from er-muyue/DeFRCN
+# made publicly available under the MIT License at
+# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/data/meta_coco.py
+# https://github.com/cocodataset/panopticapi/blob/master/panoptic_coco_categories.json
+
+import contextlib
+import io
+import os
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.structures import BoxMode
+from fvcore.common.file_io import PathManager
+from pycocotools.coco import COCO
+
+COCO_CATEGORIES = [
+    {
+        'color': [220, 20, 60],
+        'isthing': 1,
+        'id': 1,
+        'name': 'person'
+    },
+    {
+        'color': [119, 11, 32],
+        'isthing': 1,
+        'id': 2,
+        'name': 'bicycle'
+    },
+    {
+        'color': [0, 0, 142],
+        'isthing': 1,
+        'id': 3,
+        'name': 'car'
+    },
+    {
+        'color': [0, 0, 230],
+        'isthing': 1,
+        'id': 4,
+        'name': 'motorcycle'
+    },
+    {
+        'color': [106, 0, 228],
+        'isthing': 1,
+        'id': 5,
+        'name': 'airplane'
+    },
+    {
+        'color': [0, 60, 100],
+        'isthing': 1,
+        'id': 6,
+        'name': 'bus'
+    },
+    {
+        'color': [0, 80, 100],
+        'isthing': 1,
+        'id': 7,
+        'name': 'train'
+    },
+    {
+        'color': [0, 0, 70],
+        'isthing': 1,
+        'id': 8,
+        'name': 'truck'
+    },
+    {
+        'color': [0, 0, 192],
+        'isthing': 1,
+        'id': 9,
+        'name': 'boat'
+    },
+    {
+        'color': [250, 170, 30],
+        'isthing': 1,
+        'id': 10,
+        'name': 'traffic light'
+    },
+    {
+        'color': [100, 170, 30],
+        'isthing': 1,
+        'id': 11,
+        'name': 'fire hydrant'
+    },
+    {
+        'color': [220, 220, 0],
+        'isthing': 1,
+        'id': 13,
+        'name': 'stop sign'
+    },
+    {
+        'color': [175, 116, 175],
+        'isthing': 1,
+        'id': 14,
+        'name': 'parking meter',
+    },
+    {
+        'color': [250, 0, 30],
+        'isthing': 1,
+        'id': 15,
+        'name': 'bench'
+    },
+    {
+        'color': [165, 42, 42],
+        'isthing': 1,
+        'id': 16,
+        'name': 'bird'
+    },
+    {
+        'color': [255, 77, 255],
+        'isthing': 1,
+        'id': 17,
+        'name': 'cat'
+    },
+    {
+        'color': [0, 226, 252],
+        'isthing': 1,
+        'id': 18,
+        'name': 'dog'
+    },
+    {
+        'color': [182, 182, 255],
+        'isthing': 1,
+        'id': 19,
+        'name': 'horse'
+    },
+    {
+        'color': [0, 82, 0],
+        'isthing': 1,
+        'id': 20,
+        'name': 'sheep'
+    },
+    {
+        'color': [120, 166, 157],
+        'isthing': 1,
+        'id': 21,
+        'name': 'cow'
+    },
+    {
+        'color': [110, 76, 0],
+        'isthing': 1,
+        'id': 22,
+        'name': 'elephant'
+    },
+    {
+        'color': [174, 57, 255],
+        'isthing': 1,
+        'id': 23,
+        'name': 'bear'
+    },
+    {
+        'color': [199, 100, 0],
+        'isthing': 1,
+        'id': 24,
+        'name': 'zebra'
+    },
+    {
+        'color': [72, 0, 118],
+        'isthing': 1,
+        'id': 25,
+        'name': 'giraffe'
+    },
+    {
+        'color': [255, 179, 240],
+        'isthing': 1,
+        'id': 27,
+        'name': 'backpack'
+    },
+    {
+        'color': [0, 125, 92],
+        'isthing': 1,
+        'id': 28,
+        'name': 'umbrella'
+    },
+    {
+        'color': [209, 0, 151],
+        'isthing': 1,
+        'id': 31,
+        'name': 'handbag'
+    },
+    {
+        'color': [188, 208, 182],
+        'isthing': 1,
+        'id': 32,
+        'name': 'tie'
+    },
+    {
+        'color': [0, 220, 176],
+        'isthing': 1,
+        'id': 33,
+        'name': 'suitcase'
+    },
+    {
+        'color': [255, 99, 164],
+        'isthing': 1,
+        'id': 34,
+        'name': 'frisbee'
+    },
+    {
+        'color': [92, 0, 73],
+        'isthing': 1,
+        'id': 35,
+        'name': 'skis'
+    },
+    {
+        'color': [133, 129, 255],
+        'isthing': 1,
+        'id': 36,
+        'name': 'snowboard'
+    },
+    {
+        'color': [78, 180, 255],
+        'isthing': 1,
+        'id': 37,
+        'name': 'sports ball'
+    },
+    {
+        'color': [0, 228, 0],
+        'isthing': 1,
+        'id': 38,
+        'name': 'kite'
+    },
+    {
+        'color': [174, 255, 243],
+        'isthing': 1,
+        'id': 39,
+        'name': 'baseball bat'
+    },
+    {
+        'color': [45, 89, 255],
+        'isthing': 1,
+        'id': 40,
+        'name': 'baseball glove'
+    },
+    {
+        'color': [134, 134, 103],
+        'isthing': 1,
+        'id': 41,
+        'name': 'skateboard'
+    },
+    {
+        'color': [145, 148, 174],
+        'isthing': 1,
+        'id': 42,
+        'name': 'surfboard'
+    },
+    {
+        'color': [255, 208, 186],
+        'isthing': 1,
+        'id': 43,
+        'name': 'tennis racket',
+    },
+    {
+        'color': [197, 226, 255],
+        'isthing': 1,
+        'id': 44,
+        'name': 'bottle'
+    },
+    {
+        'color': [171, 134, 1],
+        'isthing': 1,
+        'id': 46,
+        'name': 'wine glass'
+    },
+    {
+        'color': [109, 63, 54],
+        'isthing': 1,
+        'id': 47,
+        'name': 'cup'
+    },
+    {
+        'color': [207, 138, 255],
+        'isthing': 1,
+        'id': 48,
+        'name': 'fork'
+    },
+    {
+        'color': [151, 0, 95],
+        'isthing': 1,
+        'id': 49,
+        'name': 'knife'
+    },
+    {
+        'color': [9, 80, 61],
+        'isthing': 1,
+        'id': 50,
+        'name': 'spoon'
+    },
+    {
+        'color': [84, 105, 51],
+        'isthing': 1,
+        'id': 51,
+        'name': 'bowl'
+    },
+    {
+        'color': [74, 65, 105],
+        'isthing': 1,
+        'id': 52,
+        'name': 'banana'
+    },
+    {
+        'color': [166, 196, 102],
+        'isthing': 1,
+        'id': 53,
+        'name': 'apple'
+    },
+    {
+        'color': [208, 195, 210],
+        'isthing': 1,
+        'id': 54,
+        'name': 'sandwich'
+    },
+    {
+        'color': [255, 109, 65],
+        'isthing': 1,
+        'id': 55,
+        'name': 'orange'
+    },
+    {
+        'color': [0, 143, 149],
+        'isthing': 1,
+        'id': 56,
+        'name': 'broccoli'
+    },
+    {
+        'color': [179, 0, 194],
+        'isthing': 1,
+        'id': 57,
+        'name': 'carrot'
+    },
+    {
+        'color': [209, 99, 106],
+        'isthing': 1,
+        'id': 58,
+        'name': 'hot dog'
+    },
+    {
+        'color': [5, 121, 0],
+        'isthing': 1,
+        'id': 59,
+        'name': 'pizza'
+    },
+    {
+        'color': [227, 255, 205],
+        'isthing': 1,
+        'id': 60,
+        'name': 'donut'
+    },
+    {
+        'color': [147, 186, 208],
+        'isthing': 1,
+        'id': 61,
+        'name': 'cake'
+    },
+    {
+        'color': [153, 69, 1],
+        'isthing': 1,
+        'id': 62,
+        'name': 'chair'
+    },
+    {
+        'color': [3, 95, 161],
+        'isthing': 1,
+        'id': 63,
+        'name': 'couch'
+    },
+    {
+        'color': [163, 255, 0],
+        'isthing': 1,
+        'id': 64,
+        'name': 'potted plant'
+    },
+    {
+        'color': [119, 0, 170],
+        'isthing': 1,
+        'id': 65,
+        'name': 'bed'
+    },
+    {
+        'color': [0, 182, 199],
+        'isthing': 1,
+        'id': 67,
+        'name': 'dining table'
+    },
+    {
+        'color': [0, 165, 120],
+        'isthing': 1,
+        'id': 70,
+        'name': 'toilet'
+    },
+    {
+        'color': [183, 130, 88],
+        'isthing': 1,
+        'id': 72,
+        'name': 'tv'
+    },
+    {
+        'color': [95, 32, 0],
+        'isthing': 1,
+        'id': 73,
+        'name': 'laptop'
+    },
+    {
+        'color': [130, 114, 135],
+        'isthing': 1,
+        'id': 74,
+        'name': 'mouse'
+    },
+    {
+        'color': [110, 129, 133],
+        'isthing': 1,
+        'id': 75,
+        'name': 'remote'
+    },
+    {
+        'color': [166, 74, 118],
+        'isthing': 1,
+        'id': 76,
+        'name': 'keyboard'
+    },
+    {
+        'color': [219, 142, 185],
+        'isthing': 1,
+        'id': 77,
+        'name': 'cell phone'
+    },
+    {
+        'color': [79, 210, 114],
+        'isthing': 1,
+        'id': 78,
+        'name': 'microwave'
+    },
+    {
+        'color': [178, 90, 62],
+        'isthing': 1,
+        'id': 79,
+        'name': 'oven'
+    },
+    {
+        'color': [65, 70, 15],
+        'isthing': 1,
+        'id': 80,
+        'name': 'toaster'
+    },
+    {
+        'color': [127, 167, 115],
+        'isthing': 1,
+        'id': 81,
+        'name': 'sink'
+    },
+    {
+        'color': [59, 105, 106],
+        'isthing': 1,
+        'id': 82,
+        'name': 'refrigerator'
+    },
+    {
+        'color': [142, 108, 45],
+        'isthing': 1,
+        'id': 84,
+        'name': 'book'
+    },
+    {
+        'color': [196, 172, 0],
+        'isthing': 1,
+        'id': 85,
+        'name': 'clock'
+    },
+    {
+        'color': [95, 54, 80],
+        'isthing': 1,
+        'id': 86,
+        'name': 'vase'
+    },
+    {
+        'color': [128, 76, 255],
+        'isthing': 1,
+        'id': 87,
+        'name': 'scissors'
+    },
+    {
+        'color': [201, 57, 1],
+        'isthing': 1,
+        'id': 88,
+        'name': 'teddy bear'
+    },
+    {
+        'color': [246, 0, 122],
+        'isthing': 1,
+        'id': 89,
+        'name': 'hair drier'
+    },
+    {
+        'color': [191, 162, 208],
+        'isthing': 1,
+        'id': 90,
+        'name': 'toothbrush'
+    },
+    {
+        'color': [255, 255, 128],
+        'isthing': 0,
+        'id': 92,
+        'name': 'banner'
+    },
+    {
+        'color': [147, 211, 203],
+        'isthing': 0,
+        'id': 93,
+        'name': 'blanket'
+    },
+    {
+        'color': [150, 100, 100],
+        'isthing': 0,
+        'id': 95,
+        'name': 'bridge'
+    },
+    {
+        'color': [168, 171, 172],
+        'isthing': 0,
+        'id': 100,
+        'name': 'cardboard'
+    },
+    {
+        'color': [146, 112, 198],
+        'isthing': 0,
+        'id': 107,
+        'name': 'counter'
+    },
+    {
+        'color': [210, 170, 100],
+        'isthing': 0,
+        'id': 109,
+        'name': 'curtain'
+    },
+    {
+        'color': [92, 136, 89],
+        'isthing': 0,
+        'id': 112,
+        'name': 'door-stuff'
+    },
+    {
+        'color': [218, 88, 184],
+        'isthing': 0,
+        'id': 118,
+        'name': 'floor-wood'
+    },
+    {
+        'color': [241, 129, 0],
+        'isthing': 0,
+        'id': 119,
+        'name': 'flower'
+    },
+    {
+        'color': [217, 17, 255],
+        'isthing': 0,
+        'id': 122,
+        'name': 'fruit'
+    },
+    {
+        'color': [124, 74, 181],
+        'isthing': 0,
+        'id': 125,
+        'name': 'gravel'
+    },
+    {
+        'color': [70, 70, 70],
+        'isthing': 0,
+        'id': 128,
+        'name': 'house'
+    },
+    {
+        'color': [255, 228, 255],
+        'isthing': 0,
+        'id': 130,
+        'name': 'light'
+    },
+    {
+        'color': [154, 208, 0],
+        'isthing': 0,
+        'id': 133,
+        'name': 'mirror-stuff'
+    },
+    {
+        'color': [193, 0, 92],
+        'isthing': 0,
+        'id': 138,
+        'name': 'net'
+    },
+    {
+        'color': [76, 91, 113],
+        'isthing': 0,
+        'id': 141,
+        'name': 'pillow'
+    },
+    {
+        'color': [255, 180, 195],
+        'isthing': 0,
+        'id': 144,
+        'name': 'platform'
+    },
+    {
+        'color': [106, 154, 176],
+        'isthing': 0,
+        'id': 145,
+        'name': 'playingfield'
+    },
+    {
+        'color': [230, 150, 140],
+        'isthing': 0,
+        'id': 147,
+        'name': 'railroad'
+    },
+    {
+        'color': [60, 143, 255],
+        'isthing': 0,
+        'id': 148,
+        'name': 'river'
+    },
+    {
+        'color': [128, 64, 128],
+        'isthing': 0,
+        'id': 149,
+        'name': 'road'
+    },
+    {
+        'color': [92, 82, 55],
+        'isthing': 0,
+        'id': 151,
+        'name': 'roof'
+    },
+    {
+        'color': [254, 212, 124],
+        'isthing': 0,
+        'id': 154,
+        'name': 'sand'
+    },
+    {
+        'color': [73, 77, 174],
+        'isthing': 0,
+        'id': 155,
+        'name': 'sea'
+    },
+    {
+        'color': [255, 160, 98],
+        'isthing': 0,
+        'id': 156,
+        'name': 'shelf'
+    },
+    {
+        'color': [255, 255, 255],
+        'isthing': 0,
+        'id': 159,
+        'name': 'snow'
+    },
+    {
+        'color': [104, 84, 109],
+        'isthing': 0,
+        'id': 161,
+        'name': 'stairs'
+    },
+    {
+        'color': [169, 164, 131],
+        'isthing': 0,
+        'id': 166,
+        'name': 'tent'
+    },
+    {
+        'color': [225, 199, 255],
+        'isthing': 0,
+        'id': 168,
+        'name': 'towel'
+    },
+    {
+        'color': [137, 54, 74],
+        'isthing': 0,
+        'id': 171,
+        'name': 'wall-brick'
+    },
+    {
+        'color': [135, 158, 223],
+        'isthing': 0,
+        'id': 175,
+        'name': 'wall-stone'
+    },
+    {
+        'color': [7, 246, 231],
+        'isthing': 0,
+        'id': 176,
+        'name': 'wall-tile'
+    },
+    {
+        'color': [107, 255, 200],
+        'isthing': 0,
+        'id': 177,
+        'name': 'wall-wood'
+    },
+    {
+        'color': [58, 41, 149],
+        'isthing': 0,
+        'id': 178,
+        'name': 'water-other'
+    },
+    {
+        'color': [183, 121, 142],
+        'isthing': 0,
+        'id': 180,
+        'name': 'window-blind'
+    },
+    {
+        'color': [255, 73, 97],
+        'isthing': 0,
+        'id': 181,
+        'name': 'window-other'
+    },
+    {
+        'color': [107, 142, 35],
+        'isthing': 0,
+        'id': 184,
+        'name': 'tree-merged'
+    },
+    {
+        'color': [190, 153, 153],
+        'isthing': 0,
+        'id': 185,
+        'name': 'fence-merged'
+    },
+    {
+        'color': [146, 139, 141],
+        'isthing': 0,
+        'id': 186,
+        'name': 'ceiling-merged'
+    },
+    {
+        'color': [70, 130, 180],
+        'isthing': 0,
+        'id': 187,
+        'name': 'sky-other-merged'
+    },
+    {
+        'color': [134, 199, 156],
+        'isthing': 0,
+        'id': 188,
+        'name': 'cabinet-merged'
+    },
+    {
+        'color': [209, 226, 140],
+        'isthing': 0,
+        'id': 189,
+        'name': 'table-merged'
+    },
+    {
+        'color': [96, 36, 108],
+        'isthing': 0,
+        'id': 190,
+        'name': 'floor-other-merged'
+    },
+    {
+        'color': [96, 96, 96],
+        'isthing': 0,
+        'id': 191,
+        'name': 'pavement-merged'
+    },
+    {
+        'color': [64, 170, 64],
+        'isthing': 0,
+        'id': 192,
+        'name': 'mountain-merged'
+    },
+    {
+        'color': [152, 251, 152],
+        'isthing': 0,
+        'id': 193,
+        'name': 'grass-merged'
+    },
+    {
+        'color': [208, 229, 228],
+        'isthing': 0,
+        'id': 194,
+        'name': 'dirt-merged'
+    },
+    {
+        'color': [206, 186, 171],
+        'isthing': 0,
+        'id': 195,
+        'name': 'paper-merged'
+    },
+    {
+        'color': [152, 161, 64],
+        'isthing': 0,
+        'id': 196,
+        'name': 'food-other-merged'
+    },
+    {
+        'color': [116, 112, 0],
+        'isthing': 0,
+        'id': 197,
+        'name': 'building-other-merged'
+    },
+    {
+        'color': [0, 114, 143],
+        'isthing': 0,
+        'id': 198,
+        'name': 'rock-merged'
+    },
+    {
+        'color': [102, 102, 156],
+        'isthing': 0,
+        'id': 199,
+        'name': 'wall-other-merged'
+    },
+    {
+        'color': [250, 141, 255],
+        'isthing': 0,
+        'id': 200,
+        'name': 'rug-merged'
+    },
+]
+
+# Novel COCO categories
+COCO_NOVEL_CATEGORIES = [
+    {
+        'color': [220, 20, 60],
+        'isthing': 1,
+        'id': 1,
+        'name': 'person'
+    },
+    {
+        'color': [119, 11, 32],
+        'isthing': 1,
+        'id': 2,
+        'name': 'bicycle'
+    },
+    {
+        'color': [0, 0, 142],
+        'isthing': 1,
+        'id': 3,
+        'name': 'car'
+    },
+    {
+        'color': [0, 0, 230],
+        'isthing': 1,
+        'id': 4,
+        'name': 'motorcycle'
+    },
+    {
+        'color': [106, 0, 228],
+        'isthing': 1,
+        'id': 5,
+        'name': 'airplane'
+    },
+    {
+        'color': [0, 60, 100],
+        'isthing': 1,
+        'id': 6,
+        'name': 'bus'
+    },
+    {
+        'color': [0, 80, 100],
+        'isthing': 1,
+        'id': 7,
+        'name': 'train'
+    },
+    {
+        'color': [0, 0, 192],
+        'isthing': 1,
+        'id': 9,
+        'name': 'boat'
+    },
+    {
+        'color': [165, 42, 42],
+        'isthing': 1,
+        'id': 16,
+        'name': 'bird'
+    },
+    {
+        'color': [255, 77, 255],
+        'isthing': 1,
+        'id': 17,
+        'name': 'cat'
+    },
+    {
+        'color': [0, 226, 252],
+        'isthing': 1,
+        'id': 18,
+        'name': 'dog'
+    },
+    {
+        'color': [182, 182, 255],
+        'isthing': 1,
+        'id': 19,
+        'name': 'horse'
+    },
+    {
+        'color': [0, 82, 0],
+        'isthing': 1,
+        'id': 20,
+        'name': 'sheep'
+    },
+    {
+        'color': [120, 166, 157],
+        'isthing': 1,
+        'id': 21,
+        'name': 'cow'
+    },
+    {
+        'color': [197, 226, 255],
+        'isthing': 1,
+        'id': 44,
+        'name': 'bottle'
+    },
+    {
+        'color': [153, 69, 1],
+        'isthing': 1,
+        'id': 62,
+        'name': 'chair'
+    },
+    {
+        'color': [3, 95, 161],
+        'isthing': 1,
+        'id': 63,
+        'name': 'couch'
+    },
+    {
+        'color': [163, 255, 0],
+        'isthing': 1,
+        'id': 64,
+        'name': 'potted plant'
+    },
+    {
+        'color': [0, 182, 199],
+        'isthing': 1,
+        'id': 67,
+        'name': 'dining table'
+    },
+    {
+        'color': [183, 130, 88],
+        'isthing': 1,
+        'id': 72,
+        'name': 'tv'
+    },
+]
+
+
+def _get_coco_fewshot_instances_meta():
+    thing_ids = [k['id'] for k in COCO_CATEGORIES if k['isthing'] == 1]
+    thing_colors = [k['color'] for k in COCO_CATEGORIES if k['isthing'] == 1]
+    assert len(thing_ids) == 80, len(thing_ids)
+    # Mapping from the incontiguous COCO category id to an id in [0, 79]
+    thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
+    thing_classes = [k['name'] for k in COCO_CATEGORIES if k['isthing'] == 1]
+    ret = {
+        'thing_dataset_id_to_contiguous_id': thing_dataset_id_to_contiguous_id,
+        'thing_classes': thing_classes,
+        'thing_colors': thing_colors,
+    }
+
+    novel_ids = [k['id'] for k in COCO_NOVEL_CATEGORIES if k['isthing'] == 1]
+    novel_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(novel_ids)}
+    novel_classes = [
+        k['name'] for k in COCO_NOVEL_CATEGORIES if k['isthing'] == 1
+    ]
+    base_categories = [
+        k for k in COCO_CATEGORIES
+        if k['isthing'] == 1 and k['name'] not in novel_classes
+    ]
+    base_ids = [k['id'] for k in base_categories]
+    base_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(base_ids)}
+    base_classes = [k['name'] for k in base_categories]
+    ret['novel_dataset_id_to_contiguous_id'] = novel_dataset_id_to_contiguous_id
+    ret['novel_classes'] = novel_classes
+    ret['base_dataset_id_to_contiguous_id'] = base_dataset_id_to_contiguous_id
+    ret['base_classes'] = base_classes
+    return ret
+
+
+def load_coco_json(root, json_file, image_root, metadata, dataset_name):
+    is_shots = 'shot' in dataset_name
+    if is_shots:
+        imgid2info = {}
+        shot = dataset_name.split('_')[-2].split('shot')[0]
+        seed = int(dataset_name.split('_seed')[-1])
+        split_dir = os.path.join(root, 'cocosplit', 'seed{}'.format(seed))
+        for idx, cls in enumerate(metadata['thing_classes']):
+            json_file = os.path.join(
+                split_dir,
+                'full_box_{}shot_{}_trainval.json'.format(shot, cls))
+            json_file = PathManager.get_local_path(json_file)
+            with contextlib.redirect_stdout(io.StringIO()):
+                coco_api = COCO(json_file)
+            img_ids = sorted(list(coco_api.imgs.keys()))
+            for img_id in img_ids:
+                if img_id not in imgid2info:
+                    imgid2info[img_id] = [
+                        coco_api.loadImgs([img_id])[0],
+                        coco_api.imgToAnns[img_id]
+                    ]
+                else:
+                    for item in coco_api.imgToAnns[img_id]:
+                        imgid2info[img_id][1].append(item)
+        imgs, anns = [], []
+        for img_id in imgid2info:
+            imgs.append(imgid2info[img_id][0])
+            anns.append(imgid2info[img_id][1])
+    else:
+        json_file = PathManager.get_local_path(json_file)
+        with contextlib.redirect_stdout(io.StringIO()):
+            coco_api = COCO(json_file)
+        # sort indices for reproducible results
+        img_ids = sorted(list(coco_api.imgs.keys()))
+        imgs = coco_api.loadImgs(img_ids)
+        anns = [coco_api.imgToAnns[img_id] for img_id in img_ids]
+
+    imgs_anns = list(zip(imgs, anns))
+    id_map = metadata['thing_dataset_id_to_contiguous_id']
+
+    dataset_dicts = []
+    ann_keys = ['iscrowd', 'bbox', 'category_id']
+
+    for (img_dict, anno_dict_list) in imgs_anns:
+        record = {}
+        record['file_name'] = os.path.join(image_root, img_dict['file_name'])
+        record['height'] = img_dict['height']
+        record['width'] = img_dict['width']
+        image_id = record['image_id'] = img_dict['id']
+
+        objs = []
+        for anno in anno_dict_list:
+            assert anno['image_id'] == image_id
+            assert anno.get('ignore', 0) == 0
+
+            obj = {key: anno[key] for key in ann_keys if key in anno}
+
+            obj['bbox_mode'] = BoxMode.XYWH_ABS
+            if obj['category_id'] in id_map:
+                obj['category_id'] = id_map[obj['category_id']]
+                objs.append(obj)
+        record['annotations'] = objs
+        dataset_dicts.append(record)
+
+    return dataset_dicts
+
+
+def register_meta_coco(name, root, metadata, imgdir, annofile):
+    DatasetCatalog.register(
+        name,
+        lambda: load_coco_json(root, annofile, imgdir, metadata, name),
+    )
+
+    if '_base' in name or '_novel' in name:
+        split = 'base' if '_base' in name else 'novel'
+        metadata['thing_dataset_id_to_contiguous_id'] = metadata[
+            '{}_dataset_id_to_contiguous_id'.format(split)]
+        metadata['thing_classes'] = metadata['{}_classes'.format(split)]
+
+    MetadataCatalog.get(name).set(
+        json_file=annofile,
+        image_root=imgdir,
+        evaluator_type='coco',
+        dirname='datasets/coco',  # os.path.join(root, 'coco')
+        **metadata,
+    )
+
+
+def register_all_coco(root='datasets'):
+
+    METASPLITS = [
+        ('coco14_trainval_all', 'coco/trainval2014',
+         'cocosplit/datasplit/trainvalno5k.json'),
+        ('coco14_trainval_base', 'coco/trainval2014',
+         'cocosplit/datasplit/trainvalno5k.json'),
+        ('coco14_test_all', 'coco/val2014', 'cocosplit/datasplit/5k.json'),
+        ('coco14_test_base', 'coco/val2014', 'cocosplit/datasplit/5k.json'),
+        ('coco14_test_novel', 'coco/val2014', 'cocosplit/datasplit/5k.json'),
+    ]
+    for prefix in ['all', 'novel']:
+        for shot in [1, 2, 3, 5, 10, 30]:
+            for seed in range(10):
+                name = 'coco14_trainval_{}_{}shot_seed{}'.format(
+                    prefix, shot, seed)
+                METASPLITS.append((name, 'coco/trainval2014', ''))
+
+    for name, imgdir, annofile in METASPLITS:
+        register_meta_coco(
+            name,
+            root,
+            _get_coco_fewshot_instances_meta(),
+            os.path.join(root, imgdir),
+            os.path.join(root, annofile),
+        )
diff --git a/modelscope/models/cv/image_defrcn_fewshot/utils/configuration_mapper.py b/modelscope/models/cv/image_defrcn_fewshot/utils/configuration_mapper.py
new file mode 100644
index 00000000..f1b1ce23
--- /dev/null
+++ b/modelscope/models/cv/image_defrcn_fewshot/utils/configuration_mapper.py
@@ -0,0 +1,113 @@
+# The implementation is adopted from er-muyue/DeFRCN
+# made publicly available under the MIT License at
+# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/config/defaults.py
+
+from detectron2.config.defaults import _C
+
+from modelscope.utils.config import Config
+
+
+def detectron2_default_cfg():
+
+    _CC = _C
+
+    # ----------- Backbone ----------- #
+    _CC.MODEL.BACKBONE.FREEZE = False
+    _CC.MODEL.BACKBONE.FREEZE_AT = 3
+
+    # ------------- RPN -------------- #
+    _CC.MODEL.RPN.FREEZE = False
+    _CC.MODEL.RPN.ENABLE_DECOUPLE = False
+    _CC.MODEL.RPN.BACKWARD_SCALE = 1.0
+
+    # ------------- ROI -------------- #
+    _CC.MODEL.ROI_HEADS.NAME = 'Res5ROIHeads'
+    _CC.MODEL.ROI_HEADS.FREEZE_FEAT = False
+    _CC.MODEL.ROI_HEADS.ENABLE_DECOUPLE = False
+    _CC.MODEL.ROI_HEADS.BACKWARD_SCALE = 1.0
+    _CC.MODEL.ROI_HEADS.OUTPUT_LAYER = 'FastRCNNOutputLayers'
+    _CC.MODEL.ROI_HEADS.CLS_DROPOUT = False
+    _CC.MODEL.ROI_HEADS.DROPOUT_RATIO = 0.8
+    _CC.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 7  # for faster
+
+    # ------------- TEST ------------- #
+    _CC.TEST.PCB_ENABLE = False
+    _CC.TEST.PCB_MODELTYPE = 'resnet'  # res-like
+    _CC.TEST.PCB_MODELPATH = ''
+    _CC.TEST.PCB_ALPHA = 0.50
+    _CC.TEST.PCB_UPPER = 1.0
+    _CC.TEST.PCB_LOWER = 0.05
+
+    # ------------ Other ------------- #
+    _CC.SOLVER.WEIGHT_DECAY = 5e-5
+    _CC.MUTE_HEADER = True
+
+    return _CC
+
+
+class CfgMapper():
+
+    def __init__(self, cfg: Config):
+
+        self.cfg = cfg
+        self.model_cfg = detectron2_default_cfg().clone()
+
+    def __call__(self, *args, **kwargs):
+        cfg_list = [
+            'MODEL.WEIGHTS',
+            self.cfg.safe_get('model.weights', ''), 'MODEL.MASK_ON',
+            self.cfg.safe_get('model.mask_on', False), 'MODEL.BACKBONE.FREEZE',
+            self.cfg.safe_get('model.backbone.freezed',
+                              False), 'MODEL.RESNETS.DEPTH',
+            self.cfg.safe_get('model.resnets.depth',
+                              101), 'MODEL.ROI_HEADS.ENABLE_DECOUPLE',
+            self.cfg.safe_get('model.roi_heads.enable_decouple',
+                              False), 'MODEL.ROI_HEADS.BACKWARD_SCALE',
+            self.cfg.safe_get('model.roi_heads.backward_scale',
+                              1.0), 'MODEL.ROI_HEADS.NUM_CLASSES',
+            self.cfg.safe_get('model.roi_heads.num_classes',
+                              80), 'MODEL.ROI_HEADS.FREEZE_FEAT',
+            self.cfg.safe_get('model.roi_heads.freeze_feat',
+                              False), 'MODEL.ROI_HEADS.CLS_DROPOUT',
+            self.cfg.safe_get('model.roi_heads.cls_dropout',
+                              False), 'MODEL.RPN.ENABLE_DECOUPLE',
+            self.cfg.safe_get('model.rpn.enable_decouple',
+                              False), 'MODEL.RPN.BACKWARD_SCALE',
+            self.cfg.safe_get('model.rpn.backward_scale',
+                              1.0), 'MODEL.RPN.FREEZE',
+            self.cfg.safe_get('model.rpn.freezed',
+                              False), 'MODEL.RPN.PRE_NMS_TOPK_TEST',
+            self.cfg.safe_get('model.rpn.pre_nms_topk_test',
+                              6000), 'MODEL.RPN.POST_NMS_TOPK_TEST',
+            self.cfg.safe_get('model.rpn.post_nms_topk_test',
+                              1000), 'DATASETS.TRAIN',
+            tuple(self.cfg.safe_get('datasets.train',
+                                    ('coco_2017_train', ))), 'DATASETS.TEST',
+            tuple(self.cfg.safe_get('datasets.test', ('coco_2017_val', ))),
+            'SOLVER.IMS_PER_BATCH',
+            self.cfg.safe_get('train.dataloader.ims_per_batch',
+                              16), 'SOLVER.BASE_LR',
+            self.cfg.safe_get('train.optimizer.lr', 0.02), 'SOLVER.STEPS',
+            tuple(
+                self.cfg.safe_get('train.lr_scheduler.steps',
+                                  (60000, 80000))), 'SOLVER.MAX_ITER',
+            self.cfg.safe_get('train.max_iter',
+                              90000), 'SOLVER.CHECKPOINT_PERIOD',
+            self.cfg.safe_get('train.checkpoint_period',
+                              5000), 'SOLVER.WARMUP_ITERS',
+            self.cfg.safe_get('train.lr_scheduler.warmup_iters',
+                              1000), 'OUTPUT_DIR',
+            self.cfg.safe_get('train.work_dir',
+                              './output/'), 'INPUT.MIN_SIZE_TRAIN',
+            tuple(
+                self.cfg.safe_get('input.min_size_train',
+                                  (640, 672, 704, 736, 768, 800))),
+            'INPUT.MIN_SIZE_TEST',
+            self.cfg.safe_get('input.min_size_test', 800), 'TEST.PCB_ENABLE',
+            self.cfg.safe_get('test.pcb_enable', False), 'TEST.PCB_MODELPATH',
+            self.cfg.safe_get('test.pcb_modelpath', '')
+        ]
+
+        self.model_cfg.merge_from_list(cfg_list)
+
+        return self.model_cfg
diff --git a/modelscope/models/cv/image_defrcn_fewshot/utils/model_surgery_op.py b/modelscope/models/cv/image_defrcn_fewshot/utils/model_surgery_op.py
new file mode 100644
index 00000000..b40cc077
--- /dev/null
+++ b/modelscope/models/cv/image_defrcn_fewshot/utils/model_surgery_op.py
@@ -0,0 +1,99 @@
+# The implementation is adopted from er-muyue/DeFRCN
+# made publicly available under the MIT License at
+# https://github.com/er-muyue/DeFRCN/blob/main/tools/model_surgery.py
+
+import argparse
+import os
+
+import torch
+
+COCO_NOVEL_CLASSES = [
+    1, 2, 3, 4, 5, 6, 7, 9, 16, 17, 18, 19, 20, 21, 44, 62, 63, 64, 67, 72
+]
+COCO_BASE_CLASSES = [
+    8, 10, 11, 13, 14, 15, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37,
+    38, 39, 40, 41, 42, 43, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+    59, 60, 61, 65, 70, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87,
+    88, 89, 90
+]
+COCO_ALL_CLASSES = sorted(COCO_BASE_CLASSES + COCO_NOVEL_CLASSES)
+COCO_IDMAP = {v: i for i, v in enumerate(COCO_ALL_CLASSES)}
+
+
+def surgery(data_type, param_name, is_weight, tar_size, ckpt):
+    weight_name = param_name + ('.weight' if is_weight else '.bias')
+    pretrained_weight = ckpt['model'][weight_name]
+    prev_cls = pretrained_weight.size(0)
+    if 'cls_score' in param_name:
+        prev_cls -= 1
+    if is_weight:
+        feat_size = pretrained_weight.size(1)
+        new_weight = torch.rand((tar_size, feat_size))
+        torch.nn.init.normal_(new_weight, 0, 0.01)
+    else:
+        new_weight = torch.zeros(tar_size)
+    if data_type == 'coco':
+        for idx, c in enumerate(COCO_BASE_CLASSES):
+            if 'cls_score' in param_name:
+                new_weight[COCO_IDMAP[c]] = pretrained_weight[idx]
+            else:
+                new_weight[COCO_IDMAP[c] * 4:(COCO_IDMAP[c] + 1) * 4] = \
+                    pretrained_weight[idx * 4:(idx + 1) * 4]
+    else:
+        new_weight[:prev_cls] = pretrained_weight[:prev_cls]
+    if 'cls_score' in param_name:
+        new_weight[-1] = pretrained_weight[-1]  # bg class
+    ckpt['model'][weight_name] = new_weight
+
+
+def model_surgery(src_path,
+                  save_dir,
+                  data_type='pascal_voc',
+                  method='remove',
+                  params_name=[
+                      'model.roi_heads.box_predictor.cls_score',
+                      'model.roi_heads.box_predictor.bbox_pred'
+                  ]):
+    """
+    Either remove the final layer weights for fine-tuning on novel dataset or
+    append randomly initialized weights for the novel classes.
+    """
+
+    assert method in ['remove',
+                      'randinit'], '{} not implemented'.format(method)
+
+    if data_type == 'coco':
+        TAR_SIZE = 80
+    elif data_type == 'pascal_voc':
+        TAR_SIZE = 20
+    else:
+        NotImplementedError('{} dataset does not supported'.format(data_type))
+
+    save_name = 'model_reset_' + ('remove' if method == 'remove' else
+                                  'surgery') + '.pth'
+    save_path = os.path.join(save_dir, save_name)
+    os.makedirs(save_dir, exist_ok=True)
+
+    ckpt = torch.load(src_path)
+    if 'scheduler' in ckpt:
+        del ckpt['scheduler']
+    if 'optimizer' in ckpt:
+        del ckpt['optimizer']
+    if 'iteration' in ckpt:
+        ckpt['iteration'] = 0
+
+    if method == 'remove':
+        for param_name in params_name:
+            del ckpt['model'][param_name + '.weight']
+            if param_name + '.bias' in ckpt['model']:
+                del ckpt['model'][param_name + '.bias']
+    elif method == 'randinit':
+        tar_sizes = [TAR_SIZE + 1, TAR_SIZE * 4]
+        for idx, (param_name,
+                  tar_size) in enumerate(zip(params_name, tar_sizes)):
+            surgery(data_type, param_name, True, tar_size, ckpt)
+            surgery(data_type, param_name, False, tar_size, ckpt)
+    else:
+        raise NotImplementedError
+
+    torch.save(ckpt, save_path)
diff --git a/modelscope/models/cv/image_defrcn_fewshot/utils/register_data.py b/modelscope/models/cv/image_defrcn_fewshot/utils/register_data.py
new file mode 100644
index 00000000..ad392c5c
--- /dev/null
+++ b/modelscope/models/cv/image_defrcn_fewshot/utils/register_data.py
@@ -0,0 +1,21 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from .coco_register import register_all_coco
+from .voc_register import register_all_voc
+
+
+def register_data(data_type='pascal_voc', data_dir=None):
+
+    if data_type == 'pascal_voc':
+        if data_dir:
+            register_all_voc(data_dir)
+        else:
+            register_all_voc()
+    elif data_type == 'coco':
+        if data_dir:
+            register_all_coco(data_dir)
+        else:
+            register_all_coco()
+    else:
+        raise NotImplementedError(
+            'no {} dataset was registered'.format(data_type))
diff --git a/modelscope/models/cv/image_depth_estimation_bts/__init__.py b/modelscope/models/cv/image_depth_estimation_bts/__init__.py
new file mode 100644
index 00000000..29b18261
--- /dev/null
+++ b/modelscope/models/cv/image_depth_estimation_bts/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .depth_estimation_bts_model import DepthEstimationBtsModel
+
+else:
+    _import_structure = {
+        'depth_estimation_bts_model': ['DepthEstimationBtsModel']
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/image_depth_estimation_bts/depth_estimation_bts_model.py b/modelscope/models/cv/image_depth_estimation_bts/depth_estimation_bts_model.py
new file mode 100644
index 00000000..be2eae81
--- /dev/null
+++ b/modelscope/models/cv/image_depth_estimation_bts/depth_estimation_bts_model.py
@@ -0,0 +1,73 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+
+import torch
+from torchvision import transforms
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .networks.bts_model import BtsModel
+
+logger = get_logger()
+__all__ = ['DepthEstimationBtsModel']
+
+
+@MODELS.register_module(
+    Tasks.image_depth_estimation, module_name=Models.bts_depth_estimation)
+class DepthEstimationBtsModel(TorchModel):
+    """ Depth estimation model bts, implemented from paper https://arxiv.org/pdf/1907.10326.pdf.
+        The network utilizes novel local planar guidance layers located at multiple stage in the decoding phase.
+        The bts model is composed with encoder and decoder, an encoder for dense feature extraction and a decoder
+        for predicting the desired depth.
+    """
+
+    def __init__(self, model_dir: str, **kwargs):
+        """initialize the bts model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+            focal: focal length, pictures that do not work are input according to
+                the camera setting value at the time of shooting
+            dataset: used to set focal value according dataset type, only support 'nyu' and 'kitti'
+        """
+        super().__init__(model_dir, **kwargs)
+        self.normalize = transforms.Normalize(
+            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        self.focal = 518.8579  # focal length, different dataset has different value
+        if 'focal' in kwargs:
+            self.focal = kwargs['focal']
+        elif 'dataset' in kwargs:
+            if kwargs['dataset'] == 'nyu':
+                self.focal = 518.8579
+            elif kwargs['dataset'] == 'kitti':
+                self.focal = 721.5377
+
+        self.model = BtsModel(focal=self.focal)
+
+        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+        checkpoint = torch.load(model_path)
+
+        state_dict = {}
+        for k in checkpoint['model'].keys():
+            if k.startswith('module.'):
+                state_dict[k[7:]] = checkpoint['model'][k]
+            else:
+                state_dict[k] = checkpoint['model'][k]
+        self.model.load_state_dict(state_dict)
+        self.model.eval()
+
+    def forward(self, inputs):
+        imgs = self.normalize(inputs['imgs'])
+        return self.model(imgs)
+
+    def postprocess(self, inputs):
+        results = {OutputKeys.DEPTHS: inputs}
+        return results
+
+    def inference(self, data):
+        results = self.forward(data)
+        return results
diff --git a/modelscope/models/cv/image_depth_estimation_bts/networks/__init__.py b/modelscope/models/cv/image_depth_estimation_bts/networks/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/image_depth_estimation_bts/networks/bts_model.py b/modelscope/models/cv/image_depth_estimation_bts/networks/bts_model.py
new file mode 100644
index 00000000..dc5e6051
--- /dev/null
+++ b/modelscope/models/cv/image_depth_estimation_bts/networks/bts_model.py
@@ -0,0 +1,41 @@
+# The implementation is modified from cleinc / bts
+# made publicly available under the GPL-3.0-or-later
+# https://github.com/cleinc/bts/blob/master/pytorch/bts.py
+import torch.nn as nn
+
+from .decoder import Decoder
+from .encoder import Encoder
+
+
+class BtsModel(nn.Module):
+
+    def __init__(self, focal=518.8579):
+        """
+        initial bts model
+        Parameters
+        ----------
+        focal: focal length, pictures that do not work are input according to
+                the camera setting value at the time of shooting
+        """
+        super(BtsModel, self).__init__()
+        self.focal = focal
+        self.encoder = Encoder(encoder='densenet161_bts')
+        self.decoder = Decoder(
+            feat_out_channels=self.encoder.feat_out_channels)
+
+    def forward(self, x, focal=None):
+        """
+        model forward
+        Parameters
+        ----------
+        x: input image data
+        focal: The focal length when the picture is taken. By default, the focal length
+                of the data set when the model is created is used
+
+        Returns： Depth estimation image
+        -------
+
+        """
+        focal_run = focal if focal else self.focal
+        skip_feat = self.encoder(x)
+        return self.decoder(skip_feat, focal_run)
diff --git a/modelscope/models/cv/image_depth_estimation_bts/networks/decoder.py b/modelscope/models/cv/image_depth_estimation_bts/networks/decoder.py
new file mode 100644
index 00000000..b6d992fd
--- /dev/null
+++ b/modelscope/models/cv/image_depth_estimation_bts/networks/decoder.py
@@ -0,0 +1,204 @@
+# The implementation is modified from cleinc / bts
+# made publicly available under the GPL-3.0-or-later
+# https://github.com/cleinc/bts/blob/master/pytorch/bts.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as torch_nn_func
+
+from .tools import AtrousConv, LocalPlanarGuidance, Reduction1x1, UpConv
+
+
+class Decoder(nn.Module):
+
+    def __init__(self,
+                 feat_out_channels,
+                 max_depth=10,
+                 dataset='nyu',
+                 num_features=512):
+        super(Decoder, self).__init__()
+        self.max_depth = max_depth
+        self.dataset = dataset
+
+        self.upconv5 = UpConv(feat_out_channels[4], num_features)
+        self.bn5 = nn.BatchNorm2d(
+            num_features, momentum=0.01, affine=True, eps=1.1e-5)
+
+        self.conv5 = torch.nn.Sequential(
+            nn.Conv2d(
+                num_features + feat_out_channels[3],
+                num_features,
+                3,
+                1,
+                1,
+                bias=False), nn.ELU())
+        self.upconv4 = UpConv(num_features, num_features // 2)
+        self.bn4 = nn.BatchNorm2d(
+            num_features // 2, momentum=0.01, affine=True, eps=1.1e-5)
+        self.conv4 = torch.nn.Sequential(
+            nn.Conv2d(
+                num_features // 2 + feat_out_channels[2],
+                num_features // 2,
+                3,
+                1,
+                1,
+                bias=False), nn.ELU())
+        self.bn4_2 = nn.BatchNorm2d(
+            num_features // 2, momentum=0.01, affine=True, eps=1.1e-5)
+
+        self.daspp_3 = AtrousConv(
+            num_features // 2, num_features // 4, 3, apply_bn_first=False)
+        self.daspp_6 = AtrousConv(
+            num_features // 2 + num_features // 4 + feat_out_channels[2],
+            num_features // 4, 6)
+        self.daspp_12 = AtrousConv(num_features + feat_out_channels[2],
+                                   num_features // 4, 12)
+        self.daspp_18 = AtrousConv(
+            num_features + num_features // 4 + feat_out_channels[2],
+            num_features // 4, 18)
+        self.daspp_24 = AtrousConv(
+            num_features + num_features // 2 + feat_out_channels[2],
+            num_features // 4, 24)
+        self.daspp_conv = torch.nn.Sequential(
+            nn.Conv2d(
+                num_features + num_features // 2 + num_features // 4,
+                num_features // 4,
+                3,
+                1,
+                1,
+                bias=False), nn.ELU())
+        self.reduc8x8 = Reduction1x1(num_features // 4, num_features // 4,
+                                     self.max_depth)
+        self.lpg8x8 = LocalPlanarGuidance(8)
+
+        self.upconv3 = UpConv(num_features // 4, num_features // 4)
+        self.bn3 = nn.BatchNorm2d(
+            num_features // 4, momentum=0.01, affine=True, eps=1.1e-5)
+        self.conv3 = torch.nn.Sequential(
+            nn.Conv2d(
+                num_features // 4 + feat_out_channels[1] + 1,
+                num_features // 4,
+                3,
+                1,
+                1,
+                bias=False), nn.ELU())
+        self.reduc4x4 = Reduction1x1(num_features // 4, num_features // 8,
+                                     self.max_depth)
+        self.lpg4x4 = LocalPlanarGuidance(4)
+
+        self.upconv2 = UpConv(num_features // 4, num_features // 8)
+        self.bn2 = nn.BatchNorm2d(
+            num_features // 8, momentum=0.01, affine=True, eps=1.1e-5)
+        self.conv2 = torch.nn.Sequential(
+            nn.Conv2d(
+                num_features // 8 + feat_out_channels[0] + 1,
+                num_features // 8,
+                3,
+                1,
+                1,
+                bias=False), nn.ELU())
+
+        self.reduc2x2 = Reduction1x1(num_features // 8, num_features // 16,
+                                     self.max_depth)
+        self.lpg2x2 = LocalPlanarGuidance(2)
+
+        self.upconv1 = UpConv(num_features // 8, num_features // 16)
+        self.reduc1x1 = Reduction1x1(
+            num_features // 16,
+            num_features // 32,
+            self.max_depth,
+            is_final=True)
+        self.conv1 = torch.nn.Sequential(
+            nn.Conv2d(
+                num_features // 16 + 4,
+                num_features // 16,
+                3,
+                1,
+                1,
+                bias=False), nn.ELU())
+        self.get_depth = torch.nn.Sequential(
+            nn.Conv2d(num_features // 16, 1, 3, 1, 1, bias=False),
+            nn.Sigmoid())
+
+    def forward(self, features, focal):
+        skip0, skip1, skip2, skip3 = features[1], features[2], features[
+            3], features[4]
+        dense_features = torch.nn.ReLU()(features[5])
+        upconv5 = self.upconv5(dense_features)  # H/16
+        upconv5 = self.bn5(upconv5)
+        concat5 = torch.cat([upconv5, skip3], dim=1)
+        iconv5 = self.conv5(concat5)
+
+        upconv4 = self.upconv4(iconv5)  # H/8
+        upconv4 = self.bn4(upconv4)
+        concat4 = torch.cat([upconv4, skip2], dim=1)
+        iconv4 = self.conv4(concat4)
+        iconv4 = self.bn4_2(iconv4)
+
+        daspp_3 = self.daspp_3(iconv4)
+        concat4_2 = torch.cat([concat4, daspp_3], dim=1)
+        daspp_6 = self.daspp_6(concat4_2)
+        concat4_3 = torch.cat([concat4_2, daspp_6], dim=1)
+        daspp_12 = self.daspp_12(concat4_3)
+        concat4_4 = torch.cat([concat4_3, daspp_12], dim=1)
+        daspp_18 = self.daspp_18(concat4_4)
+        concat4_5 = torch.cat([concat4_4, daspp_18], dim=1)
+        daspp_24 = self.daspp_24(concat4_5)
+        concat4_daspp = torch.cat(
+            [iconv4, daspp_3, daspp_6, daspp_12, daspp_18, daspp_24], dim=1)
+        daspp_feat = self.daspp_conv(concat4_daspp)
+
+        reduc8x8 = self.reduc8x8(daspp_feat)
+        plane_normal_8x8 = reduc8x8[:, :3, :, :]
+        plane_normal_8x8 = torch_nn_func.normalize(plane_normal_8x8, 2, 1)
+        plane_dist_8x8 = reduc8x8[:, 3, :, :]
+        plane_eq_8x8 = torch.cat(
+            [plane_normal_8x8, plane_dist_8x8.unsqueeze(1)], 1)
+        depth_8x8 = self.lpg8x8(plane_eq_8x8, focal)
+        depth_8x8_scaled = depth_8x8.unsqueeze(1) / self.max_depth
+        depth_8x8_scaled_ds = torch_nn_func.interpolate(
+            depth_8x8_scaled, scale_factor=0.25, mode='nearest')
+
+        upconv3 = self.upconv3(daspp_feat)  # H/4
+        upconv3 = self.bn3(upconv3)
+        concat3 = torch.cat([upconv3, skip1, depth_8x8_scaled_ds], dim=1)
+        iconv3 = self.conv3(concat3)
+
+        reduc4x4 = self.reduc4x4(iconv3)
+        plane_normal_4x4 = reduc4x4[:, :3, :, :]
+        plane_normal_4x4 = torch_nn_func.normalize(plane_normal_4x4, 2, 1)
+        plane_dist_4x4 = reduc4x4[:, 3, :, :]
+        plane_eq_4x4 = torch.cat(
+            [plane_normal_4x4, plane_dist_4x4.unsqueeze(1)], 1)
+        depth_4x4 = self.lpg4x4(plane_eq_4x4, focal)
+        depth_4x4_scaled = depth_4x4.unsqueeze(1) / self.max_depth
+        depth_4x4_scaled_ds = torch_nn_func.interpolate(
+            depth_4x4_scaled, scale_factor=0.5, mode='nearest')
+
+        upconv2 = self.upconv2(iconv3)  # H/2
+        upconv2 = self.bn2(upconv2)
+        concat2 = torch.cat([upconv2, skip0, depth_4x4_scaled_ds], dim=1)
+        iconv2 = self.conv2(concat2)
+
+        reduc2x2 = self.reduc2x2(iconv2)
+        plane_normal_2x2 = reduc2x2[:, :3, :, :]
+        plane_normal_2x2 = torch_nn_func.normalize(plane_normal_2x2, 2, 1)
+        plane_dist_2x2 = reduc2x2[:, 3, :, :]
+        plane_eq_2x2 = torch.cat(
+            [plane_normal_2x2, plane_dist_2x2.unsqueeze(1)], 1)
+        depth_2x2 = self.lpg2x2(plane_eq_2x2, focal)
+        depth_2x2_scaled = depth_2x2.unsqueeze(1) / self.max_depth
+
+        upconv1 = self.upconv1(iconv2)
+        reduc1x1 = self.reduc1x1(upconv1)
+        concat1_list = [
+            upconv1, reduc1x1, depth_2x2_scaled, depth_4x4_scaled,
+            depth_8x8_scaled
+        ]
+        concat1 = torch.cat(concat1_list, dim=1)
+        iconv1 = self.conv1(concat1)
+        final_depth = self.max_depth * self.get_depth(iconv1)
+        if self.dataset == 'kitti':
+            final_depth = final_depth * focal.view(-1, 1, 1,
+                                                   1).float() / 715.0873
+
+        return final_depth
diff --git a/modelscope/models/cv/image_depth_estimation_bts/networks/encoder.py b/modelscope/models/cv/image_depth_estimation_bts/networks/encoder.py
new file mode 100644
index 00000000..6782ffc8
--- /dev/null
+++ b/modelscope/models/cv/image_depth_estimation_bts/networks/encoder.py
@@ -0,0 +1,51 @@
+# The implementation is modified from cleinc / bts
+# made publicly available under the GPL-3.0-or-later
+# https://github.com/cleinc/bts/blob/master/pytorch/bts.py
+
+import torch.nn as nn
+import torchvision.models as models
+
+
+class Encoder(nn.Module):
+
+    def __init__(self, encoder='densenet161_bts', pretrained=False):
+        super(Encoder, self).__init__()
+        self.encoder = encoder
+
+        if encoder == 'densenet121_bts':
+            self.base_model = models.densenet121(
+                pretrained=pretrained).features
+            self.feat_names = [
+                'relu0', 'pool0', 'transition1', 'transition2', 'norm5'
+            ]
+            self.feat_out_channels = [64, 64, 128, 256, 1024]
+        elif encoder == 'densenet161_bts':
+            self.base_model = models.densenet161(
+                pretrained=pretrained).features
+            self.feat_names = [
+                'relu0', 'pool0', 'transition1', 'transition2', 'norm5'
+            ]
+            self.feat_out_channels = [96, 96, 192, 384, 2208]
+        elif encoder == 'resnet50_bts':
+            self.base_model = models.resnet50(pretrained=pretrained)
+            self.feat_names = ['relu', 'layer1', 'layer2', 'layer3', 'layer4']
+            self.feat_out_channels = [64, 256, 512, 1024, 2048]
+        elif encoder == 'resnet101_bts':
+            self.base_model = models.resnet101(pretrained=pretrained)
+            self.feat_names = ['relu', 'layer1', 'layer2', 'layer3', 'layer4']
+            self.feat_out_channels = [64, 256, 512, 1024, 2048]
+        else:
+            raise NotImplementedError
+
+    def forward(self, x):
+        features = [x]
+        skip_feat = [x]
+        for k, v in self.base_model._modules.items():
+            if 'resnet' in self.encoder and ('fc' in k or 'avgpool' in k):
+                continue
+            feature = v(features[-1])
+            features.append(feature)
+            if any(x in k for x in self.feat_names):
+                skip_feat.append(feature)
+
+        return skip_feat
diff --git a/modelscope/models/cv/image_depth_estimation_bts/networks/tools.py b/modelscope/models/cv/image_depth_estimation_bts/networks/tools.py
new file mode 100644
index 00000000..d317d443
--- /dev/null
+++ b/modelscope/models/cv/image_depth_estimation_bts/networks/tools.py
@@ -0,0 +1,182 @@
+# The implementation is modified from cleinc / bts
+# made publicly available under the GPL-3.0-or-later
+# https://github.com/cleinc/bts/blob/master/pytorch/bts.py
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as torch_nn_func
+
+
+class AtrousConv(nn.Sequential):
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 dilation,
+                 apply_bn_first=True):
+        super(AtrousConv, self).__init__()
+        self.atrous_conv = torch.nn.Sequential()
+        if apply_bn_first:
+            self.atrous_conv.add_module(
+                'first_bn',
+                nn.BatchNorm2d(
+                    in_channels,
+                    momentum=0.01,
+                    affine=True,
+                    track_running_stats=True,
+                    eps=1.1e-5))
+
+        self.atrous_conv.add_module(
+            'aconv_sequence',
+            nn.Sequential(
+                nn.ReLU(),
+                nn.Conv2d(
+                    in_channels=in_channels,
+                    out_channels=out_channels * 2,
+                    bias=False,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0),
+                nn.BatchNorm2d(
+                    out_channels * 2,
+                    momentum=0.01,
+                    affine=True,
+                    track_running_stats=True), nn.ReLU(),
+                nn.Conv2d(
+                    in_channels=out_channels * 2,
+                    out_channels=out_channels,
+                    bias=False,
+                    kernel_size=3,
+                    stride=1,
+                    padding=(dilation, dilation),
+                    dilation=dilation)))
+
+    def forward(self, x):
+        return self.atrous_conv.forward(x)
+
+
+class UpConv(nn.Module):
+
+    def __init__(self, in_channels, out_channels, ratio=2):
+        super(UpConv, self).__init__()
+        self.elu = nn.ELU()
+        self.conv = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            bias=False,
+            kernel_size=3,
+            stride=1,
+            padding=1)
+        self.ratio = ratio
+
+    def forward(self, x):
+        up_x = torch_nn_func.interpolate(
+            x, scale_factor=self.ratio, mode='nearest')
+        out = self.conv(up_x)
+        out = self.elu(out)
+        return out
+
+
+class Reduction1x1(nn.Sequential):
+
+    def __init__(self,
+                 num_in_filters,
+                 num_out_filters,
+                 max_depth,
+                 is_final=False):
+        super(Reduction1x1, self).__init__()
+        self.max_depth = max_depth
+        self.is_final = is_final
+        self.sigmoid = nn.Sigmoid()
+        self.reduc = torch.nn.Sequential()
+
+        while num_out_filters >= 4:
+            if num_out_filters < 8:
+                if self.is_final:
+                    self.reduc.add_module(
+                        'final',
+                        torch.nn.Sequential(
+                            nn.Conv2d(
+                                num_in_filters,
+                                out_channels=1,
+                                bias=False,
+                                kernel_size=1,
+                                stride=1,
+                                padding=0), nn.Sigmoid()))
+                else:
+                    self.reduc.add_module(
+                        'plane_params',
+                        torch.nn.Conv2d(
+                            num_in_filters,
+                            out_channels=3,
+                            bias=False,
+                            kernel_size=1,
+                            stride=1,
+                            padding=0))
+                break
+            else:
+                self.reduc.add_module(
+                    'inter_{}_{}'.format(num_in_filters, num_out_filters),
+                    torch.nn.Sequential(
+                        nn.Conv2d(
+                            in_channels=num_in_filters,
+                            out_channels=num_out_filters,
+                            bias=False,
+                            kernel_size=1,
+                            stride=1,
+                            padding=0), nn.ELU()))
+
+            num_in_filters = num_out_filters
+            num_out_filters = num_out_filters // 2
+
+    def forward(self, net):
+        net = self.reduc.forward(net)
+        if not self.is_final:
+            theta = self.sigmoid(net[:, 0, :, :]) * math.pi / 3
+            phi = self.sigmoid(net[:, 1, :, :]) * math.pi * 2
+            dist = self.sigmoid(net[:, 2, :, :]) * self.max_depth
+            n1 = torch.mul(torch.sin(theta), torch.cos(phi)).unsqueeze(1)
+            n2 = torch.mul(torch.sin(theta), torch.sin(phi)).unsqueeze(1)
+            n3 = torch.cos(theta).unsqueeze(1)
+            n4 = dist.unsqueeze(1)
+            net = torch.cat([n1, n2, n3, n4], dim=1)
+
+        return net
+
+
+class LocalPlanarGuidance(nn.Module):
+
+    def __init__(self, upratio):
+        super(LocalPlanarGuidance, self).__init__()
+        self.upratio = upratio
+        self.u = torch.arange(self.upratio).reshape([1, 1,
+                                                     self.upratio]).float()
+        self.v = torch.arange(int(self.upratio)).reshape([1, self.upratio,
+                                                          1]).float()
+        self.upratio = float(upratio)
+
+    def forward(self, plane_eq, focal):
+        plane_eq_expanded = torch.repeat_interleave(plane_eq,
+                                                    int(self.upratio), 2)
+        plane_eq_expanded = torch.repeat_interleave(plane_eq_expanded,
+                                                    int(self.upratio), 3)
+        n1 = plane_eq_expanded[:, 0, :, :]
+        n2 = plane_eq_expanded[:, 1, :, :]
+        n3 = plane_eq_expanded[:, 2, :, :]
+        n4 = plane_eq_expanded[:, 3, :, :]
+
+        u = self.u.repeat(
+            plane_eq.size(0),
+            plane_eq.size(2) * int(self.upratio), plane_eq.size(3)).cuda()
+        u = (u - (self.upratio - 1) * 0.5) / self.upratio
+
+        v = self.v.repeat(
+            plane_eq.size(0), plane_eq.size(2),
+            plane_eq.size(3) * int(self.upratio)).cuda()
+        v = (v - (self.upratio - 1) * 0.5) / self.upratio
+
+        d = n4 / (n1 * u + n2 * v + n3)
+
+        return d
diff --git a/modelscope/models/cv/image_driving_perception/__init__.py b/modelscope/models/cv/image_driving_perception/__init__.py
new file mode 100644
index 00000000..f7aef488
--- /dev/null
+++ b/modelscope/models/cv/image_driving_perception/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .image_driving_percetion_model import YOLOPv2
+    from .preprocessor import ImageDrivingPerceptionPreprocessor
+    from .utils import (scale_coords, non_max_suppression,
+                        split_for_trace_model, driving_area_mask,
+                        lane_line_mask)
+
+else:
+    _import_structure = {
+        'image_driving_percetion_model': ['YOLOPv2'],
+        'preprocessor': ['ImageDrivingPerceptionPreprocessor'],
+        'utils': [
+            'scale_coords', 'non_max_suppression', 'split_for_trace_model',
+            'driving_area_mask', 'lane_line_mask'
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/image_driving_perception/image_driving_percetion_model.py b/modelscope/models/cv/image_driving_perception/image_driving_percetion_model.py
new file mode 100644
index 00000000..b7de37e7
--- /dev/null
+++ b/modelscope/models/cv/image_driving_perception/image_driving_percetion_model.py
@@ -0,0 +1,60 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['YOLOPv2']
+
+
+@MODELS.register_module(
+    Tasks.image_driving_perception, module_name=Models.yolopv2)
+class YOLOPv2(TorchModel):
+    """ YOLOPv2 use E-ELAN which first adopted in Yolov7 as backbone, SPP+FPN+PAN as neck and head.
+    For more infomation, please refer to https://arxiv.org/pdf/2208.11434.pdf
+    """
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        super().__init__(model_dir, *args, **kwargs)
+
+        self.model_dir = model_dir
+        self._load_pretrained_checkpoint()
+
+    def forward(self, data):
+        img = data['img']
+        with torch.no_grad():
+            [pred, anchor_grid], seg, ll = self.model(img)
+        return {
+            'img_hw': data['img'].shape[2:],
+            'pred': pred,
+            'anchor_grid': anchor_grid,
+            'driving_area_mask': seg,
+            'lane_line_mask': ll,
+        }
+
+    def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        return super().postprocess(inputs, **kwargs)
+
+    def _load_pretrained_checkpoint(self):
+        model_path = os.path.join(self.model_dir, ModelFile.TORCH_MODEL_FILE)
+        logger.info(model_path)
+        if os.path.exists(model_path):
+            self.model = torch.jit.load(model_path, 'cpu')
+            self.model = self.model.eval()
+
+        else:
+            logger.error(
+                '[checkModelPath]:model path dose not exits!!! model Path:'
+                + model_path)
+            raise Exception('[checkModelPath]:model path dose not exits!')
diff --git a/modelscope/models/cv/image_driving_perception/preprocessor.py b/modelscope/models/cv/image_driving_perception/preprocessor.py
new file mode 100644
index 00000000..dbb4f761
--- /dev/null
+++ b/modelscope/models/cv/image_driving_perception/preprocessor.py
@@ -0,0 +1,120 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Union
+
+import cv2
+import numpy as np
+import torch
+
+from modelscope.metainfo import Preprocessors
+from modelscope.preprocessors import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
+from modelscope.preprocessors.image import LoadImage
+from modelscope.utils.constant import Fields, ModeKeys
+from modelscope.utils.type_assert import type_assert
+
+
+@PREPROCESSORS.register_module(
+    Fields.cv, module_name=Preprocessors.image_driving_perception_preprocessor)
+class ImageDrivingPerceptionPreprocessor(Preprocessor):
+
+    def __init__(self, mode: str = ModeKeys.INFERENCE, *args, **kwargs):
+        """
+        Args:
+            model_dir (str): model directory to initialize some resource.
+            mode: The mode for the preprocessor.
+        """
+        super().__init__(mode, *args, **kwargs)
+
+    def _check_image(self, input_img):
+        whole_temp_shape = input_img.shape
+        if len(whole_temp_shape) == 2:
+            input_img = np.stack([input_img, input_img, input_img], axis=2)
+        elif whole_temp_shape[2] == 1:
+            input_img = np.concatenate([input_img, input_img, input_img],
+                                       axis=2)
+        elif whole_temp_shape[2] == 4:
+            input_img = input_img[:, :,
+                                  0:3] * 1.0 * input_img[:, :,
+                                                         3:4] * 1.0 / 255.0
+        return input_img
+
+    def _letterbox(self,
+                   img,
+                   new_shape=(640, 640),
+                   color=(114, 114, 114),
+                   auto=True,
+                   scaleFill=False,
+                   scaleup=True,
+                   stride=32):
+        # Resize and pad image while meeting stride-multiple constraints
+        shape = img.shape[:2]  # current shape [height, width]
+        if isinstance(new_shape, int):
+            new_shape = (new_shape, new_shape)
+        # Scale ratio (new / old)
+        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+
+        if not scaleup:  # only scale down, do not scale up (for better test mAP)
+            r = min(r, 1.0)
+
+        # Compute padding
+        ratio = r, r  # width, height ratios
+        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+        dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[
+            1]  # wh padding
+        if auto:  # minimum rectangle
+            dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding
+        elif scaleFill:  # stretch
+            dw, dh = 0.0, 0.0
+            new_unpad = (new_shape[1], new_shape[0])
+            ratio = new_shape[1] / shape[1], new_shape[0] / shape[
+                0]  # width, height ratios
+
+        dw /= 2  # divide padding into 2 sides
+        dh /= 2
+
+        if shape[::-1] != new_unpad:  # resize
+            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
+
+        top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
+        left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
+
+        img = cv2.copyMakeBorder(
+            img, top, bottom, left, right, cv2.BORDER_CONSTANT,
+            value=color)  # add border
+
+        return img, ratio, (dw, dh)
+
+    @type_assert(object, object)
+    def __call__(
+        self, data: str, output_shape=(1280, 720), new_shape=(640, 640)
+    ) -> Dict[str, Any]:
+        """process the raw input data
+        Args:
+            data (str): image path
+        Returns:
+            Dict[ndarry, Any]: the preprocessed data
+            {
+                "img": the preprocessed resized image (640x640)
+            }
+        """
+        img = LoadImage.convert_to_ndarray(data)
+        if img is not None:
+            img = self._check_image(img)
+        else:
+            raise Exception('img is None')
+        img = cv2.resize(img, output_shape, interpolation=cv2.INTER_LINEAR)
+        img = self._letterbox(img, new_shape)[0]
+        img = img.transpose(2, 0, 1)  # to 3x640x640
+
+        img = np.ascontiguousarray(img)
+        img = torch.from_numpy(img)
+        img = img.float()  # uint8 to fp16/32
+        # Convert
+        img /= 255.0  # 0 - 255 to 0.0 - 1.0
+
+        if img.ndimension() == 3:
+            img = img.unsqueeze(0)
+
+        return {
+            'img': img,
+        }
diff --git a/modelscope/models/cv/image_driving_perception/utils.py b/modelscope/models/cv/image_driving_perception/utils.py
new file mode 100644
index 00000000..82f16ed6
--- /dev/null
+++ b/modelscope/models/cv/image_driving_perception/utils.py
@@ -0,0 +1,208 @@
+# Part of the implementation is borrowed and modified from internet,
+# publicly available at https://github.com/CAIC-AD/YOLOPv2
+import time
+
+import numpy as np
+import torch
+from torchvision.ops import nms
+
+
+def _make_grid(nx=20, ny=20):
+    yv, xv = torch.meshgrid(
+        [torch.arange(ny), torch.arange(nx)], indexing='ij')
+    return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()
+
+
+def split_for_trace_model(pred=None, anchor_grid=None):
+    z = []
+    st = [8, 16, 32]
+    for i in range(3):
+        bs, _, ny, nx = pred[i].shape
+        pred[i] = pred[i].view(bs, 3, 85, ny, nx).permute(0, 1, 3, 4,
+                                                          2).contiguous()
+        y = pred[i].sigmoid()
+        gr = _make_grid(nx, ny).to(pred[i].device)
+        y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + gr) * st[i]  # xy
+        y[..., 2:4] = (y[..., 2:4] * 2)**2 * anchor_grid[i]  # wh
+        z.append(y.view(bs, -1, 85))
+    pred = torch.cat(z, 1)
+    return pred
+
+
+def scale_coords(img1_shape,
+                 coords,
+                 img0_shape=(720, 1280, 3),
+                 ratio_pad=None):
+    # Rescale coords (xyxy) from img1_shape to img0_shape
+    if ratio_pad is None:  # calculate from img0_shape
+        gain = min(img1_shape[0] / img0_shape[0],
+                   img1_shape[1] / img0_shape[1])  # gain  = old / new
+        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (
+            img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
+    else:
+        gain = ratio_pad[0][0]
+        pad = ratio_pad[1]
+
+    coords[:, [0, 2]] -= pad[0]  # x padding
+    coords[:, [1, 3]] -= pad[1]  # y padding
+    coords[:, :4] /= gain
+    clip_coords(coords, img0_shape)
+    return coords
+
+
+def clip_coords(boxes, img_shape):
+    # Clip bounding xyxy bounding boxes to image shape (height, width)
+    boxes[:, 0].clamp_(0, img_shape[1])  # x1
+    boxes[:, 1].clamp_(0, img_shape[0])  # y1
+    boxes[:, 2].clamp_(0, img_shape[1])  # x2
+    boxes[:, 3].clamp_(0, img_shape[0])  # y2
+
+
+def xywh2xyxy(x):
+    # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
+    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
+    y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
+    y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
+    y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
+    y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
+    return y
+
+
+def non_max_suppression(prediction,
+                        conf_thres=0.3,
+                        iou_thres=0.45,
+                        classes=None,
+                        agnostic=False,
+                        multi_label=False,
+                        labels=()):
+    """Runs Non-Maximum Suppression (NMS) on inference results
+
+    Returns:
+         list of detections, on (n,6) tensor per image [xyxy, conf, cls]
+    """
+
+    nc = prediction.shape[2] - 5  # number of classes
+    xc = prediction[..., 4] > conf_thres  # candidates
+
+    # Settings
+    max_wh = 4096  # (pixels) minimum and maximum box width and height
+    max_det = 300  # maximum number of detections per image
+    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
+    time_limit = 10.0  # seconds to quit after
+    redundant = True  # require redundant detections
+    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
+    merge = False  # use merge-NMS
+
+    t = time.time()
+    output = [torch.zeros(
+        (0, 6), device=prediction.device)] * prediction.shape[0]
+    for xi, x in enumerate(prediction):  # image index, image inference
+        # Apply constraints
+        x = x[xc[xi]]  # confidence
+
+        # Cat apriori labels if autolabelling
+        if labels and len(labels[xi]):
+            lbs = labels[xi]
+            v = torch.zeros((len(lbs), nc + 5), device=x.device)
+            v[:, :4] = lbs[:, 1:5]  # box
+            v[:, 4] = 1.0  # conf
+            v[range(len(lbs)), lbs[:, 0].long() + 5] = 1.0  # cls
+            x = torch.cat((x, v), 0)
+
+        # If none remain process next image
+        if not x.shape[0]:
+            continue
+
+        # Compute conf
+        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf
+
+        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
+        box = xywh2xyxy(x[:, :4])
+
+        # Detections matrix nx6 (xyxy, conf, cls)
+        if multi_label:
+            i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
+            x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
+        else:  # best class only
+            conf, j = x[:, 5:].max(1, keepdim=True)
+            x = torch.cat((box, conf, j.float()),
+                          1)[conf.view(-1) > conf_thres]
+
+        # Filter by class
+        if classes is not None:
+            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
+
+        # Check shape
+        n = x.shape[0]  # number of boxes
+        if not n:  # no boxes
+            continue
+        elif n > max_nms:  # excess boxes
+            x = x[x[:, 4].argsort(
+                descending=True)[:max_nms]]  # sort by confidence
+
+        # Batched NMS
+        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
+        boxes, scores = x[:, :4] + c, x[:,
+                                        4]  # boxes (offset by class), scores
+        i = nms(boxes, scores, iou_thres)  # NMS
+        if i.shape[0] > max_det:  # limit detections
+            i = i[:max_det]
+        if merge and (1 < n and n < 3E3):
+            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
+            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
+            weights = iou * scores[None]  # box weights
+            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(
+                1, keepdim=True)  # merged boxes
+            if redundant:
+                i = i[iou.sum(1) > 1]  # require redundancy
+
+        output[xi] = x[i]
+        if (time.time() - t) > time_limit:
+            print(f'WARNING: NMS time limit {time_limit}s exceeded')
+            break  # time limit exceeded
+
+    return output
+
+
+def box_iou(box1, box2):
+    # https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
+    """
+    Return intersection-over-union (Jaccard index) of boxes.
+    Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
+    Args:
+        box1 (Tensor[N, 4])
+        box2 (Tensor[M, 4])
+    Returns:
+        iou (Tensor[N, M]): the NxM matrix containing the pairwise
+            IoU values for every element in boxes1 and boxes2
+    """
+
+    def box_area(box):
+        # box = 4xn
+        return (box[2] - box[0]) * (box[3] - box[1])
+
+    area1 = box_area(box1.T)
+    area2 = box_area(box2.T)
+
+    inter = (torch.min(box1[:, None, 2:], box2[:, 2:])
+             - torch.max(box1[:, None, :2], box2[:, :2])).clamp(0).prod(2)
+    return inter / (area1[:, None] + area2 - inter
+                    )  # iou = inter / (area1 + area2 - inter)
+
+
+def driving_area_mask(seg=None):
+    da_predict = seg[:, :, 12:372, :]
+    da_seg_mask = torch.nn.functional.interpolate(
+        da_predict, scale_factor=2, mode='bilinear')
+    _, da_seg_mask = torch.max(da_seg_mask, 1)
+    da_seg_mask = da_seg_mask.int().squeeze().cpu().numpy()
+    return da_seg_mask
+
+
+def lane_line_mask(ll=None):
+    ll_predict = ll[:, :, 12:372, :]
+    ll_seg_mask = torch.nn.functional.interpolate(
+        ll_predict, scale_factor=2, mode='bilinear')
+    ll_seg_mask = torch.round(ll_seg_mask).squeeze(1)
+    ll_seg_mask = ll_seg_mask.int().squeeze().cpu().numpy()
+    return ll_seg_mask
diff --git a/modelscope/models/cv/image_human_parsing/__init__.py b/modelscope/models/cv/image_human_parsing/__init__.py
new file mode 100644
index 00000000..787c0353
--- /dev/null
+++ b/modelscope/models/cv/image_human_parsing/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .m2fp_net import M2FP
+    from parsing_utils import center_to_target_size_test
+else:
+    _import_structure = {
+        'm2fp_net': ['M2FP'],
+        'parsing_utils': ['center_to_target_size_test']
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/image_human_parsing/backbone/__init__.py b/modelscope/models/cv/image_human_parsing/backbone/__init__.py
new file mode 100644
index 00000000..47bf7f72
--- /dev/null
+++ b/modelscope/models/cv/image_human_parsing/backbone/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .deeplab_resnet import build_resnet_deeplab_backbone
+
+else:
+    _import_structure = {
+        'deeplab_resnet': ['build_resnet_deeplab_backbone'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/image_human_parsing/backbone/deeplab_resnet.py b/modelscope/models/cv/image_human_parsing/backbone/deeplab_resnet.py
new file mode 100644
index 00000000..d8f890b8
--- /dev/null
+++ b/modelscope/models/cv/image_human_parsing/backbone/deeplab_resnet.py
@@ -0,0 +1,377 @@
+# Part of the implementation is borrowed and modified from Detectron2, publicly available at
+# https://github.com/facebookresearch/detectron2/blob/main/projects/DeepLab/deeplab/resnet.py
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from modelscope.models.cv.image_instance_segmentation.maskdino.utils import \
+    Conv2d
+
+
+def get_norm(norm, out_channels):
+    if norm is None:
+        return None
+    if isinstance(norm, str):
+        if len(norm) == 0:
+            return None
+        norm = {
+            'BN': torch.nn.BatchNorm2d,
+            'GN': lambda channels: nn.GroupNorm(32, channels),
+            'nnSyncBN': nn.SyncBatchNorm,
+        }[norm]
+    return norm(out_channels)
+
+
+class BasicBlock(nn.Module):
+
+    def __init__(self, in_channels, out_channels, *, stride=1, norm='BN'):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.stride = stride
+
+        if in_channels != out_channels:
+            self.shortcut = Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=stride,
+                bias=False,
+                norm=get_norm(norm, out_channels))
+        else:
+            self.shortcut = None
+
+        self.conv1 = Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            bias=False,
+            norm=get_norm(norm, out_channels))
+
+        self.conv2 = Conv2d(
+            out_channels,
+            out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=False,
+            norm=get_norm(norm, out_channels))
+
+    def forward(self, x):
+        out = self.conv1(x)
+        out = F.relu_(out)
+        out = self.conv2(out)
+
+        if self.shortcut is not None:
+            shortcut = self.shortcut(x)
+        else:
+            shortcut = x
+
+        out += shortcut
+        out = F.relu_(out)
+        return out
+
+
+class BottleneckBlock(nn.Module):
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 *,
+                 bottleneck_channels,
+                 stride=1,
+                 num_groups=1,
+                 norm='BN',
+                 stride_in_1x1=False,
+                 dilation=1):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.stride = stride
+
+        if in_channels != out_channels:
+            self.shortcut = Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=stride,
+                bias=False,
+                norm=get_norm(norm, out_channels),
+            )
+        else:
+            self.shortcut = None
+
+        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
+
+        self.conv1 = Conv2d(
+            in_channels,
+            bottleneck_channels,
+            kernel_size=1,
+            stride=stride_1x1,
+            bias=False,
+            norm=get_norm(norm, bottleneck_channels))
+        self.conv2 = Conv2d(
+            bottleneck_channels,
+            bottleneck_channels,
+            kernel_size=3,
+            stride=stride_3x3,
+            padding=1 * dilation,
+            bias=False,
+            groups=num_groups,
+            dilation=dilation,
+            norm=get_norm(norm, bottleneck_channels))
+        self.conv3 = Conv2d(
+            bottleneck_channels,
+            out_channels,
+            kernel_size=1,
+            bias=False,
+            norm=get_norm(norm, out_channels))
+
+    def forward(self, x):
+        out = self.conv1(x)
+        out = F.relu_(out)
+
+        out = self.conv2(out)
+        out = F.relu_(out)
+
+        out = self.conv3(out)
+
+        if self.shortcut is not None:
+            shortcut = self.shortcut(x)
+        else:
+            shortcut = x
+
+        out += shortcut
+        out = F.relu_(out)
+        return out
+
+
+class DeepLabStem(nn.Module):
+
+    def __init__(self, in_channels=3, out_channels=128, norm='BN'):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.stride = 4
+        self.conv1 = Conv2d(
+            in_channels,
+            out_channels // 2,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            bias=False,
+            norm=get_norm(norm, out_channels // 2))
+        self.conv2 = Conv2d(
+            out_channels // 2,
+            out_channels // 2,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=False,
+            norm=get_norm(norm, out_channels // 2))
+        self.conv3 = Conv2d(
+            out_channels // 2,
+            out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=False,
+            norm=get_norm(norm, out_channels))
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = F.relu_(x)
+        x = self.conv2(x)
+        x = F.relu_(x)
+        x = self.conv3(x)
+        x = F.relu_(x)
+        x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
+        return x
+
+
+class DeeplabResNet(nn.Module):
+
+    def __init__(self, stem, stages, num_classes=None, out_features=None):
+        super().__init__()
+        self.stem = stem
+        self.num_classes = num_classes
+
+        current_stride = self.stem.stride
+        self._out_feature_strides = {'stem': current_stride}
+        self._out_feature_channels = {'stem': self.stem.out_channels}
+
+        self.stage_names, self.stages = [], []
+
+        if out_features is not None:
+            num_stages = max([{
+                'res2': 1,
+                'res3': 2,
+                'res4': 3,
+                'res5': 4
+            }.get(f, 0) for f in out_features])
+            stages = stages[:num_stages]
+        for i, blocks in enumerate(stages):
+            assert len(blocks) > 0, len(blocks)
+            for block in blocks:
+                assert isinstance(block, nn.Module), block
+
+            name = 'res' + str(i + 2)
+            stage = nn.Sequential(*blocks)
+
+            self.add_module(name, stage)
+            self.stage_names.append(name)
+            self.stages.append(stage)
+
+            self._out_feature_strides[name] = current_stride = int(
+                current_stride * np.prod([k.stride for k in blocks]))
+            self._out_feature_channels[name] = curr_channels = blocks[
+                -1].out_channels
+        self.stage_names = tuple(
+            self.stage_names)  # Make it static for scripting
+
+        if num_classes is not None:
+            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+            self.linear = nn.Linear(curr_channels, num_classes)
+            nn.init.normal_(self.linear.weight, std=0.01)
+            name = 'linear'
+
+        if out_features is None:
+            out_features = [name]
+        self._out_features = out_features
+        assert len(self._out_features)
+        children = [x[0] for x in self.named_children()]
+        for out_feature in self._out_features:
+            assert out_feature in children, 'Available children: {}'.format(
+                ', '.join(children))
+
+    def forward(self, x):
+        assert x.dim(
+        ) == 4, f'ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!'
+        outputs = {}
+        x = self.stem(x)
+        if 'stem' in self._out_features:
+            outputs['stem'] = x
+        for name, stage in zip(self.stage_names, self.stages):
+            x = stage(x)
+            if name in self._out_features:
+                outputs[name] = x
+        if self.num_classes is not None:
+            x = self.avgpool(x)
+            x = torch.flatten(x, 1)
+            x = self.linear(x)
+            if 'linear' in self._out_features:
+                outputs['linear'] = x
+        return outputs
+
+    def output_shape(self):
+        return {
+            name: dict(
+                channels=self._out_feature_channels[name],
+                stride=self._out_feature_strides[name])
+            for name in self._out_features
+        }
+
+    @property
+    def size_divisibility(self) -> int:
+        return 0
+
+    @staticmethod
+    def make_stage(block_class, num_blocks, *, in_channels, out_channels,
+                   **kwargs):
+        blocks = []
+        for i in range(num_blocks):
+            curr_kwargs = {}
+            for k, v in kwargs.items():
+                if k.endswith('_per_block'):
+                    assert len(v) == num_blocks, (
+                        f"Argument '{k}' of make_stage should have the "
+                        f'same length as num_blocks={num_blocks}.')
+                    newk = k[:-len('_per_block')]
+                    assert newk not in kwargs, f'Cannot call make_stage with both {k} and {newk}!'
+                    curr_kwargs[newk] = v[i]
+                else:
+                    curr_kwargs[k] = v
+
+            blocks.append(
+                block_class(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    **curr_kwargs))
+            in_channels = out_channels
+        return blocks
+
+
+def build_resnet_deeplab_backbone(out_features, depth, num_groups,
+                                  width_per_group, norm, stem_out_channels,
+                                  res2_out_channels, stride_in_1x1,
+                                  res4_dilation, res5_dilation,
+                                  res5_multi_grid, input_shape):
+    stem = DeepLabStem(
+        in_channels=input_shape['channels'],
+        out_channels=stem_out_channels,
+        norm=norm)
+    bottleneck_channels = num_groups * width_per_group
+    in_channels = stem_out_channels
+    out_channels = res2_out_channels
+
+    assert res4_dilation in {
+        1, 2
+    }, 'res4_dilation cannot be {}.'.format(res4_dilation)
+    assert res5_dilation in {
+        1, 2, 4
+    }, 'res5_dilation cannot be {}.'.format(res5_dilation)
+    if res4_dilation == 2:
+        # Always dilate res5 if res4 is dilated.
+        assert res5_dilation == 4
+
+    num_blocks_per_stage = {
+        50: [3, 4, 6, 3],
+        101: [3, 4, 23, 3],
+        152: [3, 8, 36, 3]
+    }[depth]
+
+    stages = []
+    out_stage_idx = [{
+        'res2': 2,
+        'res3': 3,
+        'res4': 4,
+        'res5': 5
+    }[f] for f in out_features]
+    max_stage_idx = max(out_stage_idx)
+    for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)):
+        if stage_idx == 4:
+            dilation = res4_dilation
+        elif stage_idx == 5:
+            dilation = res5_dilation
+        else:
+            dilation = 1
+        first_stride = 1 if idx == 0 or dilation > 1 else 2
+        stride_per_block = [first_stride]
+        stride_per_block += [1] * (num_blocks_per_stage[idx] - 1)
+        stage_kargs = {
+            'num_blocks': num_blocks_per_stage[idx],
+            'stride_per_block': stride_per_block,
+            'in_channels': in_channels,
+            'out_channels': out_channels,
+            'norm': norm,
+            'bottleneck_channels': bottleneck_channels,
+            'stride_in_1x1': stride_in_1x1,
+            'dilation': dilation,
+            'num_groups': num_groups,
+            'block_class': BottleneckBlock
+        }
+        if stage_idx == 5:
+            stage_kargs.pop('dilation')
+            stage_kargs['dilation_per_block'] = [
+                dilation * mg for mg in res5_multi_grid
+            ]
+        blocks = DeeplabResNet.make_stage(**stage_kargs)
+        in_channels = out_channels
+        out_channels *= 2
+        bottleneck_channels *= 2
+        stages.append(blocks)
+    return DeeplabResNet(stem, stages, out_features=out_features)
diff --git a/modelscope/models/cv/image_human_parsing/m2fp/__init__.py b/modelscope/models/cv/image_human_parsing/m2fp/__init__.py
new file mode 100644
index 00000000..d90618d3
--- /dev/null
+++ b/modelscope/models/cv/image_human_parsing/m2fp/__init__.py
@@ -0,0 +1,24 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .m2fp_encoder import MSDeformAttnPixelDecoder
+    from .m2fp_decoder import MultiScaleMaskedTransformerDecoder
+
+else:
+    _import_structure = {
+        'm2fp_encoder': ['MSDeformAttnPixelDecoder'],
+        'm2fp_decoder': ['MultiScaleMaskedTransformerDecoder'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/image_human_parsing/m2fp/m2fp_decoder.py b/modelscope/models/cv/image_human_parsing/m2fp/m2fp_decoder.py
new file mode 100644
index 00000000..f21eae46
--- /dev/null
+++ b/modelscope/models/cv/image_human_parsing/m2fp/m2fp_decoder.py
@@ -0,0 +1,221 @@
+# The implementation is adopted from Mask2Former, made publicly available under the MIT License at
+# https://github.com/facebookresearch/Mask2Former
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from modelscope.models.cv.image_colorization.ddcolor.utils.transformer_utils import (
+    MLP, CrossAttentionLayer, FFNLayer, SelfAttentionLayer)
+from modelscope.models.cv.image_instance_segmentation.maskdino.position_encoding import \
+    PositionEmbeddingSine
+from modelscope.models.cv.image_instance_segmentation.maskdino.utils import \
+    Conv2d
+
+
+class MultiScaleMaskedTransformerDecoder(nn.Module):
+
+    def __init__(
+        self,
+        in_channels,
+        mask_classification=True,
+        *,
+        num_classes: int,
+        hidden_dim: int,
+        num_queries: int,
+        nheads: int,
+        dim_feedforward: int,
+        dec_layers: int,
+        pre_norm: bool,
+        mask_dim: int,
+        enforce_input_project: bool,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            in_channels: channels of the input features
+            mask_classification: whether to add mask classifier or not
+            num_classes: number of classes
+            hidden_dim: Transformer feature dimension
+            num_queries: number of queries
+            nheads: number of heads
+            dim_feedforward: feature dimension in feedforward network
+            dec_layers: number of Transformer decoder layers
+            pre_norm: whether to use pre-LayerNorm or not
+            mask_dim: mask feature dimension
+            enforce_input_project: add input project 1x1 conv even if input
+                channels and hidden dim is identical
+        """
+        super().__init__()
+
+        assert mask_classification, 'Only support mask classification model'
+        self.mask_classification = mask_classification
+
+        # positional encoding
+        N_steps = hidden_dim // 2
+        self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
+
+        # define Transformer decoder here
+        self.num_heads = nheads
+        self.num_layers = dec_layers
+        self.num_classes = num_classes
+        self.transformer_self_attention_layers = nn.ModuleList()
+        self.transformer_cross_attention_layers = nn.ModuleList()
+        self.transformer_ffn_layers = nn.ModuleList()
+
+        for _ in range(self.num_layers):
+            self.transformer_self_attention_layers.append(
+                SelfAttentionLayer(
+                    d_model=hidden_dim,
+                    nhead=nheads,
+                    dropout=0.0,
+                    normalize_before=pre_norm,
+                ))
+
+            self.transformer_cross_attention_layers.append(
+                CrossAttentionLayer(
+                    d_model=hidden_dim,
+                    nhead=nheads,
+                    dropout=0.0,
+                    normalize_before=pre_norm,
+                ))
+
+            self.transformer_ffn_layers.append(
+                FFNLayer(
+                    d_model=hidden_dim,
+                    dim_feedforward=dim_feedforward,
+                    dropout=0.0,
+                    normalize_before=pre_norm,
+                ))
+
+        self.decoder_norm = nn.LayerNorm(hidden_dim)
+
+        self.num_queries = num_queries
+        # learnable query features
+        self.query_feat = nn.Embedding(num_queries, hidden_dim)
+        # learnable query p.e.
+        self.query_embed = nn.Embedding(num_queries, hidden_dim)
+
+        # level embedding (we always use 3 scales)
+        self.num_feature_levels = 3
+        self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim)
+        self.input_proj = nn.ModuleList()
+        for _ in range(self.num_feature_levels):
+            if in_channels != hidden_dim or enforce_input_project:
+                self.input_proj.append(
+                    Conv2d(in_channels, hidden_dim, kernel_size=1))
+            else:
+                self.input_proj.append(nn.Sequential())
+
+        # output FFNs
+        if self.mask_classification:
+            self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
+        self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)
+
+    def forward(self, x, mask_features, mask=None):
+        # x is a list of multi-scale feature
+        assert len(x) == self.num_feature_levels
+        src = []
+        pos = []
+        size_list = []
+
+        # disable mask, it does not affect performance
+        del mask
+
+        for i in range(self.num_feature_levels):
+            size_list.append(x[i].shape[-2:])
+            pos.append(self.pe_layer(x[i], None).flatten(2))
+            src.append(self.input_proj[i](x[i]).flatten(2)
+                       + self.level_embed.weight[i][None, :, None])
+
+            # flatten NxCxHxW to HWxNxC
+            pos[-1] = pos[-1].permute(2, 0, 1)
+            src[-1] = src[-1].permute(2, 0, 1)
+
+        _, bs, _ = src[0].shape
+
+        # QxNxC
+        query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1)
+        output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1)
+
+        predictions_class = []
+        predictions_mask = []
+
+        # prediction heads on learnable query features
+        outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(
+            output, mask_features, attn_mask_target_size=size_list[0])
+        predictions_class.append(outputs_class)
+        predictions_mask.append(outputs_mask)
+
+        for i in range(self.num_layers):
+            level_index = i % self.num_feature_levels
+            attn_mask[torch.where(
+                attn_mask.sum(-1) == attn_mask.shape[-1])] = False
+            # attention: cross-attention first
+            output = self.transformer_cross_attention_layers[i](
+                output,
+                src[level_index],
+                memory_mask=attn_mask,
+                memory_key_padding_mask=
+                None,  # here we do not apply masking on padded region
+                pos=pos[level_index],
+                query_pos=query_embed)
+
+            output = self.transformer_self_attention_layers[i](
+                output,
+                tgt_mask=None,
+                tgt_key_padding_mask=None,
+                query_pos=query_embed)
+
+            # FFN
+            output = self.transformer_ffn_layers[i](output)
+
+            outputs_class, outputs_mask, attn_mask = \
+                self.forward_prediction_heads(
+                    output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels])
+            predictions_class.append(outputs_class)
+            predictions_mask.append(outputs_mask)
+
+        assert len(predictions_class) == self.num_layers + 1
+
+        out = {
+            'pred_logits':
+            predictions_class[-1],
+            'pred_masks':
+            predictions_mask[-1],
+            'aux_outputs':
+            self._set_aux_loss(
+                predictions_class if self.mask_classification else None,
+                predictions_mask)
+        }
+        return out
+
+    def forward_prediction_heads(self, output, mask_features,
+                                 attn_mask_target_size):
+        decoder_output = self.decoder_norm(output)
+        decoder_output = decoder_output.transpose(0, 1)
+        outputs_class = self.class_embed(decoder_output)
+        mask_embed = self.mask_embed(decoder_output)
+        outputs_mask = torch.einsum('bqc,bchw->bqhw', mask_embed,
+                                    mask_features)
+
+        attn_mask = F.interpolate(
+            outputs_mask,
+            size=attn_mask_target_size,
+            mode='bilinear',
+            align_corners=False)
+        attn_mask = (attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(
+            1, self.num_heads, 1, 1).flatten(0, 1) < 0.5).bool()
+        attn_mask = attn_mask.detach()
+
+        return outputs_class, outputs_mask, attn_mask
+
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_seg_masks):
+        if self.mask_classification:
+            return [{
+                'pred_logits': a,
+                'pred_masks': b
+            } for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1])]
+        else:
+            return [{'pred_masks': b} for b in outputs_seg_masks[:-1]]
diff --git a/modelscope/models/cv/image_human_parsing/m2fp/m2fp_encoder.py b/modelscope/models/cv/image_human_parsing/m2fp/m2fp_encoder.py
new file mode 100644
index 00000000..7b9cf78d
--- /dev/null
+++ b/modelscope/models/cv/image_human_parsing/m2fp/m2fp_encoder.py
@@ -0,0 +1,215 @@
+# The implementation is adopted from Mask2Former, made publicly available under the MIT License at
+# https://github.com/facebookresearch/Mask2Former
+
+from typing import Any, Dict, List
+
+import numpy as np
+import torch
+from torch import nn
+from torch.cuda.amp import autocast
+from torch.nn import functional as F
+
+from modelscope.models.cv.image_instance_segmentation.maskdino.maskdino_encoder import \
+    MSDeformAttnTransformerEncoderOnly
+from modelscope.models.cv.image_instance_segmentation.maskdino.position_encoding import \
+    PositionEmbeddingSine
+from modelscope.models.cv.image_instance_segmentation.maskdino.utils import \
+    Conv2d
+
+
+class MSDeformAttnPixelDecoder(nn.Module):
+
+    def __init__(
+        self,
+        input_shape: Dict[str, Any],
+        *,
+        transformer_dropout: float,
+        transformer_nheads: int,
+        transformer_dim_feedforward: int,
+        transformer_enc_layers: int,
+        conv_dim: int,
+        mask_dim: int,
+        # deformable transformer encoder args
+        transformer_in_features: List[str],
+        common_stride: int,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            input_shape: shapes (channels and stride) of the input features
+            transformer_dropout: dropout probability in transformer
+            transformer_nheads: number of heads in transformer
+            transformer_dim_feedforward: dimension of feedforward network
+            transformer_enc_layers: number of transformer encoder layers
+            conv_dim: number of output channels for the intermediate conv layers.
+            mask_dim: number of output channels for the final conv layer.
+        """
+        super().__init__()
+        self.conv_dim = conv_dim
+
+        transformer_input_shape = {
+            k: v
+            for k, v in input_shape.items() if k in transformer_in_features
+        }
+
+        # this is the input shape of pixel decoder
+        input_shape = sorted(input_shape.items(), key=lambda x: x[1]['stride'])
+        self.in_features = [k for k, v in input_shape
+                            ]  # starting from "res2" to "res5"
+        self.feature_strides = [v['stride'] for k, v in input_shape]
+        self.feature_channels = [v['channels'] for k, v in input_shape]
+
+        # this is the input shape of transformer encoder (could use less features than pixel decoder
+        transformer_input_shape = sorted(
+            transformer_input_shape.items(), key=lambda x: x[1]['stride'])
+        self.transformer_in_features = [k for k, v in transformer_input_shape
+                                        ]  # starting from "res2" to "res5"
+        transformer_in_channels = [
+            v['channels'] for k, v in transformer_input_shape
+        ]
+        self.transformer_feature_strides = [
+            v['stride'] for k, v in transformer_input_shape
+        ]  # to decide extra FPN layers
+
+        self.transformer_num_feature_levels = len(self.transformer_in_features)
+        if self.transformer_num_feature_levels > 1:
+            input_proj_list = []
+            # from low resolution to high resolution (res5 -> res2)
+            for in_channels in transformer_in_channels[::-1]:
+                input_proj_list.append(
+                    nn.Sequential(
+                        nn.Conv2d(in_channels, conv_dim, kernel_size=1),
+                        nn.GroupNorm(32, conv_dim),
+                    ))
+            self.input_proj = nn.ModuleList(input_proj_list)
+        else:
+            self.input_proj = nn.ModuleList([
+                nn.Sequential(
+                    nn.Conv2d(
+                        transformer_in_channels[-1], conv_dim, kernel_size=1),
+                    nn.GroupNorm(32, conv_dim),
+                )
+            ])
+
+        for proj in self.input_proj:
+            nn.init.xavier_uniform_(proj[0].weight, gain=1)
+            nn.init.constant_(proj[0].bias, 0)
+
+        self.transformer = MSDeformAttnTransformerEncoderOnly(
+            d_model=conv_dim,
+            dropout=transformer_dropout,
+            nhead=transformer_nheads,
+            dim_feedforward=transformer_dim_feedforward,
+            num_encoder_layers=transformer_enc_layers,
+            num_feature_levels=self.transformer_num_feature_levels,
+        )
+        N_steps = conv_dim // 2
+        self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)
+
+        self.mask_dim = mask_dim
+        # use 1x1 conv instead
+        self.mask_features = Conv2d(
+            conv_dim,
+            mask_dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+
+        self.maskformer_num_feature_levels = 3  # always use 3 scales
+        self.common_stride = common_stride
+
+        # extra fpn levels
+        stride = min(self.transformer_feature_strides)
+        self.num_fpn_levels = int(
+            np.log2(stride) - np.log2(self.common_stride))
+
+        lateral_convs = []
+        output_convs = []
+
+        use_bias = False
+        for idx, in_channels in enumerate(
+                self.feature_channels[:self.num_fpn_levels]):
+            lateral_norm = nn.GroupNorm(32, conv_dim)
+            output_norm = nn.GroupNorm(32, conv_dim)
+
+            lateral_conv = Conv2d(
+                in_channels,
+                conv_dim,
+                kernel_size=1,
+                bias=use_bias,
+                norm=lateral_norm)
+            output_conv = Conv2d(
+                conv_dim,
+                conv_dim,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=use_bias,
+                norm=output_norm,
+                activation=F.relu,
+            )
+            self.add_module('adapter_{}'.format(idx + 1), lateral_conv)
+            self.add_module('layer_{}'.format(idx + 1), output_conv)
+
+            lateral_convs.append(lateral_conv)
+            output_convs.append(output_conv)
+        # Place convs into top-down order (from low to high resolution)
+        # to make the top-down computation in forward clearer.
+        self.lateral_convs = lateral_convs[::-1]
+        self.output_convs = output_convs[::-1]
+
+    @autocast(enabled=False)
+    def forward_features(self, features):
+        srcs = []
+        pos = []
+        # Reverse feature maps into top-down order (from low to high resolution)
+        for idx, f in enumerate(self.transformer_in_features[::-1]):
+            x = features[f].float(
+            )  # deformable detr does not support half precision
+            srcs.append(self.input_proj[idx](x))
+            pos.append(self.pe_layer(x))
+
+        y, spatial_shapes, level_start_index = self.transformer(
+            srcs, None, pos)
+        bs = y.shape[0]
+
+        split_size_or_sections = [None] * self.transformer_num_feature_levels
+        for i in range(self.transformer_num_feature_levels):
+            if i < self.transformer_num_feature_levels - 1:
+                split_size_or_sections[i] = level_start_index[
+                    i + 1] - level_start_index[i]
+            else:
+                split_size_or_sections[i] = y.shape[1] - level_start_index[i]
+        y = torch.split(y, split_size_or_sections, dim=1)
+
+        out = []
+        multi_scale_features = []
+        num_cur_levels = 0
+        for i, z in enumerate(y):
+            out.append(
+                z.transpose(1, 2).view(bs, -1, spatial_shapes[i][0],
+                                       spatial_shapes[i][1]))
+
+        # append `out` with extra FPN levels
+        # Reverse feature maps into top-down order (from low to high resolution)
+        for idx, f in enumerate(self.in_features[:self.num_fpn_levels][::-1]):
+            x = features[f].float()
+            lateral_conv = self.lateral_convs[idx]
+            output_conv = self.output_convs[idx]
+            cur_fpn = lateral_conv(x)
+            # Following FPN implementation, we use nearest upsampling here
+            y = cur_fpn + F.interpolate(
+                out[-1],
+                size=cur_fpn.shape[-2:],
+                mode='bilinear',
+                align_corners=False)
+            y = output_conv(y)
+            out.append(y)
+
+        for o in out:
+            if num_cur_levels < self.maskformer_num_feature_levels:
+                multi_scale_features.append(o)
+                num_cur_levels += 1
+
+        return self.mask_features(out[-1]), out[0], multi_scale_features
diff --git a/modelscope/models/cv/image_human_parsing/m2fp_net.py b/modelscope/models/cv/image_human_parsing/m2fp_net.py
new file mode 100644
index 00000000..3f771663
--- /dev/null
+++ b/modelscope/models/cv/image_human_parsing/m2fp_net.py
@@ -0,0 +1,363 @@
+# Part of the implementation is borrowed and modified from M2FP, made publicly available
+# under the CC BY-NC 4.0 License at https://github.com/soeaver/M2FP
+import os
+from typing import Any, Dict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.image_instance_segmentation.maskdino_swin import \
+    ImageList
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .backbone import build_resnet_deeplab_backbone
+from .m2fp.m2fp_decoder import MultiScaleMaskedTransformerDecoder
+from .m2fp.m2fp_encoder import MSDeformAttnPixelDecoder
+
+logger = get_logger()
+
+
+@MODELS.register_module(Tasks.image_segmentation, module_name=Models.m2fp)
+class M2FP(TorchModel):
+
+    def __init__(self,
+                 model_dir,
+                 backbone=None,
+                 encoder=None,
+                 decoder=None,
+                 pretrained=None,
+                 input_single_human=None,
+                 classes=None,
+                 num_parsing=None,
+                 single_human=True,
+                 parsing_ins_score_thr=0.5,
+                 parsing_on=False,
+                 semantic_on=True,
+                 sem_seg_postprocess_before_inference=True,
+                 **kwargs):
+        """
+        Deep Learning Technique for Human Parsing: A Survey and Outlook. See https://arxiv.org/abs/2301.00394
+        Args:
+            backbone (dict): backbone config.
+            encoder (dict): encoder config.
+            decoder (dict): decoder config.
+            pretrained (bool): whether to use pretrained model
+            input_single_human (dict): input size config for single human parsing
+            classes (list): class names
+            num_parsing (int): total number of parsing instances, only for multiple human parsing
+            single_human (bool): whether the task is single human parsing
+            parsing_ins_score_thr: instance score threshold for multiple human parsing
+            parsing_on (bool): whether to parse results, only for multiple human parsing
+            semantic_on (bool): whether to output semantic map
+            sem_seg_postprocess_before_inference: whether to resize the prediction back
+                to original input size before semantic segmentation inference or after.
+        """
+        super(M2FP, self).__init__(model_dir, **kwargs)
+
+        self.register_buffer(
+            'pixel_mean',
+            torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1), False)
+        self.register_buffer(
+            'pixel_std',
+            torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1), False)
+        self.size_divisibility = 32
+
+        self.backbone = build_resnet_deeplab_backbone(
+            **backbone, input_shape={'channels': 3})
+        in_features = encoder.pop('in_features')
+        input_shape = {
+            k: v
+            for k, v in self.backbone.output_shape().items()
+            if k in in_features
+        }
+        encoder = MSDeformAttnPixelDecoder(input_shape=input_shape, **encoder)
+        decoder = MultiScaleMaskedTransformerDecoder(
+            in_channels=encoder.conv_dim, **decoder)
+        self.sem_seg_head = M2FPHead(
+            pixel_decoder=encoder, transformer_predictor=decoder)
+        self.num_classes = decoder.num_classes
+        self.num_queries = decoder.num_queries
+        self.test_topk_per_image = 100
+
+        self.input_single_human = input_single_human
+        self.classes = classes
+        self.num_parsing = num_parsing
+        self.single_human = single_human
+        self.parsing_ins_score_thr = parsing_ins_score_thr
+        self.parsing_on = parsing_on
+        self.semantic_on = semantic_on
+        self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference or parsing_on
+
+        if not self.semantic_on:
+            assert self.sem_seg_postprocess_before_inference
+
+        if pretrained:
+            model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+            logger.info(f'loading model from {model_path}')
+            weight = torch.load(model_path, map_location='cpu')['model']
+            tgt_weight = self.state_dict()
+            for name in list(weight.keys()):
+                if name in tgt_weight:
+                    load_size = weight[name].size()
+                    tgt_size = tgt_weight[name].size()
+                    mis_match = False
+                    if len(load_size) != len(tgt_size):
+                        mis_match = True
+                    else:
+                        for n1, n2 in zip(load_size, tgt_size):
+                            if n1 != n2:
+                                mis_match = True
+                                break
+                    if mis_match:
+                        logger.info(
+                            f'size mismatch for {name} '
+                            f'({load_size} -> {tgt_size}), skip loading.')
+                        del weight[name]
+                else:
+                    logger.info(
+                        f'{name} doesn\'t exist in current model, skip loading.'
+                    )
+
+            self.load_state_dict(weight, strict=False)
+            logger.info('load model done')
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        batched_inputs = input['batched_inputs']
+        images = [x['image'].to(self.device) for x in batched_inputs]
+        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
+        images = ImageList.from_tensors(images, self.size_divisibility)
+
+        features = self.backbone(images.tensor)
+        outputs = self.sem_seg_head(features)
+
+        return dict(
+            outputs=outputs, batched_inputs=batched_inputs, images=images)
+
+    def postprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        outputs = input['outputs']
+        batched_inputs = input['batched_inputs']
+        images = input['images']
+        if self.training:
+            raise NotImplementedError
+        else:
+            mask_cls_results = outputs['pred_logits']  # (B, Q, C+1)
+            mask_pred_results = outputs['pred_masks']  # (B, Q, H, W)
+            # upsample masks
+            mask_pred_results = F.interpolate(
+                mask_pred_results,
+                size=(images.tensor.shape[-2], images.tensor.shape[-1]),
+                mode='bilinear',
+                align_corners=False,
+            )
+
+            del outputs
+
+            processed_results = []
+            for mask_cls_result, mask_pred_result, input_per_image, image_size in zip(
+                    mask_cls_results, mask_pred_results, batched_inputs,
+                    images.image_sizes):
+                height = input_per_image.get('height', image_size[0])
+                width = input_per_image.get('width', image_size[1])
+                processed_results.append({})  # for each image
+
+                if self.sem_seg_postprocess_before_inference:
+                    if not self.single_human:
+                        mask_pred_result = self.sem_seg_postprocess(
+                            mask_pred_result, image_size, height, width)
+                    else:
+                        mask_pred_result = self.single_human_sem_seg_postprocess(
+                            mask_pred_result, image_size,
+                            input_per_image['crop_box'], height, width)
+                    mask_cls_result = mask_cls_result.to(mask_pred_result)
+
+                # semantic segmentation inference
+                if self.semantic_on:
+                    r = self.semantic_inference(mask_cls_result,
+                                                mask_pred_result)
+                    if not self.sem_seg_postprocess_before_inference:
+                        if not self.single_human:
+                            r = self.sem_seg_postprocess(
+                                r, image_size, height, width)
+                        else:
+                            r = self.single_human_sem_seg_postprocess(
+                                r, image_size, input_per_image['crop_box'],
+                                height, width)
+                        processed_results[-1]['sem_seg'] = r
+
+                # parsing inference
+                if self.parsing_on:
+                    parsing_r = self.instance_parsing_inference(
+                        mask_cls_result, mask_pred_result)
+                    processed_results[-1]['parsing'] = parsing_r
+
+        return dict(eval_result=processed_results)
+
+    @property
+    def device(self):
+        return self.pixel_mean.device
+
+    def single_human_sem_seg_postprocess(self, result, img_size, crop_box,
+                                         output_height, output_width):
+        result = result[:, :img_size[0], :img_size[1]]
+        result = result[:, crop_box[1]:crop_box[3],
+                        crop_box[0]:crop_box[2]].expand(1, -1, -1, -1)
+        result = F.interpolate(
+            result,
+            size=(output_height, output_width),
+            mode='bilinear',
+            align_corners=False)[0]
+        return result
+
+    def sem_seg_postprocess(self, result, img_size, output_height,
+                            output_width):
+        result = result[:, :img_size[0], :img_size[1]].expand(1, -1, -1, -1)
+        result = F.interpolate(
+            result,
+            size=(output_height, output_width),
+            mode='bilinear',
+            align_corners=False)[0]
+        return result
+
+    def semantic_inference(self, mask_cls, mask_pred):
+        mask_cls = F.softmax(
+            mask_cls, dim=-1)[..., :-1]  # discard non-sense category
+        mask_pred = mask_pred.sigmoid()
+        semseg = torch.einsum('qc,qhw->chw', mask_cls, mask_pred)
+        return semseg
+
+    def instance_parsing_inference(self, mask_cls, mask_pred):
+        scores = F.softmax(mask_cls, dim=-1)[:, :-1]
+        labels = torch.arange(
+            self.num_classes,
+            device=self.device).unsqueeze(0).repeat(self.num_queries,
+                                                    1).flatten(0, 1)
+
+        scores_per_image, topk_indices = scores.flatten(0, 1).topk(
+            self.test_topk_per_image, sorted=False)
+        labels_per_image = labels[topk_indices]
+
+        topk_indices = topk_indices // self.num_classes
+        mask_pred = mask_pred[topk_indices]
+
+        binary_pred_masks = (mask_pred > 0).float()
+        mask_scores_per_image = (mask_pred.sigmoid().flatten(1) * binary_pred_masks.flatten(1)).sum(1) / \
+                                (binary_pred_masks.flatten(1).sum(1) + 1e-6)
+
+        pred_scores = scores_per_image * mask_scores_per_image
+        pred_labels = labels_per_image
+        pred_masks = mask_pred
+
+        # prepare outputs
+        part_instance_res = []
+        human_instance_res = []
+
+        # bkg and part instances
+        bkg_part_index = torch.where(pred_labels != self.num_parsing)[0]
+        bkg_part_labels = pred_labels[bkg_part_index]
+        bkg_part_scores = pred_scores[bkg_part_index]
+        bkg_part_masks = pred_masks[bkg_part_index, :, :]
+
+        # human instances
+        human_index = torch.where(pred_labels == self.num_parsing)[0]
+        human_labels = pred_labels[human_index]
+        human_scores = pred_scores[human_index]
+        human_masks = pred_masks[human_index, :, :]
+
+        semantic_res = self.paste_instance_to_semseg_probs(
+            bkg_part_labels, bkg_part_scores, bkg_part_masks)
+
+        # part instances
+        part_index = torch.where(bkg_part_labels != 0)[0]
+        part_labels = bkg_part_labels[part_index]
+        part_scores = bkg_part_scores[part_index]
+        part_masks = bkg_part_masks[part_index, :, :]
+
+        # part instance results
+        for idx in range(part_labels.shape[0]):
+            if part_scores[idx] < 0.1:
+                continue
+            part_instance_res.append({
+                'category_id':
+                part_labels[idx].cpu().tolist(),
+                'score':
+                part_scores[idx].cpu().tolist(),
+                'mask':
+                part_masks[idx],
+            })
+
+        # human instance results
+        for human_idx in range(human_scores.shape[0]):
+            if human_scores[human_idx] > 0.1:
+                human_instance_res.append({
+                    'category_id':
+                    human_labels[human_idx].cpu().tolist(),
+                    'score':
+                    human_scores[human_idx].cpu().tolist(),
+                    'mask':
+                    human_masks[human_idx],
+                })
+
+        return {
+            'semantic_outputs': semantic_res,
+            'part_outputs': part_instance_res,
+            'human_outputs': human_instance_res,
+        }
+
+    def paste_instance_to_semseg_probs(self, labels, scores, mask_probs):
+        im_h, im_w = mask_probs.shape[-2:]
+        semseg_im = []
+        for cls_ind in range(self.num_parsing):
+            cate_inds = torch.where(labels == cls_ind)[0]
+            cate_scores = scores[cate_inds]
+            cate_mask_probs = mask_probs[cate_inds, :, :].sigmoid()
+            semseg_im.append(
+                self.paste_category_probs(cate_scores, cate_mask_probs, im_h,
+                                          im_w))
+
+        return torch.stack(semseg_im, dim=0)
+
+    def paste_category_probs(self, scores, mask_probs, h, w):
+        category_probs = torch.zeros((h, w),
+                                     dtype=torch.float32,
+                                     device=mask_probs.device)
+        paste_times = torch.zeros((h, w),
+                                  dtype=torch.float32,
+                                  device=mask_probs.device)
+
+        index = scores.argsort()
+        for k in range(len(index)):
+            if scores[index[k]] < self.parsing_ins_score_thr:
+                continue
+            ins_mask_probs = mask_probs[index[k], :, :] * scores[index[k]]
+            category_probs = torch.where(ins_mask_probs > 0.5,
+                                         ins_mask_probs + category_probs,
+                                         category_probs)
+            paste_times += torch.where(ins_mask_probs > 0.5, 1, 0)
+
+        paste_times = torch.where(paste_times == 0, paste_times + 1,
+                                  paste_times)
+        category_probs /= paste_times
+
+        return category_probs
+
+
+class M2FPHead(nn.Module):
+
+    def __init__(self, pixel_decoder: nn.Module,
+                 transformer_predictor: nn.Module):
+        super().__init__()
+        self.pixel_decoder = pixel_decoder
+        self.predictor = transformer_predictor
+
+    def forward(self, features, mask=None):
+        return self.layers(features, mask)
+
+    def layers(self, features, mask=None):
+        mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(
+            features)
+        predictions = self.predictor(multi_scale_features, mask_features, mask)
+        return predictions
diff --git a/modelscope/models/cv/image_human_parsing/parsing_utils.py b/modelscope/models/cv/image_human_parsing/parsing_utils.py
new file mode 100644
index 00000000..a1c20072
--- /dev/null
+++ b/modelscope/models/cv/image_human_parsing/parsing_utils.py
@@ -0,0 +1,156 @@
+# Part of the implementation is borrowed and modified from M2FP, made publicly available
+# under the CC BY-NC 4.0 License at https://github.com/soeaver/M2FP
+# Part of the implementation is borrowed and modified from Detectron2, made publicly available
+# under the Apache-2.0 License at https://github.com/facebookresearch/detectron2
+
+import copy
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from PIL import Image
+
+
+def center_to_target_size_test(img, target_size):
+    src_h, src_w = img.shape[0], img.shape[1]
+    trg_h, trg_w = target_size[1], target_size[0]
+
+    new_h, new_w = 0, 0
+    tfm_list = []
+    if src_h > trg_h and src_w > trg_w:
+        if src_h > src_w:
+            new_h = trg_h
+            new_w = int(new_h * src_w / src_h)
+            if new_w > trg_w:
+                new_w = trg_w
+                new_h = int(new_w * src_h / src_w)
+        elif src_w > src_h:
+            new_w = trg_w
+            new_h = int(new_w * src_h / src_w)
+            if new_h > trg_h:
+                new_h = trg_h
+                new_w = int(new_h * src_w / src_h)
+        tfm_list.append(ResizeTransform(src_h, src_w, new_h, new_w))
+        tfm_list.append(PadTransform(new_h, new_w, trg_h, trg_w))
+
+    elif src_h > trg_h and src_w <= trg_w:
+        new_h = trg_h
+        new_w = int(new_h * src_w / src_h)
+        tfm_list.append(ResizeTransform(src_h, src_w, new_h, new_w))
+        tfm_list.append(PadTransform(new_h, new_w, trg_h, trg_w))
+
+    elif src_h <= trg_h and src_w > trg_w:
+        new_w = trg_w
+        new_h = int(new_w * src_h / src_w)
+        tfm_list.append(ResizeTransform(src_h, src_w, new_h, new_w))
+        tfm_list.append(PadTransform(new_h, new_w, trg_h, trg_w))
+
+    else:
+        new_h, new_w = src_h, src_w
+        tfm_list.append(PadTransform(new_h, new_w, trg_h, trg_w))
+
+    box = get_box(new_h, new_w, trg_h, trg_w)
+
+    new_img = copy.deepcopy(img)
+    for tfm in tfm_list:
+        new_img = tfm.apply_image(new_img)
+
+    return new_img, box
+
+
+def get_box(src_h, src_w, trg_h, trg_w):
+    assert src_h <= trg_h, 'expect src_h <= trg_h'
+    assert src_w <= trg_w, 'expect src_w <= trg_w'
+
+    x0 = int((trg_w - src_w) / 2)
+    x1 = src_w + x0
+    y0 = int((trg_h - src_h) / 2)
+    y1 = src_h + y0
+
+    box = [x0, y0, x1, y1]
+    return box
+
+
+class PadTransform:
+
+    def __init__(self, src_h, src_w, trg_h, trg_w):
+        super().__init__()
+        assert src_h <= trg_h, 'expect src_h <= trg_h'
+        assert src_w <= trg_w, 'expect src_w <= trg_w'
+
+        self.src_h, self.src_w = src_h, src_w
+        self.trg_h, self.trg_w = trg_h, trg_w
+        self.pad_left = int((trg_w - src_w) / 2)
+        self.pad_right = trg_w - src_w - self.pad_left
+        self.pad_top = int((trg_h - src_h) / 2)
+        self.pad_bottom = trg_h - src_h - self.pad_top
+
+    def apply_image(self, img, pad_value=128):
+        if self.pad_left == 0 and self.pad_top == 0:
+            return img
+
+        if len(img.shape) == 2:
+            return np.pad(
+                img, ((self.pad_top, self.pad_bottom),
+                      (self.pad_left, self.pad_right)),
+                'constant',
+                constant_values=((pad_value, pad_value), (pad_value,
+                                                          pad_value)))
+        elif len(img.shape) == 3:
+            return np.pad(
+                img, ((self.pad_top, self.pad_bottom),
+                      (self.pad_left, self.pad_right), (0, 0)),
+                'constant',
+                constant_values=((pad_value, pad_value),
+                                 (pad_value, pad_value), (pad_value,
+                                                          pad_value)))
+
+
+class ResizeTransform:
+
+    def __init__(self, h, w, new_h, new_w, interp=None):
+        super().__init__()
+        if interp is None:
+            interp = Image.BILINEAR
+        self.h, self.w = h, w
+        self.new_h, self.new_w = new_h, new_w
+        self.interp = interp
+
+    def apply_image(self, img, interp=None):
+        assert img.shape[:2] == (self.h, self.w)
+        assert len(img.shape) <= 4
+        interp_method = interp if interp is not None else self.interp
+
+        if img.dtype == np.uint8:
+            if len(img.shape) > 2 and img.shape[2] == 1:
+                pil_image = Image.fromarray(img[:, :, 0], mode='L')
+            else:
+                pil_image = Image.fromarray(img)
+            pil_image = pil_image.resize((self.new_w, self.new_h),
+                                         interp_method)
+            ret = np.asarray(pil_image)
+            if len(img.shape) > 2 and img.shape[2] == 1:
+                ret = np.expand_dims(ret, -1)
+        else:
+            # PIL only supports uint8
+            if any(x < 0 for x in img.strides):
+                img = np.ascontiguousarray(img)
+            img = torch.from_numpy(img)
+            shape = list(img.shape)
+            shape_4d = shape[:2] + [1] * (4 - len(shape)) + shape[2:]
+            img = img.view(shape_4d).permute(2, 3, 0, 1)  # hw(c) -> nchw
+            _PIL_RESIZE_TO_INTERPOLATE_MODE = {
+                Image.NEAREST: 'nearest',
+                Image.BILINEAR: 'bilinear',
+                Image.BICUBIC: 'bicubic',
+            }
+            mode = _PIL_RESIZE_TO_INTERPOLATE_MODE[interp_method]
+            align_corners = None if mode == 'nearest' else False
+            img = F.interpolate(
+                img, (self.new_h, self.new_w),
+                mode=mode,
+                align_corners=align_corners)
+            shape[:2] = (self.new_h, self.new_w)
+            ret = img.permute(2, 3, 0, 1).view(shape).numpy()  # nchw -> hw(c)
+
+        return ret
diff --git a/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py b/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py
index ff83271e..375a5e45 100644
--- a/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py
+++ b/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py
@@ -89,7 +89,7 @@ class CascadeMaskRCNNSwin(nn.Module):
             model_path = os.path.join(kwargs['model_dir'],
                                       ModelFile.TORCH_MODEL_FILE)
             logger.info(f'loading model from {model_path}')
-            weight = torch.load(model_path)['state_dict']
+            weight = torch.load(model_path, map_location='cpu')['state_dict']
             tgt_weight = self.state_dict()
             for name in list(weight.keys()):
                 if name in tgt_weight:
diff --git a/modelscope/models/cv/image_instance_segmentation/maskdino_swin.py b/modelscope/models/cv/image_instance_segmentation/maskdino_swin.py
index 5b60eb40..8c2aa7d2 100644
--- a/modelscope/models/cv/image_instance_segmentation/maskdino_swin.py
+++ b/modelscope/models/cv/image_instance_segmentation/maskdino_swin.py
@@ -61,7 +61,7 @@ class MaskDINOSwin(nn.Module):
             model_path = os.path.join(kwargs['model_dir'],
                                       ModelFile.TORCH_MODEL_FILE)
             logger.info(f'loading model from {model_path}')
-            weight = torch.load(model_path)['model']
+            weight = torch.load(model_path, map_location='cpu')['model']
             tgt_weight = self.state_dict()
             for name in list(weight.keys()):
                 if name in tgt_weight:
diff --git a/modelscope/models/cv/image_paintbyexample/__init__.py b/modelscope/models/cv/image_paintbyexample/__init__.py
new file mode 100644
index 00000000..b58d4305
--- /dev/null
+++ b/modelscope/models/cv/image_paintbyexample/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .model import StablediffusionPaintbyexample
+
+else:
+    _import_structure = {
+        'model': ['StablediffusionPaintbyexample'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/image_paintbyexample/model.py b/modelscope/models/cv/image_paintbyexample/model.py
new file mode 100644
index 00000000..1cddafc6
--- /dev/null
+++ b/modelscope/models/cv/image_paintbyexample/model.py
@@ -0,0 +1,49 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import Any, Dict, Optional, Union
+
+import torch
+from omegaconf import OmegaConf
+from paint_ldm.util import instantiate_from_config
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+LOGGER = get_logger()
+
+
+def load_model_from_config(config, ckpt, verbose=False):
+    LOGGER.info(f'Loading model from {ckpt}')
+    pl_sd = torch.load(ckpt, map_location='cpu')
+    if 'global_step' in pl_sd:
+        LOGGER.info(f"Global Step: {pl_sd['global_step']}")
+    sd = pl_sd['state_dict']
+    model = instantiate_from_config(config.model)
+    m, u = model.load_state_dict(sd, strict=False)
+    if len(m) > 0 and verbose:
+        LOGGER.info('missing keys:')
+        LOGGER.info(m)
+    if len(u) > 0 and verbose:
+        LOGGER.info('unexpected keys:')
+        LOGGER.info(u)
+
+    return model
+
+
+@MODELS.register_module(
+    Tasks.image_paintbyexample, module_name=Models.image_paintbyexample)
+class StablediffusionPaintbyexample(TorchModel):
+
+    def __init__(self, model_dir: str, **kwargs):
+        super().__init__(model_dir, **kwargs)
+
+        config = OmegaConf.load(os.path.join(model_dir, 'v1.yaml'))
+        model = load_model_from_config(
+            config, os.path.join(model_dir, 'pytorch_model.pt'))
+        self.model = model
+
+    def forward(self, inputs):
+        return self.model(inputs)
diff --git a/modelscope/models/cv/image_probing_model/__init__.py b/modelscope/models/cv/image_probing_model/__init__.py
new file mode 100644
index 00000000..e97a1b77
--- /dev/null
+++ b/modelscope/models/cv/image_probing_model/__init__.py
@@ -0,0 +1,24 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+
+    from .model import StructuredProbingModel
+
+else:
+    _import_structure = {
+        'model': ['StructuredProbingModel'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/image_probing_model/backbone.py b/modelscope/models/cv/image_probing_model/backbone.py
new file mode 100644
index 00000000..8f3ed5b6
--- /dev/null
+++ b/modelscope/models/cv/image_probing_model/backbone.py
@@ -0,0 +1,308 @@
+# The implementation is adopted from OpenAI-CLIP,
+# made pubicly available under the MIT License at https://github.com/openai/CLIP
+
+import math
+import sys
+from collections import OrderedDict
+from functools import reduce
+from operator import mul
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from PIL import Image
+from torchvision import models
+
+from .utils import convert_weights, load_pretrained
+
+
+class Bottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1):
+        super().__init__()
+
+        # all conv layers have stride 1. an avgpool is performed
+        # after the second convolution when stride > 1
+        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+
+        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+
+        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
+
+        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
+
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = None
+        self.stride = stride
+
+        if stride > 1 or inplanes != planes * Bottleneck.expansion:
+            # downsampling layer is prepended with an avgpool,
+            # and the subsequent convolution has stride 1
+            self.downsample = nn.Sequential(
+                OrderedDict([('-1', nn.AvgPool2d(stride)),
+                             ('0',
+                              nn.Conv2d(
+                                  inplanes,
+                                  planes * self.expansion,
+                                  1,
+                                  stride=1,
+                                  bias=False)),
+                             ('1', nn.BatchNorm2d(planes * self.expansion))]))
+
+    def forward(self, x: torch.Tensor):
+        identity = x
+
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.relu(self.bn2(self.conv2(out)))
+        out = self.avgpool(out)
+        out = self.bn3(self.conv3(out))
+
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu(out)
+        return out
+
+
+class AttentionPool2d(nn.Module):
+
+    def __init__(self,
+                 spacial_dim: int,
+                 embed_dim: int,
+                 num_heads: int,
+                 output_dim: int = None):
+        super().__init__()
+        self.positional_embedding = nn.Parameter(
+            torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5)
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
+        self.num_heads = num_heads
+
+    def forward(self, x):
+        x = x.reshape(x.shape[0], x.shape[1],
+                      x.shape[2] * x.shape[3]).permute(2, 0, 1)
+        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)
+        x = x + self.positional_embedding[:, None, :].to(x.dtype)
+        x, _ = F.multi_head_attention_forward(
+            query=x,
+            key=x,
+            value=x,
+            embed_dim_to_check=x.shape[-1],
+            num_heads=self.num_heads,
+            q_proj_weight=self.q_proj.weight,
+            k_proj_weight=self.k_proj.weight,
+            v_proj_weight=self.v_proj.weight,
+            in_proj_weight=None,
+            in_proj_bias=torch.cat(
+                [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
+            bias_k=None,
+            bias_v=None,
+            add_zero_attn=False,
+            dropout_p=0,
+            out_proj_weight=self.c_proj.weight,
+            out_proj_bias=self.c_proj.bias,
+            use_separate_proj_weight=True,
+            training=self.training,
+            need_weights=False)
+
+        return x[0]
+
+
+class LayerNorm(nn.LayerNorm):
+    """Subclass torch's LayerNorm to handle fp16."""
+
+    def forward(self, x: torch.Tensor):
+        orig_type = x.dtype
+        ret = super().forward(x.type(torch.float32))
+        return ret.type(orig_type)
+
+
+class QuickGELU(nn.Module):
+
+    def forward(self, x: torch.Tensor):
+        return x * torch.sigmoid(1.702 * x)
+
+
+class ResidualAttentionBlock(nn.Module):
+
+    def __init__(self,
+                 d_model: int,
+                 n_head: int,
+                 attn_mask: torch.Tensor = None):
+        super().__init__()
+
+        self.attn = nn.MultiheadAttention(d_model, n_head)
+        self.ln_1 = LayerNorm(d_model)
+        self.mlp = nn.Sequential(
+            OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
+                         ('gelu', QuickGELU()),
+                         ('c_proj', nn.Linear(d_model * 4, d_model))]))
+        self.ln_2 = LayerNorm(d_model)
+        self.attn_mask = attn_mask
+
+    def attention(self, x: torch.Tensor):
+        self.attn_mask = self.attn_mask.to(
+            dtype=x.dtype,
+            device=x.device) if self.attn_mask is not None else None
+        return self.attn(
+            x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
+
+    def forward(self, x: torch.Tensor, idx):
+        features = {}
+        x_norm = self.ln_1(x)
+        features['layer_{}_pre_attn'.format(idx)] = x_norm.permute(1, 0, 2)
+        attn = self.attention(x_norm)
+        features['layer_{}_attn'.format(idx)] = attn.permute(1, 0, 2)
+        x = x + attn
+        mlp = self.mlp(self.ln_2(x))
+        features['layer_{}_mlp'.format(idx)] = mlp.permute(1, 0, 2)
+        x = x + mlp
+        return x, features
+
+
+class Transformer(nn.Module):
+
+    def __init__(self,
+                 width: int,
+                 layers: int,
+                 heads: int,
+                 attn_mask: torch.Tensor = None):
+        super().__init__()
+        self.width = width
+        self.layers = layers
+        self.resblocks = nn.ModuleList()
+        for i in range(layers):
+            block = ResidualAttentionBlock(width, heads, attn_mask)
+            self.resblocks.append(block)
+
+    def forward(self, x: torch.Tensor):
+        features = {}
+        for idx, block in enumerate(self.resblocks):
+            x, block_feats = block(x, idx)
+            features.update(block_feats)
+        return x, features
+
+
+class VisualTransformer(nn.Module):
+
+    def __init__(self, input_resolution: int, patch_size: int, width: int,
+                 layers: int, heads: int, output_dim: int):
+        super().__init__()
+        print(input_resolution, patch_size, width, layers, heads, output_dim)
+        self.input_resolution = input_resolution
+        self.output_dim = output_dim
+        self.conv1 = nn.Conv2d(
+            in_channels=3,
+            out_channels=width,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias=False)
+
+        scale = width**-0.5
+        self.class_embedding = nn.Parameter(scale * torch.randn(width))
+        self.positional_embedding = nn.Parameter(scale * torch.randn(
+            (input_resolution // patch_size)**2 + 1, width))
+        self.ln_pre = LayerNorm(width)
+
+        self.transformer = Transformer(width, layers, heads)
+
+        self.ln_post = LayerNorm(width)
+        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
+
+    def forward(self, x: torch.Tensor, return_all=True):
+        x = self.conv1(x)  # shape = [*, width, grid, grid]
+        x = x.reshape(x.shape[0], x.shape[1],
+                      -1)  # shape = [*, width, grid ** 2]
+        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+        zeros = torch.zeros(
+            x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
+        # shape = [*, grid ** 2 + 1, width]
+        x = torch.cat([self.class_embedding.to(x.dtype) + zeros, x], dim=1)
+        x = x + self.positional_embedding.to(x.dtype)
+        x = self.ln_pre(x)
+
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x, features = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+
+        x = self.ln_post(x[:, 0, :])
+
+        if return_all:
+            features['pre_logits'] = x
+            return features
+
+        if self.proj is not None:
+            x = x @ self.proj
+
+        return x
+
+
+class CLIPNet(nn.Module):
+
+    def __init__(self, arch_name, pretrained, **kwargs):
+        super(CLIPNet, self).__init__()
+
+        if arch_name == 'CLIP_ViTB32':
+            self.clip = VisualTransformer(
+                input_resolution=224,
+                patch_size=32,
+                width=768,
+                layers=12,
+                heads=12,
+                output_dim=512)
+
+        elif arch_name in ('CLIP_ViTB16', 'CLIP_ViTB16_FP16'):
+            self.clip = VisualTransformer(
+                input_resolution=224,
+                patch_size=16,
+                width=768,
+                layers=12,
+                heads=12,
+                output_dim=512)
+
+        elif arch_name in ('CLIP_ViTL14', 'CLIP_ViTL14_FP16'):
+            self.clip = VisualTransformer(
+                input_resolution=224,
+                patch_size=14,
+                width=1024,
+                layers=24,
+                heads=16,
+                output_dim=768)
+
+        else:
+            raise KeyError(f'Unsupported arch_name for CLIP, {arch_name}')
+
+    def forward(self, input_data):
+        output = self.clip(input_data)
+        return output
+
+
+def CLIP(arch_name='CLIP_RN50',
+         use_pretrain=False,
+         load_from='',
+         state_dict=None,
+         **kwargs):
+    model = CLIPNet(arch_name=arch_name, pretrained=None, **kwargs)
+    if use_pretrain:
+        if arch_name.endswith('FP16'):
+            convert_weights(model.clip)
+        load_pretrained(model.clip, state_dict, load_from)
+    return model
+
+
+class ProbingModel(torch.nn.Module):
+
+    def __init__(self, feat_size, num_classes):
+        super(ProbingModel, self).__init__()
+        self.linear = torch.nn.Linear(feat_size, num_classes)
+
+    def forward(self, x):
+        return self.linear(x)
diff --git a/modelscope/models/cv/image_probing_model/model.py b/modelscope/models/cv/image_probing_model/model.py
new file mode 100644
index 00000000..e7636f40
--- /dev/null
+++ b/modelscope/models/cv/image_probing_model/model.py
@@ -0,0 +1,93 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+import os
+from typing import Any, Dict
+
+import json
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import ModelFile, Tasks
+from .backbone import CLIP, ProbingModel
+
+
+@MODELS.register_module(
+    Tasks.image_classification, module_name=Models.image_probing_model)
+class StructuredProbingModel(TorchModel):
+    """
+    The implementation of 'Structured Model Probing: Empowering
+        Efficient Adaptation by Structured Regularization'.
+    """
+
+    def __init__(self, model_dir, *args, **kwargs):
+        """
+        Initialize a probing model.
+        Args:
+            model_dir: model id or path
+        """
+        super(StructuredProbingModel, self).__init__()
+        model_dir = os.path.join(model_dir, 'food101-clip-vitl14-full.pt')
+        model_file = torch.load(model_dir)
+        self.feature_size = model_file['meta_info']['feature_size']
+        self.num_classes = model_file['meta_info']['num_classes']
+        self.backbone = CLIP(
+            'CLIP_ViTL14_FP16',
+            use_pretrain=True,
+            state_dict=model_file['backbone_model_state_dict'])
+        self.probing_model = ProbingModel(self.feature_size, self.num_classes)
+        self.probing_model.load_state_dict(
+            model_file['probing_model_state_dict'])
+
+    def forward(self, x):
+        """
+        Forward Function of SMP.
+        Args:
+            x: the input images (B, 3, H, W)
+        """
+
+        keys = []
+        for idx in range(0, 24):
+            keys.append('layer_{}_pre_attn'.format(idx))
+            keys.append('layer_{}_attn'.format(idx))
+            keys.append('layer_{}_mlp'.format(idx))
+        keys.append('pre_logits')
+        features = self.backbone(x.half())
+        features_agg = []
+        for i in keys:
+            aggregated_feature = self.aggregate_token(features[i], 1024)
+            features_agg.append(aggregated_feature)
+        features_agg = torch.cat((features_agg), dim=1)
+        outputs = self.probing_model(features_agg.float())
+        return outputs
+
+    def aggregate_token(self, output, target_size):
+        """
+        Aggregating features from tokens.
+        Args:
+            output: the output of intermidiant features
+                from a ViT model
+            target_size: target aggregated feature size
+        """
+        if len(output.shape) == 3:
+            _, n_token, channels = output.shape
+            if channels >= target_size:
+                pool_size = 0
+            else:
+                n_groups = target_size / channels
+                pool_size = int(n_token / n_groups)
+
+            if pool_size > 0:
+                output = torch.permute(output, (0, 2, 1))
+                output = torch.nn.AvgPool1d(
+                    kernel_size=pool_size, stride=pool_size)(
+                        output)
+                output = torch.flatten(output, start_dim=1)
+            else:
+                output = torch.mean(output, dim=1)
+        output = torch.nn.functional.normalize(output, dim=1)
+        return output
diff --git a/modelscope/models/cv/image_probing_model/utils.py b/modelscope/models/cv/image_probing_model/utils.py
new file mode 100644
index 00000000..c2b13ae5
--- /dev/null
+++ b/modelscope/models/cv/image_probing_model/utils.py
@@ -0,0 +1,148 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+import re
+
+import torch
+import torch.nn as nn
+
+
+def load_pretrained(model: torch.nn.Module,
+                    state_dict,
+                    local_path: str,
+                    map_location='cpu',
+                    logger=None,
+                    sub_level=None):
+    return load_pretrained_dict(model, state_dict, logger, sub_level=sub_level)
+
+
+def load_pretrained_dict(model: torch.nn.Module,
+                         state_dict: dict,
+                         logger=None,
+                         sub_level=None):
+    """
+    Load parameters to model with
+    1. Sub name by revise_keys For DataParallelModel or DistributeParallelModel.
+    2. Load 'state_dict' again if possible by key 'state_dict' or 'model_state'.
+    3. Take sub level keys from source, e.g. load 'backbone' part from a classifier into a backbone model.
+    4. Auto remove invalid parameters from source.
+    5. Log or warning if unexpected key exists or key misses.
+
+    Args:
+        model (torch.nn.Module):
+        state_dict (dict): dict of parameters
+        logger (logging.Logger, None):
+        sub_level (str, optional): If not None, parameters with key startswith sub_level will remove the prefix
+            to fit actual model keys. This action happens if user want to load sub module parameters
+            into a sub module model.
+    """
+    revise_keys = [(r'^module\.', '')]
+
+    if 'state_dict' in state_dict:
+        state_dict = state_dict['state_dict']
+    if 'model_state' in state_dict:
+        state_dict = state_dict['model_state']
+
+    for p, r in revise_keys:
+        state_dict = {re.sub(p, r, k): v for k, v in state_dict.items()}
+
+    if sub_level:
+        sub_level = sub_level if sub_level.endswith('.') else (sub_level + '.')
+        sub_level_len = len(sub_level)
+        state_dict = {
+            key[sub_level_len:]: value
+            for key, value in state_dict.items() if key.startswith(sub_level)
+        }
+
+    state_dict = _auto_drop_invalid(model, state_dict, logger=logger)
+
+    load_status = model.load_state_dict(state_dict, strict=False)
+    unexpected_keys = load_status.unexpected_keys
+    missing_keys = load_status.missing_keys
+    err_msgs = []
+    if unexpected_keys:
+        err_msgs.append('unexpected key in source '
+                        f'state_dict: {", ".join(unexpected_keys)}\n')
+    if missing_keys:
+        err_msgs.append('missing key in source '
+                        f'state_dict: {", ".join(missing_keys)}\n')
+    err_msgs = '\n'.join(err_msgs)
+
+    if len(err_msgs) > 0:
+        if logger:
+            logger.warning(err_msgs)
+        else:
+            import warnings
+            warnings.warn(err_msgs)
+
+
+def convert_weights(model: nn.Module):
+    """
+    Convert applicable model parameters to fp16.
+    """
+
+    def _convert_weights_to_fp16(layer):
+        if isinstance(layer, (nn.Conv1d, nn.Conv2d, nn.Linear)):
+            layer.weight.data = layer.weight.data.half()
+            if layer.bias is not None:
+                layer.bias.data = layer.bias.data.half()
+
+        if isinstance(layer, nn.MultiheadAttention):
+            for attr in [
+                    *[f'{s}_proj_weight' for s in ['in', 'q', 'k', 'v']],
+                    'in_proj_bias', 'bias_k', 'bias_v'
+            ]:
+                tensor = getattr(layer, attr)
+                if tensor is not None:
+                    tensor.data = tensor.data.half()
+
+        for name in ['text_projection', 'proj']:
+            if hasattr(layer, name):
+                attr = getattr(layer, name)
+                if attr is not None:
+                    attr.data = attr.data.half()
+
+        for name in ['prompt_embeddings']:
+            if hasattr(layer, name):
+                attr = getattr(layer, name)
+                if attr is not None:
+                    attr.data = attr.data.half()
+
+    model.apply(_convert_weights_to_fp16)
+
+
+def _auto_drop_invalid(model: torch.nn.Module, state_dict: dict, logger=None):
+    """
+    Strip unmatched parameters in state_dict, e.g. shape not matched, type not matched.
+
+    Args:
+        model (torch.nn.Module):
+        state_dict (dict):
+        logger (logging.Logger, None):
+
+    Returns:
+        A new state dict.
+    """
+    ret_dict = state_dict.copy()
+    invalid_msgs = []
+    for key, value in model.state_dict().items():
+        if key in state_dict:
+            # Check shape
+            new_value = state_dict[key]
+            if value.shape != new_value.shape:
+                invalid_msgs.append(
+                    f'{key}: invalid shape, dst {value.shape} vs. src {new_value.shape}'
+                )
+                ret_dict.pop(key)
+            elif value.dtype != new_value.dtype:
+                invalid_msgs.append(
+                    f'{key}: invalid dtype, dst {value.dtype} vs. src {new_value.dtype}'
+                )
+                ret_dict.pop(key)
+    if len(invalid_msgs) > 0:
+        warning_msg = 'ignore keys from source: \n' + '\n'.join(invalid_msgs)
+        if logger:
+            logger.warning(warning_msg)
+        else:
+            import warnings
+            warnings.warn(warning_msg)
+    return ret_dict
diff --git a/modelscope/models/cv/image_quality_assessment_degradation/__init__.py b/modelscope/models/cv/image_quality_assessment_degradation/__init__.py
new file mode 100644
index 00000000..ffd10243
--- /dev/null
+++ b/modelscope/models/cv/image_quality_assessment_degradation/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .image_quality_assessment_degradation import ImageQualityAssessmentDegradation
+
+else:
+    _import_structure = {
+        'image_quality_assessment_degradation':
+        ['ImageQualityAssessmentDegradation']
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/image_quality_assessment_degradation/degradation_model.py b/modelscope/models/cv/image_quality_assessment_degradation/degradation_model.py
new file mode 100644
index 00000000..639e349f
--- /dev/null
+++ b/modelscope/models/cv/image_quality_assessment_degradation/degradation_model.py
@@ -0,0 +1,127 @@
+import time
+from collections import defaultdict
+
+import cv2
+import json
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision
+from torchvision import models
+
+
+class DegradationIQA(nn.Module):
+
+    def __init__(self):
+        super(DegradationIQA, self).__init__()
+        # [64, 128, 128]
+        features = list(
+            models.__dict__['resnet50'](pretrained=False).children())[:-2]
+        features = nn.Sequential(*features)
+        # features = list(models.__dict__['vgg16'](pretrained=True).children())[:-2][0][:-1]
+        self.features = features
+        self.lcn_radius = 7
+        self.classifier = nn.Sequential(
+            nn.Conv2d(2048, 1024, kernel_size=3, padding=1, dilation=1),
+            nn.ReLU(True),
+            nn.Conv2d(1024, 256, kernel_size=3, padding=1, dilation=1),
+            nn.ReLU(True),
+            # nn.BatchNorm1d(256),
+            nn.Conv2d(256, 64, kernel_size=3, padding=1, dilation=1),
+            nn.ReLU(True),
+            nn.Conv2d(64, 1, kernel_size=3, padding=1, dilation=1))
+
+        self.noise_regression = nn.Sequential(
+            nn.Conv2d(2048, 1024, kernel_size=3, padding=1, dilation=1),
+            nn.ReLU(True),
+            nn.Conv2d(1024, 256, kernel_size=3, padding=1, dilation=1),
+            nn.ReLU(True),
+            # nn.BatchNorm1d(256),
+            nn.Conv2d(256, 64, kernel_size=3, padding=1, dilation=1),
+            nn.ReLU(True),
+            nn.Conv2d(64, 1, kernel_size=3, padding=1, dilation=1))
+
+        self.blur_regression = nn.Sequential(
+            nn.Conv2d(2048, 1024, kernel_size=3, padding=1, dilation=1),
+            nn.ReLU(True),
+            nn.Conv2d(1024, 256, kernel_size=3, padding=1, dilation=1),
+            nn.ReLU(True),
+            # nn.BatchNorm1d(256),
+            nn.Conv2d(256, 64, kernel_size=3, padding=1, dilation=1),
+            nn.ReLU(True),
+            nn.Conv2d(64, 1, kernel_size=3, padding=1, dilation=1))
+
+        self.compression_regression = nn.Sequential(
+            nn.Conv2d(2048, 1024, kernel_size=3, padding=1, dilation=1),
+            nn.ReLU(True),
+            nn.Conv2d(1024, 256, kernel_size=3, padding=1, dilation=1),
+            nn.ReLU(True),
+            # nn.BatchNorm1d(256),
+            nn.Conv2d(256, 64, kernel_size=3, padding=1, dilation=1),
+            nn.ReLU(True),
+            nn.Conv2d(64, 1, kernel_size=3, padding=1, dilation=1))
+
+        self.bright_regression = nn.Sequential(
+            nn.Conv2d(2048, 1024, kernel_size=3, padding=1, dilation=1),
+            nn.ReLU(True),
+            nn.Conv2d(1024, 256, kernel_size=3, padding=1, dilation=1),
+            nn.ReLU(True),
+            # nn.BatchNorm1d(256),
+            nn.Conv2d(256, 64, kernel_size=3, padding=1, dilation=1),
+            nn.ReLU(True),
+            nn.Conv2d(64, 1, kernel_size=3, padding=1, dilation=1))
+
+        self.color_regression = nn.Sequential(
+            nn.Conv2d(2048, 1024, kernel_size=3, padding=1, dilation=1),
+            nn.ReLU(True),
+            nn.Conv2d(1024, 256, kernel_size=3, padding=1, dilation=1),
+            nn.ReLU(True),
+            # nn.BatchNorm1d(256),
+            nn.Conv2d(256, 64, kernel_size=3, padding=1, dilation=1),
+            nn.ReLU(True),
+            nn.Conv2d(64, 1, kernel_size=3, padding=1, dilation=1))
+
+        self._initialize_weights()
+        # activation function
+        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
+
+    def _initialize_weights(self):
+        initialize_layers = [
+            x for j in [
+                self.classifier, self.noise_regression, self.blur_regression,
+                self.compression_regression, self.bright_regression,
+                self.color_regression
+            ] for x in j
+        ]
+        for m in initialize_layers:
+            if isinstance(m, nn.Conv2d):
+                nn.init.xavier_uniform_(m.weight.data)
+                if m.bias is not None:
+                    m.bias.data.zero_()
+
+    def forward(self, x, require_map=False):
+
+        for model in self.features:
+            x = model(x)
+
+        fea = x
+        out_map = self.classifier(fea)
+        noise_map = self.noise_regression(fea)
+        blur_map = self.blur_regression(fea)
+        comp_map = self.compression_regression(fea)
+        bright_map = self.bright_regression(fea)
+        color_map = self.color_regression(fea)
+        out = torch.mean(torch.mean(out_map, dim=2), dim=2)
+        noise_out = torch.mean(torch.mean(noise_map, dim=2), dim=2)
+        blur_out = torch.mean(torch.mean(blur_map, dim=2), dim=2)
+        comp_out = torch.mean(torch.mean(comp_map, dim=2), dim=2)
+        bright_out = torch.mean(torch.mean(bright_map, dim=2), dim=2)
+        color_out = torch.mean(torch.mean(color_map, dim=2), dim=2)
+
+        if not require_map:
+            return out, [noise_out, blur_out, comp_out, bright_out, color_out]
+        else:
+            return out, [
+                noise_out, blur_out, comp_out, bright_out, color_out
+            ], [noise_map, blur_map, comp_map, bright_map, color_map]
diff --git a/modelscope/models/cv/image_quality_assessment_degradation/image_quality_assessment_degradation.py b/modelscope/models/cv/image_quality_assessment_degradation/image_quality_assessment_degradation.py
new file mode 100644
index 00000000..ddc73177
--- /dev/null
+++ b/modelscope/models/cv/image_quality_assessment_degradation/image_quality_assessment_degradation.py
@@ -0,0 +1,121 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import Any, Dict, Union
+
+import torch.cuda
+import torch.nn as nn
+import torch.nn.functional as F
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.image_quality_assessment_degradation.degradation_model import \
+    DegradationIQA
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['ImageQualityAssessmentDegradation']
+
+
+@MODELS.register_module(
+    Tasks.image_quality_assessment_degradation,
+    module_name=Models.image_quality_assessment_degradation)
+class ImageQualityAssessmentDegradation(TorchModel):
+    """
+    Its architecture is based on the modified resnet50, output with blur degree, noise degree, compression degree.
+    Reference: Rich features for perceptual quality assessment of UGC videos.
+    """
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the image_quality_assessment_degradation model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        self.model_dir = model_dir
+        self.sigmoid_layer = nn.Sigmoid()
+        self.config = Config.from_file(
+            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
+        model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+
+        self.model = DegradationIQA()
+        self.model = self._load_pretrained(self.model, model_path)
+        self.model.eval()
+
+    def _train_forward(self, input: Tensor,
+                       target: Tensor) -> Dict[str, Tensor]:
+        losses = dict()
+        return losses
+
+    def _inference_forward(self, input: Tensor) -> Dict[str, Tensor]:
+        # Todo
+        # if img_tensor.shape[2]*img_tensor.shape[3] > 720*1280:
+        #     img_tensor = torchvision.transforms.functional.resize(img_tensor, 720)
+        preds = self.model(input, require_map=False)
+        noise_degree, blur_degree, comp_degree = preds[1][:3]
+        noise_degree, blur_degree, comp_degree = self.sigmoid_layer(
+            noise_degree), self.sigmoid_layer(blur_degree), self.sigmoid_layer(
+                comp_degree)
+        if noise_degree > 0.3:
+            noise_degree = noise_degree + 0.1
+        if noise_degree >= 0.2 and noise_degree <= 0.3:
+            noise_degree = (noise_degree - 0.2) * 2 + 0.2
+        blur_degree = blur_degree + comp_degree / 2
+
+        return {
+            'noise_degree': noise_degree,
+            'blur_degree': blur_degree,
+            'comp_degree': comp_degree
+        }
+
+    def _evaluate_postprocess(self, input: Tensor, item_id: Tensor,
+                              distortion_type: Tensor, target: Tensor,
+                              **kwargs) -> Dict[str, list]:
+        torch.cuda.empty_cache()
+        with torch.no_grad():
+            preds = self.model(input, require_map=False)
+            noise_degree, blur_degree, comp_degree = preds[1][:3]
+            noise_degree, blur_degree, comp_degree = self.sigmoid_layer(
+                noise_degree), self.sigmoid_layer(
+                    blur_degree), self.sigmoid_layer(comp_degree)
+            noise_degree, blur_degree, comp_degree = noise_degree.cpu(
+            ), blur_degree.cpu(), comp_degree.cpu()
+            if noise_degree > 0.3:
+                noise_degree = noise_degree + 0.1
+            if noise_degree >= 0.2 and noise_degree <= 0.3:
+                noise_degree = (noise_degree - 0.2) * 2 + 0.2
+            blur_degree = blur_degree + comp_degree / 2
+        del input
+        target = target.cpu()
+        torch.cuda.empty_cache()
+        return {
+            'item_id': item_id,
+            'distortion_type': distortion_type,
+            'noise_degree': noise_degree,
+            'blur_degree': blur_degree,
+            'comp_degree': comp_degree,
+            'target': target
+        }
+
+    def forward(self, inputs: Dict[str,
+                                   Tensor]) -> Dict[str, Union[list, Tensor]]:
+        """return the result by the model
+
+        Args:
+            inputs (Tensor): the preprocessed data
+
+        Returns:
+            Dict[str, Tensor]: results
+        """
+        if self.training:
+            return self._train_forward(**inputs)
+        elif 'target' in inputs:
+            return self._evaluate_postprocess(**inputs)
+        else:
+            return self._inference_forward(**inputs)
diff --git a/modelscope/models/cv/image_quality_assessment_mos/__init__.py b/modelscope/models/cv/image_quality_assessment_mos/__init__.py
new file mode 100644
index 00000000..188702a2
--- /dev/null
+++ b/modelscope/models/cv/image_quality_assessment_mos/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .image_quality_assessment_mos import ImageQualityAssessmentMos
+
+else:
+    _import_structure = {
+        'image_quality_assessment_mos': ['ImageQualityAssessmentMos']
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/image_quality_assessment_mos/backbones/__init__.py b/modelscope/models/cv/image_quality_assessment_mos/backbones/__init__.py
new file mode 100644
index 00000000..fe55f4aa
--- /dev/null
+++ b/modelscope/models/cv/image_quality_assessment_mos/backbones/__init__.py
@@ -0,0 +1,8 @@
+# @Time    : 2021/5/12 11:49 上午
+# @Author  : shaoguowen
+# @Email   : shaoguowen@tencent.com
+# @FileName: __init__.py
+# @Software: PyCharm
+
+from .resnet import (resnet18, resnet34, resnet50, resnet101, resnet152,
+                     resnext50_32x4d, resnext101_32x8d)
diff --git a/modelscope/models/cv/image_quality_assessment_mos/backbones/resnet.py b/modelscope/models/cv/image_quality_assessment_mos/backbones/resnet.py
new file mode 100644
index 00000000..e153e5f9
--- /dev/null
+++ b/modelscope/models/cv/image_quality_assessment_mos/backbones/resnet.py
@@ -0,0 +1,452 @@
+# The implementation is adopted from CenseoQoE, made pubicly available under the MIT License at
+# https://github.com/Tencent/CenseoQoE
+import os
+
+import torch
+import torch.nn as nn
+
+try:
+    from torch.hub import load_state_dict_from_url
+except ImportError:
+    from torch.utils.model_zoo import load_url as load_state_dict_from_url
+
+__all__ = [
+    'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152',
+    'resnext50_32x4d', 'resnext101_32x8d', 'wide_resnet50_2',
+    'wide_resnet101_2'
+]
+
+model_urls = {
+    'resnet18':
+    'https://download.pytorch.org/models/resnet18-5c106cde.pth',
+    'resnet34':
+    'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
+    'resnet50':
+    'https://download.pytorch.org/models/resnet50-19c8e357.pth',
+    'resnet101':
+    'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
+    'resnet152':
+    'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
+    'resnext50_32x4d':
+    'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
+    'resnext101_32x8d':
+    'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
+    'wide_resnet50_2':
+    'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth',
+    'wide_resnet101_2':
+    'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth',
+}
+
+
+def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
+    """3x3 convolution with padding"""
+    return nn.Conv2d(
+        in_planes,
+        out_planes,
+        kernel_size=3,
+        stride=stride,
+        padding=dilation,
+        groups=groups,
+        bias=False,
+        dilation=dilation)
+
+
+def conv1x1(in_planes, out_planes, stride=1):
+    """1x1 convolution"""
+    return nn.Conv2d(
+        in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
+
+
+class BasicBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 downsample=None,
+                 groups=1,
+                 base_width=64,
+                 dilation=1,
+                 norm_layer=None):
+        super(BasicBlock, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        if groups != 1 or base_width != 64:
+            raise ValueError(
+                'BasicBlock only supports groups=1 and base_width=64')
+        if dilation > 1:
+            raise NotImplementedError(
+                'Dilation > 1 not supported in BasicBlock')
+        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = norm_layer(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = norm_layer(planes)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        identity = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu(out)
+
+        return out
+
+
+class Bottleneck(nn.Module):
+    # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
+    # while original implementation places the stride at the first 1x1 convolution(self.conv1)
+    # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
+    # This variant is also known as ResNet V1.5 and improves accuracy according to
+    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
+
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 downsample=None,
+                 groups=1,
+                 base_width=64,
+                 dilation=1,
+                 norm_layer=None):
+        super(Bottleneck, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        width = int(planes * (base_width / 64.)) * groups
+        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
+        self.conv1 = conv1x1(inplanes, width)
+        self.bn1 = norm_layer(width)
+        self.conv2 = conv3x3(width, width, stride, groups, dilation)
+        self.bn2 = norm_layer(width)
+        self.conv3 = conv1x1(width, planes * self.expansion)
+        self.bn3 = norm_layer(planes * self.expansion)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        identity = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu(out)
+
+        return out
+
+
+class ResNet(nn.Module):
+    """
+    ResNet, ref from pytorch official website，
+    Differ from torch vidion resnet, this net :
+        a. delete some pooling and fc layers.
+        b. support output mid-layer feature.
+    """
+
+    def __init__(self,
+                 block,
+                 layers,
+                 zero_init_residual=False,
+                 groups=1,
+                 width_per_group=64,
+                 replace_stride_with_dilation=None,
+                 norm_layer=None,
+                 input_channels=3,
+                 out_indices=(3, ),
+                 strides=(2, 2, 2)):
+        super(ResNet, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        self._norm_layer = norm_layer
+        self.block = block
+        self.inplanes = 64
+        self.dilation = 1
+        if replace_stride_with_dilation is None:
+            # each element in the tuple indicates if we should replace
+            # the 2x2 stride with a dilated convolution instead
+            replace_stride_with_dilation = [False, False, False]
+        if len(replace_stride_with_dilation) != 3:
+            raise ValueError('replace_stride_with_dilation should be None '
+                             'or a 3-element tuple, got {}'.format(
+                                 replace_stride_with_dilation))
+        self.groups = groups
+        self.base_width = width_per_group
+        self.conv1 = nn.Conv2d(
+            input_channels,
+            self.inplanes,
+            kernel_size=7,
+            stride=2,
+            padding=3,
+            bias=False)
+        self.bn1 = norm_layer(self.inplanes)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.ouput_dims = []
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.ouput_dims.append(block.expansion * 64)
+        self.layer2 = self._make_layer(
+            block,
+            128,
+            layers[1],
+            stride=strides[0],
+            dilate=replace_stride_with_dilation[0])
+        self.ouput_dims.append(block.expansion * 128)
+        self.layer3 = self._make_layer(
+            block,
+            256,
+            layers[2],
+            stride=strides[1],
+            dilate=replace_stride_with_dilation[1])
+        self.ouput_dims.append(block.expansion * 256)
+        self.layer4 = self._make_layer(
+            block,
+            512,
+            layers[3],
+            stride=strides[2],
+            dilate=replace_stride_with_dilation[2])
+        self.ouput_dims.append(block.expansion * 512)
+        self.out_indices = out_indices
+
+        self.ouput_dims = [
+            x for i, x in enumerate(self.ouput_dims) if i in out_indices
+        ]
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(
+                    m.weight, mode='fan_out', nonlinearity='relu')
+            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+
+        # Zero-initialize the last BN in each residual branch,
+        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
+        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
+        if zero_init_residual:
+            for m in self.modules():
+                if isinstance(m, Bottleneck):
+                    nn.init.constant_(m.bn3.weight, 0)
+                elif isinstance(m, BasicBlock):
+                    nn.init.constant_(m.bn2.weight, 0)
+
+    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
+        norm_layer = self._norm_layer
+        downsample = None
+        previous_dilation = self.dilation
+        if dilate:
+            self.dilation *= stride
+            stride = 1
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                conv1x1(self.inplanes, planes * block.expansion, stride),
+                norm_layer(planes * block.expansion),
+            )
+
+        layers = []
+        layers.append(
+            block(self.inplanes, planes, stride, downsample, self.groups,
+                  self.base_width, previous_dilation, norm_layer))
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(
+                block(
+                    self.inplanes,
+                    planes,
+                    groups=self.groups,
+                    base_width=self.base_width,
+                    dilation=self.dilation,
+                    norm_layer=norm_layer))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        outs = []
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+
+        x1 = self.layer1(x)
+        outs.append(x1)
+        x2 = self.layer2(x1)
+        outs.append(x2)
+        x3 = self.layer3(x2)
+        outs.append(x3)
+        x4 = self.layer4(x3)
+        outs.append(x4)
+
+        outs = [x for i, x in enumerate(outs) if i in self.out_indices]
+        return tuple(outs)
+
+
+def _resnet(arch, block, layers, pretrained, progress, **kwargs):
+    model = ResNet(block, layers, **kwargs)
+    if pretrained:
+        print('load pretrained model>>>>>')
+        if isinstance(pretrained, str) and os.path.exists(pretrained):
+            print('load pretrained from local:', pretrained)
+            state_dict = torch.load(pretrained)
+        else:
+            print('load pretrained from url:', model_urls[arch])
+            state_dict = load_state_dict_from_url(
+                model_urls[arch], progress=progress)
+        net_dict = model.state_dict()
+        pretrained_dict = {
+            k: v
+            for k, v in state_dict.items()
+            if k in net_dict and v.size() == net_dict[k].size()
+        }
+        net_dict.update(pretrained_dict)
+        model.load_state_dict(net_dict)
+    return model
+
+
+def resnet18(pretrained=False, progress=True, **kwargs):
+    r"""ResNet-18 model from
+    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
+                   **kwargs)
+
+
+def resnet34(pretrained=False, progress=True, **kwargs):
+    r"""ResNet-34 model from
+    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress,
+                   **kwargs)
+
+
+def resnet50(pretrained=False, progress=True, **kwargs):
+    r"""ResNet-50 model from
+    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress,
+                   **kwargs)
+
+
+def resnet101(pretrained=False, progress=True, **kwargs):
+    r"""ResNet-101 model from
+    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained,
+                   progress, **kwargs)
+
+
+def resnet152(pretrained=False, progress=True, **kwargs):
+    r"""ResNet-152 model from
+    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained,
+                   progress, **kwargs)
+
+
+def resnext50_32x4d(pretrained=False, progress=True, **kwargs):
+    r"""ResNeXt-50 32x4d model from
+    `"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    kwargs['groups'] = 32
+    kwargs['width_per_group'] = 4
+    return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], pretrained,
+                   progress, **kwargs)
+
+
+def resnext101_32x8d(pretrained=False, progress=True, **kwargs):
+    r"""ResNeXt-101 32x8d model from
+    `"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    kwargs['groups'] = 32
+    kwargs['width_per_group'] = 8
+    return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], pretrained,
+                   progress, **kwargs)
+
+
+def wide_resnet50_2(pretrained=False, progress=True, **kwargs):
+    r"""Wide ResNet-50-2 model from
+    `"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_
+
+    The model is the same as ResNet except for the bottleneck number of channels
+    which is twice larger in every block. The number of channels in outer 1x1
+    convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
+    channels, and in Wide ResNet-50-2 has 2048-1024-2048.
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    kwargs['width_per_group'] = 64 * 2
+    return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], pretrained,
+                   progress, **kwargs)
+
+
+def wide_resnet101_2(pretrained=False, progress=True, **kwargs):
+    r"""Wide ResNet-101-2 model from
+    `"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_
+
+    The model is the same as ResNet except for the bottleneck number of channels
+    which is twice larger in every block. The number of channels in outer 1x1
+    convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
+    channels, and in Wide ResNet-50-2 has 2048-1024-2048.
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    kwargs['width_per_group'] = 64 * 2
+    return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], pretrained,
+                   progress, **kwargs)
diff --git a/modelscope/models/cv/image_quality_assessment_mos/censeo_ivqa_model.py b/modelscope/models/cv/image_quality_assessment_mos/censeo_ivqa_model.py
new file mode 100644
index 00000000..fbe40e6a
--- /dev/null
+++ b/modelscope/models/cv/image_quality_assessment_mos/censeo_ivqa_model.py
@@ -0,0 +1,33 @@
+# The implementation is adopted from CenseoQoE, made pubicly available under the MIT License at
+# https://github.com/Tencent/CenseoQoE
+
+import torch
+from torch import nn
+
+from . import backbones, heads
+
+
+class CenseoIVQAModel(nn.Module):
+    """
+    A strong baseline model for image quality assessment.
+    Its architecture is based on the modified resnet18 and reach SOTA interms of PLCC and SRCC.
+    The reference papaer is https://arxiv.org/pdf/2111.07104.pdf
+    """
+
+    def __init__(self, pretrained=True):
+        super(CenseoIVQAModel, self).__init__()
+        input_channels = 3
+        model_name = 'resnet18'
+        self.backbone = getattr(backbones, model_name)(
+            input_channels=input_channels,
+            pretrained=pretrained,
+            out_indices=(3, ),
+            strides=(2, 2, 2))
+        self.head = getattr(heads, 'SimpleHead')(
+            self.backbone.ouput_dims, out_num=1)
+
+    def forward(self, x):
+        feats = self.backbone(x)
+        out = self.head(feats)
+        out = torch.sigmoid(out)
+        return out
diff --git a/modelscope/models/cv/image_quality_assessment_mos/heads/__init__.py b/modelscope/models/cv/image_quality_assessment_mos/heads/__init__.py
new file mode 100644
index 00000000..55d364b2
--- /dev/null
+++ b/modelscope/models/cv/image_quality_assessment_mos/heads/__init__.py
@@ -0,0 +1 @@
+from .simple_head import SimpleHead
diff --git a/modelscope/models/cv/image_quality_assessment_mos/heads/simple_head.py b/modelscope/models/cv/image_quality_assessment_mos/heads/simple_head.py
new file mode 100644
index 00000000..95153aca
--- /dev/null
+++ b/modelscope/models/cv/image_quality_assessment_mos/heads/simple_head.py
@@ -0,0 +1,20 @@
+import torch
+from torch import nn
+
+
+class SimpleHead(torch.nn.Module):
+
+    def __init__(self, feats_dims, out_num=1):
+        super(SimpleHead, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        out_num = out_num
+        self.fc_out = nn.Sequential(
+            nn.Linear(feats_dims[-1], 1024), nn.ReLU(inplace=True),
+            nn.Linear(1024, out_num))
+
+    def forward(self, x):
+        x = x[-1]
+        x = self.avg_pool(x)
+        x = torch.flatten(x, 1)
+        x = self.fc_out(x)
+        return x
diff --git a/modelscope/models/cv/image_quality_assessment_mos/image_quality_assessment_mos.py b/modelscope/models/cv/image_quality_assessment_mos/image_quality_assessment_mos.py
new file mode 100644
index 00000000..8da84b0b
--- /dev/null
+++ b/modelscope/models/cv/image_quality_assessment_mos/image_quality_assessment_mos.py
@@ -0,0 +1,80 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import Any, Dict, Union
+
+import torch.cuda
+import torch.nn.functional as F
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.image_quality_assessment_mos.censeo_ivqa_model import \
+    CenseoIVQAModel
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['ImageQualityAssessmentMos']
+
+
+@MODELS.register_module(
+    Tasks.image_quality_assessment_mos,
+    module_name=Models.image_quality_assessment_mos)
+class ImageQualityAssessmentMos(TorchModel):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the image_quality_assessment_mos model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        self.model_dir = model_dir
+        self.config = Config.from_file(
+            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
+        model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+
+        self.model = CenseoIVQAModel(pretrained=False)
+        self.model = self._load_pretrained(self.model, model_path)
+        self.model.eval()
+
+    def _train_forward(self, input: Tensor,
+                       target: Tensor) -> Dict[str, Tensor]:
+        losses = dict()
+        return losses
+
+    def _inference_forward(self, input: Tensor) -> Dict[str, Tensor]:
+        return {'output': self.model(input).clamp(0, 1)}
+
+    def _evaluate_postprocess(self, input: Tensor,
+                              target: Tensor) -> Dict[str, list]:
+
+        torch.cuda.empty_cache()
+        with torch.no_grad():
+            preds = self.model(input)
+            preds = preds.clamp(0, 1).cpu()
+        del input
+        target = target.cpu()
+        torch.cuda.empty_cache()
+        return {'pred': preds, 'target': target}
+
+    def forward(self, inputs: Dict[str,
+                                   Tensor]) -> Dict[str, Union[list, Tensor]]:
+        """return the result by the model
+
+        Args:
+            inputs (Tensor): the preprocessed data
+
+        Returns:
+            Dict[str, Tensor]: results
+        """
+        if self.training:
+            return self._train_forward(**inputs)
+        elif 'target' in inputs:
+            return self._evaluate_postprocess(**inputs)
+        else:
+            return self._inference_forward(**inputs)
diff --git a/modelscope/models/cv/image_restoration/__init__.py b/modelscope/models/cv/image_restoration/__init__.py
new file mode 100644
index 00000000..91ee2636
--- /dev/null
+++ b/modelscope/models/cv/image_restoration/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .image_restoration_model import ImageRestorationModel
+
+else:
+    _import_structure = {
+        'image_restoration_model': ['ImageRestorationModel'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/image_restoration/demoire_models/__init__.py b/modelscope/models/cv/image_restoration/demoire_models/__init__.py
new file mode 100644
index 00000000..66328b33
--- /dev/null
+++ b/modelscope/models/cv/image_restoration/demoire_models/__init__.py
@@ -0,0 +1,3 @@
+from .nets import my_model as ESDNet
+
+model_map = {'ESDNet': ESDNet}
diff --git a/modelscope/models/cv/image_restoration/demoire_models/nets.py b/modelscope/models/cv/image_restoration/demoire_models/nets.py
new file mode 100644
index 00000000..1f554b80
--- /dev/null
+++ b/modelscope/models/cv/image_restoration/demoire_models/nets.py
@@ -0,0 +1,310 @@
+# The implementation is adopted from UHDM work, made publicly available under the Apache 2.0 License
+# source code avaiable via https://github.com/CVMI-Lab/UHDM/
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class my_model(nn.Module):
+
+    def __init__(
+        self,
+        en_feature_num,
+        en_inter_num,
+        de_feature_num,
+        de_inter_num,
+        sam_number=1,
+    ):
+        super(my_model, self).__init__()
+        self.encoder = Encoder(
+            feature_num=en_feature_num,
+            inter_num=en_inter_num,
+            sam_number=sam_number)
+        self.decoder = Decoder(
+            en_num=en_feature_num,
+            feature_num=de_feature_num,
+            inter_num=de_inter_num,
+            sam_number=sam_number)
+
+    def forward(self, x):
+        y_1, y_2, y_3 = self.encoder(x)
+        out_1, out_2, out_3 = self.decoder(y_1, y_2, y_3)
+        return out_1, out_2, out_3
+
+    def _initialize_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                m.weight.data.normal_(0.0, 0.02)
+                if m.bias is not None:
+                    m.bias.data.normal_(0.0, 0.02)
+            if isinstance(m, nn.ConvTranspose2d):
+                m.weight.data.normal_(0.0, 0.02)
+
+
+class Decoder(nn.Module):
+
+    def __init__(self, en_num, feature_num, inter_num, sam_number):
+        super(Decoder, self).__init__()
+        self.preconv_3 = conv_relu(4 * en_num, feature_num, 3, padding=1)
+        self.decoder_3 = Decoder_Level(feature_num, inter_num, sam_number)
+        self.preconv_2 = conv_relu(
+            2 * en_num + feature_num, feature_num, 3, padding=1)
+        self.decoder_2 = Decoder_Level(feature_num, inter_num, sam_number)
+        self.preconv_1 = conv_relu(
+            en_num + feature_num, feature_num, 3, padding=1)
+        self.decoder_1 = Decoder_Level(feature_num, inter_num, sam_number)
+
+    def forward(self, y_1, y_2, y_3):
+        x_3 = y_3
+        x_3 = self.preconv_3(x_3)
+        out_3, feat_3 = self.decoder_3(x_3)
+        x_2 = torch.cat([y_2, feat_3], dim=1)
+        x_2 = self.preconv_2(x_2)
+        out_2, feat_2 = self.decoder_2(x_2)
+        x_1 = torch.cat([y_1, feat_2], dim=1)
+        x_1 = self.preconv_1(x_1)
+        out_1 = self.decoder_1(x_1, feat=False)
+        return out_1, out_2, out_3
+
+
+class Encoder(nn.Module):
+
+    def __init__(self, feature_num, inter_num, sam_number):
+        super(Encoder, self).__init__()
+        self.conv_first = nn.Sequential(
+            nn.Conv2d(
+                12, feature_num, kernel_size=5, stride=1, padding=2,
+                bias=True), nn.ReLU(inplace=True))
+        self.encoder_1 = Encoder_Level(
+            feature_num, inter_num, level=1, sam_number=sam_number)
+        self.encoder_2 = Encoder_Level(
+            2 * feature_num, inter_num, level=2, sam_number=sam_number)
+        self.encoder_3 = Encoder_Level(
+            4 * feature_num, inter_num, level=3, sam_number=sam_number)
+
+    def forward(self, x):
+        x = F.pixel_unshuffle(x, 2)
+        x = self.conv_first(x)
+        out_feature_1, down_feature_1 = self.encoder_1(x)
+        out_feature_2, down_feature_2 = self.encoder_2(down_feature_1)
+        out_feature_3 = self.encoder_3(down_feature_2)
+        return out_feature_1, out_feature_2, out_feature_3
+
+
+class Encoder_Level(nn.Module):
+
+    def __init__(self, feature_num, inter_num, level, sam_number):
+        super(Encoder_Level, self).__init__()
+        self.rdb = RDB(
+            in_channel=feature_num, d_list=(1, 2, 1), inter_num=inter_num)
+        self.sam_blocks = nn.ModuleList()
+        for _ in range(sam_number):
+            sam_block = SAM(
+                in_channel=feature_num,
+                d_list=(1, 2, 3, 2, 1),
+                inter_num=inter_num)
+            self.sam_blocks.append(sam_block)
+        if level < 3:
+            self.down = nn.Sequential(
+                nn.Conv2d(
+                    feature_num,
+                    2 * feature_num,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
+                    bias=True), nn.ReLU(inplace=True))
+        self.level = level
+
+    def forward(self, x):
+        out_feature = self.rdb(x)
+        for sam_block in self.sam_blocks:
+            out_feature = sam_block(out_feature)
+        if self.level < 3:
+            down_feature = self.down(out_feature)
+            return out_feature, down_feature
+        return out_feature
+
+
+class Decoder_Level(nn.Module):
+
+    def __init__(self, feature_num, inter_num, sam_number):
+        super(Decoder_Level, self).__init__()
+        self.rdb = RDB(feature_num, (1, 2, 1), inter_num)
+        self.sam_blocks = nn.ModuleList()
+        for _ in range(sam_number):
+            sam_block = SAM(
+                in_channel=feature_num,
+                d_list=(1, 2, 3, 2, 1),
+                inter_num=inter_num)
+            self.sam_blocks.append(sam_block)
+        self.conv = conv(
+            in_channel=feature_num, out_channel=12, kernel_size=3, padding=1)
+
+    def forward(self, x, feat=True):
+        x = self.rdb(x)
+        for sam_block in self.sam_blocks:
+            x = sam_block(x)
+        out = self.conv(x)
+        out = F.pixel_shuffle(out, 2)
+        if feat:
+            feature = F.interpolate(x, scale_factor=2, mode='bilinear')
+            return out, feature
+        else:
+            return out
+
+
+class DB(nn.Module):
+
+    def __init__(self, in_channel, d_list, inter_num):
+        super(DB, self).__init__()
+        self.d_list = d_list
+        self.conv_layers = nn.ModuleList()
+        c = in_channel
+        for i in range(len(d_list)):
+            dense_conv = conv_relu(
+                in_channel=c,
+                out_channel=inter_num,
+                kernel_size=3,
+                dilation_rate=d_list[i],
+                padding=d_list[i])
+            self.conv_layers.append(dense_conv)
+            c = c + inter_num
+        self.conv_post = conv(
+            in_channel=c, out_channel=in_channel, kernel_size=1)
+
+    def forward(self, x):
+        t = x
+        for conv_layer in self.conv_layers:
+            _t = conv_layer(t)
+            t = torch.cat([_t, t], dim=1)
+        t = self.conv_post(t)
+        return t
+
+
+class SAM(nn.Module):
+
+    def __init__(self, in_channel, d_list, inter_num):
+        super(SAM, self).__init__()
+        self.basic_block = DB(
+            in_channel=in_channel, d_list=d_list, inter_num=inter_num)
+        self.basic_block_2 = DB(
+            in_channel=in_channel, d_list=d_list, inter_num=inter_num)
+        self.basic_block_4 = DB(
+            in_channel=in_channel, d_list=d_list, inter_num=inter_num)
+        self.fusion = CSAF(3 * in_channel)
+
+    def forward(self, x):
+        x_0 = x
+        x_2 = F.interpolate(x, scale_factor=0.5, mode='bilinear')
+        x_4 = F.interpolate(x, scale_factor=0.25, mode='bilinear')
+        y_0 = self.basic_block(x_0)
+        y_2 = self.basic_block_2(x_2)
+        y_4 = self.basic_block_4(x_4)
+        y_2 = F.interpolate(y_2, scale_factor=2, mode='bilinear')
+        y_4 = F.interpolate(y_4, scale_factor=4, mode='bilinear')
+        y = self.fusion(y_0, y_2, y_4)
+        y = x + y
+        return y
+
+
+class CSAF(nn.Module):
+
+    def __init__(self, in_chnls, ratio=4):
+        super(CSAF, self).__init__()
+        self.squeeze = nn.AdaptiveAvgPool2d((1, 1))
+        self.compress1 = nn.Conv2d(in_chnls, in_chnls // ratio, 1, 1, 0)
+        self.compress2 = nn.Conv2d(in_chnls // ratio, in_chnls // ratio, 1, 1,
+                                   0)
+        self.excitation = nn.Conv2d(in_chnls // ratio, in_chnls, 1, 1, 0)
+
+    def forward(self, x0, x2, x4):
+        out0 = self.squeeze(x0)
+        out2 = self.squeeze(x2)
+        out4 = self.squeeze(x4)
+        out = torch.cat([out0, out2, out4], dim=1)
+        out = self.compress1(out)
+        out = F.relu(out)
+        out = self.compress2(out)
+        out = F.relu(out)
+        out = self.excitation(out)
+        out = F.sigmoid(out)
+        w0, w2, w4 = torch.chunk(out, 3, dim=1)
+        x = x0 * w0 + x2 * w2 + x4 * w4
+        return x
+
+
+class RDB(nn.Module):
+
+    def __init__(self, in_channel, d_list, inter_num):
+        super(RDB, self).__init__()
+        self.d_list = d_list
+        self.conv_layers = nn.ModuleList()
+        c = in_channel
+        for i in range(len(d_list)):
+            dense_conv = conv_relu(
+                in_channel=c,
+                out_channel=inter_num,
+                kernel_size=3,
+                dilation_rate=d_list[i],
+                padding=d_list[i])
+            self.conv_layers.append(dense_conv)
+            c = c + inter_num
+        self.conv_post = conv(
+            in_channel=c, out_channel=in_channel, kernel_size=1)
+
+    def forward(self, x):
+        t = x
+        for conv_layer in self.conv_layers:
+            _t = conv_layer(t)
+            t = torch.cat([_t, t], dim=1)
+        t = self.conv_post(t)
+        return t + x
+
+
+class conv(nn.Module):
+
+    def __init__(self,
+                 in_channel,
+                 out_channel,
+                 kernel_size,
+                 dilation_rate=1,
+                 padding=0,
+                 stride=1):
+        super(conv, self).__init__()
+        self.conv = nn.Conv2d(
+            in_channels=in_channel,
+            out_channels=out_channel,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            bias=True,
+            dilation=dilation_rate)
+
+    def forward(self, x_input):
+        out = self.conv(x_input)
+        return out
+
+
+class conv_relu(nn.Module):
+
+    def __init__(self,
+                 in_channel,
+                 out_channel,
+                 kernel_size,
+                 dilation_rate=1,
+                 padding=0,
+                 stride=1):
+        super(conv_relu, self).__init__()
+        self.conv = nn.Sequential(
+            nn.Conv2d(
+                in_channels=in_channel,
+                out_channels=out_channel,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                bias=True,
+                dilation=dilation_rate), nn.ReLU(inplace=True))
+
+    def forward(self, x_input):
+        out = self.conv(x_input)
+        return out
diff --git a/modelscope/models/cv/image_restoration/image_restoration_model.py b/modelscope/models/cv/image_restoration/image_restoration_model.py
new file mode 100644
index 00000000..b60a360c
--- /dev/null
+++ b/modelscope/models/cv/image_restoration/image_restoration_model.py
@@ -0,0 +1,69 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+
+import cv2
+import numpy as np
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from .demoire_models import model_map
+
+
+@MODELS.register_module(
+    Tasks.image_demoireing, module_name=Models.image_restoration)
+class ImageRestorationModel(TorchModel):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """str -- model file root."""
+        super().__init__(model_dir, *args, **kwargs)
+        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+        config_path = osp.join(model_dir, ModelFile.CONFIGURATION)
+        config = Config.from_file(config_path)
+        model_name = config.model.network_type
+        model_class = model_map[model_name]
+        self.model = model_class(**config.model.network_param)
+        checkpoint = torch.load(model_path, map_location='cpu')
+        self.model.load_state_dict(checkpoint)
+        self.model.eval()
+        self.pad_32 = config.preprocessor.pad_32
+
+    def inference(self, data):
+        """data is tensor -1 * C * H * W ---> return tensor -1 * C * H * W ."""
+        if next(self.model.parameters()).is_cuda:
+            data = data.to(
+                torch.device([next(self.model.parameters()).device][0]))
+        with torch.no_grad():
+            results = self.model(data)
+        if next(self.model.parameters()).is_cuda:
+            return results[0].cpu()
+        return results[0]
+
+    def forward(self, inputs):
+        """inputs is dict"""
+        data = self.inference(inputs['img'])
+        outputs = inputs
+        outputs['img'] = data
+        return outputs
+
+    def postprocess(self, inputs):
+        """ inputs is dict return is numpy"""
+        data = inputs['img'][0, :, :, :]
+        if self.pad_32:
+            h_pad = inputs['h_pad']
+            h_odd_pad = inputs['h_odd_pad']
+            w_pad = inputs['w_pad']
+            w_odd_pad = inputs['w_odd_pad']
+            if h_pad != 0:
+                data = data[:, h_pad:-h_odd_pad, :]
+            if w_pad != 0:
+                data = data[:, :, w_pad:-w_odd_pad]
+        data_norm_np = np.array(np.clip(data.numpy(), 0, 1)
+                                * 255).astype('uint8').transpose(1, 2, 0)
+        if data_norm_np.shape[0] != inputs['img_h']:
+            data_norm_np = cv2.resize(data_norm_np,
+                                      (inputs['img_w'], inputs['img_h']))
+        return data_norm_np
diff --git a/modelscope/models/cv/image_semantic_segmentation/__init__.py b/modelscope/models/cv/image_semantic_segmentation/__init__.py
index df56c5b8..46eeeee4 100644
--- a/modelscope/models/cv/image_semantic_segmentation/__init__.py
+++ b/modelscope/models/cv/image_semantic_segmentation/__init__.py
@@ -6,11 +6,13 @@ from modelscope.utils.import_utils import LazyImportModule
 if TYPE_CHECKING:
     from .semantic_seg_model import SemanticSegmentation
     from .segformer import Segformer
+    from .ddpm_segmentation_model import DDPMSegmentationModel
 
 else:
     _import_structure = {
         'semantic_seg_model': ['SemanticSegmentation'],
-        'segformer': ['Segformer']
+        'segformer': ['Segformer'],
+        'ddpm_segmentation_model': ['DDPMSegmentationModel']
     }
 
     import sys
diff --git a/modelscope/models/cv/image_semantic_segmentation/ddpm_seg/__init__.py b/modelscope/models/cv/image_semantic_segmentation/ddpm_seg/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/image_semantic_segmentation/ddpm_seg/data_util.py b/modelscope/models/cv/image_semantic_segmentation/ddpm_seg/data_util.py
new file mode 100644
index 00000000..b84d223d
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/ddpm_seg/data_util.py
@@ -0,0 +1,291 @@
+# The implementation here is adopted from ddpm-segmentation,
+# originally Apache 2.0 License and publicly avaialbe at https://github.com/yandex-research/ddpm-segmentation
+
+
+def get_palette(category):
+    if category == 'ffhq_34':
+        return ffhq_palette
+    elif category == 'bedroom_28':
+        return bedroom_palette
+    elif category == 'cat_15':
+        return cat_palette
+    elif category == 'horse_21':
+        return horse_palette
+    elif category == 'ade_bedroom_30':
+        return ade_bedroom_30_palette
+    elif category == 'celeba_19':
+        return celeba_palette
+
+
+def get_class_names(category):
+    if category == 'ffhq_34':
+        return ffhq_class
+    elif category == 'bedroom_28':
+        return bedroom_class
+    elif category == 'cat_15':
+        return cat_class
+    elif category == 'horse_21':
+        return horse_class
+    elif category == 'ade_bedroom_30':
+        return ade_bedroom_30_class
+    elif category == 'celeba_19':
+        return celeba_class
+
+
+###############
+# Class names #
+###############
+
+bedroom_class = [
+    'background', 'bed', 'bed***footboard', 'bed***headboard',
+    'bed***side rail', 'carpet', 'ceiling', 'chandelier / ceiling fan blade',
+    'curtain', 'cushion', 'floor', 'table/nightstand/dresser',
+    'table/nightstand/dresser***top', 'picture / mirrow', 'pillow',
+    'lamp***column', 'lamp***shade', 'wall', 'window', 'curtain rod',
+    'window***frame', 'chair', 'picture / mirror***frame', 'plinth',
+    'door / door frame', 'pouf', 'wardrobe', 'plant', 'table staff'
+]
+
+ffhq_class = [
+    'background', 'head', 'head***cheek', 'head***chin', 'head***ear',
+    'head***ear***helix', 'head***ear***lobule', 'head***eye***bottom lid',
+    'head***eye***eyelashes', 'head***eye***iris', 'head***eye***pupil',
+    'head***eye***sclera', 'head***eye***tear duct', 'head***eye***top lid',
+    'head***eyebrow', 'head***forehead', 'head***frown', 'head***hair',
+    'head***hair***sideburns', 'head***jaw', 'head***moustache',
+    'head***mouth***inferior lip', 'head***mouth***oral commissure',
+    'head***mouth***superior lip', 'head***mouth***teeth', 'head***neck',
+    'head***nose', 'head***nose***ala of nose', 'head***nose***bridge',
+    'head***nose***nose tip', 'head***nose***nostril', 'head***philtrum',
+    'head***temple', 'head***wrinkles'
+]
+
+cat_class = [
+    'background', 'back', 'belly', 'chest', 'leg', 'paw', 'head', 'ear', 'eye',
+    'mouth', 'tongue', 'nose', 'tail', 'whiskers', 'neck'
+]
+
+horse_class = [
+    'background', 'person', 'back', 'barrel', 'bridle', 'chest', 'ear', 'eye',
+    'forelock', 'head', 'hoof', 'leg', 'mane', 'muzzle', 'neck', 'nostril',
+    'tail', 'thigh', 'saddle', 'shoulder', 'leg protection'
+]
+
+celeba_class = [
+    'background', 'cloth', 'ear_r', 'eye_g', 'hair', 'hat', 'l_brow', 'l_ear',
+    'l_eye', 'l_lip', 'mouth', 'neck', 'neck_l', 'nose', 'r_brow', 'r_ear',
+    'r_eye', 'skin', 'u_lip'
+]
+
+ade_bedroom_50_class = [
+    'wall', 'bed', 'floor', 'table', 'lamp', 'ceiling', 'painting',
+    'windowpane', 'pillow', 'curtain', 'cushion', 'door', 'chair', 'cabinet',
+    'chest', 'mirror', 'rug', 'armchair', 'book', 'sconce', 'plant',
+    'wardrobe', 'clock', 'light', 'flower', 'vase', 'fan', 'box', 'shelf',
+    'television', 'blind', 'pot', 'ottoman', 'sofa', 'desk', 'basket',
+    'blanket', 'coffee', 'plaything', 'radiator', 'tray', 'stool', 'bottle',
+    'chandelier', 'fireplacel', 'towel', 'railing', 'canopy', 'glass', 'plate'
+]
+
+ade_bedroom_40_class = ade_bedroom_50_class[:40]
+ade_bedroom_30_class = ade_bedroom_50_class[:30]
+
+###########
+# Palette #
+###########
+
+ffhq_palette = [
+    1.0000, 1.0000, 1.0000, 0.4420, 0.5100, 0.4234, 0.8562, 0.9537, 0.3188,
+    0.2405, 0.4699, 0.9918, 0.8434, 0.9329, 0.7544, 0.3748, 0.7917, 0.3256,
+    0.0190, 0.4943, 0.3782, 0.7461, 0.0137, 0.5684, 0.1644, 0.2402, 0.7324,
+    0.0200, 0.4379, 0.4100, 0.5853, 0.8880, 0.6137, 0.7991, 0.9132, 0.9720,
+    0.6816, 0.6237, 0.8562, 0.9981, 0.4692, 0.3849, 0.5351, 0.8242, 0.2731,
+    0.1747, 0.3626, 0.8345, 0.5323, 0.6668, 0.4922, 0.2122, 0.3483, 0.4707,
+    0.6844, 0.1238, 0.1452, 0.3882, 0.4664, 0.1003, 0.2296, 0.0401, 0.3030,
+    0.5751, 0.5467, 0.9835, 0.1308, 0.9628, 0.0777, 0.2849, 0.1846, 0.2625,
+    0.9764, 0.9420, 0.6628, 0.3893, 0.4456, 0.6433, 0.8705, 0.3957, 0.0963,
+    0.6117, 0.9702, 0.0247, 0.3668, 0.6694, 0.3117, 0.6451, 0.7302, 0.9542,
+    0.6171, 0.1097, 0.9053, 0.3377, 0.4950, 0.7284, 0.1655, 0.9254, 0.6557,
+    0.9450, 0.6721, 0.6162
+]
+
+ffhq_palette = [int(item * 255) for item in ffhq_palette]
+
+bedroom_palette = [
+    255,
+    255,
+    255,  # bg
+    238,
+    229,
+    102,  # bed
+    255,
+    72,
+    69,  # bed footboard
+    124,
+    99,
+    34,  # bed headboard
+    193,
+    127,
+    15,  # bed side rail
+    106,
+    177,
+    21,  # carpet
+    248,
+    213,
+    43,  # ceiling
+    252,
+    155,
+    83,  # chandelier / ceiling fan blade
+    220,
+    147,
+    77,  # curtain
+    99,
+    83,
+    3,  # cushion
+    116,
+    116,
+    138,  # floor
+    63,
+    182,
+    24,  # table/nightstand/dresser
+    200,
+    226,
+    37,  # table/nightstand/dresser top
+    225,
+    184,
+    161,  # picture / mirrow
+    233,
+    5,
+    219,  # pillow
+    142,
+    172,
+    248,  # lamp column
+    153,
+    112,
+    146,  # lamp shade
+    38,
+    112,
+    254,  # wall
+    229,
+    30,
+    141,  # window
+    99,
+    205,
+    255,  # curtain rod
+    74,
+    59,
+    83,  # window frame
+    186,
+    9,
+    0,  # chair
+    107,
+    121,
+    0,  # picture / mirrow frame
+    0,
+    194,
+    160,  # plinth
+    255,
+    170,
+    146,  # door / door frame
+    255,
+    144,
+    201,  # pouf
+    185,
+    3,
+    170,  # wardrobe
+    221,
+    239,
+    255,  # plant
+    0,
+    0,
+    53,  # table staff
+]
+
+cat_palette = [
+    255, 255, 255, 190, 153, 153, 250, 170, 30, 220, 220, 0, 107, 142, 35, 102,
+    102, 156, 152, 251, 152, 119, 11, 32, 244, 35, 232, 220, 20, 60, 52, 83,
+    84, 194, 87, 125, 143, 176, 255, 31, 102, 211, 104, 131, 101
+]
+
+horse_palette = [
+    255, 255, 255, 255, 74, 70, 0, 137, 65, 0, 111, 166, 163, 0, 89, 255, 219,
+    229, 122, 73, 0, 0, 0, 166, 99, 255, 172, 183, 151, 98, 0, 77, 67, 143,
+    176, 255, 241, 38, 110, 27, 210, 105, 128, 150, 147, 228, 230, 158, 160,
+    136, 106, 79, 198, 1, 59, 93, 255, 115, 214, 209, 255, 47, 128
+]
+
+celeba_palette = [
+    255,
+    255,
+    255,  # 0 background
+    238,
+    229,
+    102,  # 1 cloth
+    250,
+    150,
+    50,  # 2 ear_r
+    124,
+    99,
+    34,  # 3 eye_g
+    193,
+    127,
+    15,  # 4 hair
+    225,
+    96,
+    18,  # 5 hat
+    220,
+    147,
+    77,  # 6 l_brow
+    99,
+    83,
+    3,  # 7 l_ear
+    116,
+    116,
+    138,  # 8 l_eye
+    200,
+    226,
+    37,  # 9 l_lip
+    225,
+    184,
+    161,  # 10 mouth
+    142,
+    172,
+    248,  # 11 neck
+    153,
+    112,
+    146,  # 12 neck_l
+    38,
+    112,
+    254,  # 13 nose
+    229,
+    30,
+    141,  # 14 r_brow
+    52,
+    83,
+    84,  # 15 r_ear
+    194,
+    87,
+    125,  # 16 r_eye
+    248,
+    213,
+    42,  # 17 skin
+    31,
+    102,
+    211,  # 18 u_lip
+]
+
+ade_bedroom_50_palette = [
+    240, 156, 206, 69, 88, 93, 240, 49, 184, 27, 107, 126, 50, 82, 241, 54,
+    250, 147, 156, 213, 3, 176, 108, 79, 251, 150, 149, 66, 51, 34, 210, 97,
+    53, 30, 53, 102, 232, 164, 118, 204, 150, 17, 101, 86, 178, 249, 20, 213,
+    54, 35, 82, 157, 68, 216, 58, 161, 73, 174, 67, 67, 193, 181, 78, 169, 60,
+    178, 220, 204, 166, 4, 127, 85, 245, 106, 216, 222, 172, 168, 84, 148, 105,
+    137, 220, 89, 68, 252, 126, 29, 193, 187, 74, 40, 101, 52, 71, 61, 38, 92,
+    205, 40, 104, 224, 146, 74, 160, 69, 43, 220, 70, 78, 213, 249, 93, 254,
+    235, 71, 119, 193, 255, 102, 152, 55, 238, 133, 12, 223, 106, 116, 123, 86,
+    14, 174, 244, 160, 161, 142, 105, 60, 153, 61, 124, 195, 156, 253, 241, 84,
+    222, 202, 171, 227
+]
+
+ade_bedroom_40_palette = ade_bedroom_50_palette[:120]
+ade_bedroom_30_palette = ade_bedroom_50_palette[:90]
diff --git a/modelscope/models/cv/image_semantic_segmentation/ddpm_seg/feature_extractors.py b/modelscope/models/cv/image_semantic_segmentation/ddpm_seg/feature_extractors.py
new file mode 100644
index 00000000..4973b6b7
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/ddpm_seg/feature_extractors.py
@@ -0,0 +1,137 @@
+# The implementation here is modified based on ddpm-segmentation,
+# originally Apache 2.0 License and publicly avaialbe at https://github.com/yandex-research/ddpm-segmentation
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import List
+
+import torch
+from torch import nn
+
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def create_feature_extractor(model_type, **kwargs):
+    """ Create the feature extractor for <model_type> architecture. """
+    if model_type == 'ddpm':
+        logger.info('Creating DDPM Feature Extractor...')
+        feature_extractor = FeatureExtractorDDPM(**kwargs)
+    else:
+        raise Exception(f'Wrong model type: {model_type}')
+    return feature_extractor
+
+
+def save_tensors(module: nn.Module, features, name: str):
+    """ Process and save activations in the module. """
+    if type(features) in [list, tuple]:
+        features = [
+            f.detach().float() if f is not None else None for f in features
+        ]
+        setattr(module, name, features)
+    elif isinstance(features, dict):
+        features = {k: f.detach().float() for k, f in features.items()}
+        setattr(module, name, features)
+    else:
+        setattr(module, name, features.detach().float())
+
+
+def save_out_hook(self, inp, out):
+    save_tensors(self, out, 'activations')
+    # return out
+
+
+def save_input_hook(self, inp, out):
+    save_tensors(self, inp[0], 'activations')
+    # return out
+
+
+class FeatureExtractor(nn.Module):
+
+    def __init__(self, model_path: str, input_activations: bool, **kwargs):
+        '''
+        Parent feature extractor class.
+
+        param: model_path: path to the pretrained model
+        param: input_activations:
+            If True, features are input activations of the corresponding blocks
+            If False, features are output activations of the corresponding blocks
+        '''
+        super().__init__()
+        self._load_pretrained_model(model_path, **kwargs)
+        logger.info(
+            f'Pretrained model is successfully loaded from {model_path}')
+        self.save_hook = save_input_hook if input_activations else save_out_hook
+        self.feature_blocks = []
+
+    def _load_pretrained_model(self, model_path: str, **kwargs):
+        pass
+
+
+class FeatureExtractorDDPM(FeatureExtractor):
+    '''
+    Wrapper to extract features from pretrained DDPMs.
+
+    :param steps: list of diffusion steps t.
+    :param blocks: list of the UNet decoder blocks.
+    '''
+
+    def __init__(self, steps: List[int], blocks: List[int], **kwargs):
+        super().__init__(**kwargs)
+        self.steps = steps
+
+        # Save decoder activations
+        for idx, block in enumerate(self.model.output_blocks):
+            if idx in blocks:
+                block.register_forward_hook(self.save_hook)
+                self.feature_blocks.append(block)
+
+    def _load_pretrained_model(self, model_path, **kwargs):
+        import inspect
+        from ddpm_guided_diffusion.script_util import create_model_and_diffusion
+
+        # Needed to pass only expected args to the function
+        argnames = inspect.getfullargspec(create_model_and_diffusion)[0]
+        expected_args = {name: kwargs[name] for name in argnames}
+        self.model, self.diffusion = create_model_and_diffusion(
+            **expected_args)
+
+        state_dict = torch.load(model_path, map_location='cpu')
+
+        self.model.load_state_dict(state_dict)
+
+        if kwargs['use_fp16']:
+            self.model.convert_to_fp16()
+        self.model.eval()
+
+    @torch.no_grad()
+    def forward(self, x, noise=None):
+        activations = []
+        for t in self.steps:
+            # Compute x_t and run DDPM
+            t = torch.tensor([t]).to(x.device)
+
+            noisy_x = self.diffusion.q_sample(x, t, noise=noise)
+            self.model(noisy_x, self.diffusion._scale_timesteps(t))
+
+            # Extract activations
+            for block in self.feature_blocks:
+                activations.append(block.activations)
+                block.activations = None
+
+        # Per-layer list of activations [N, C, H, W]
+        return activations
+
+
+def collect_features(cfg, activations: List[torch.Tensor], sample_idx=0):
+    """ Upsample activations and concatenate them to form a feature tensor """
+    assert all([isinstance(acts, torch.Tensor) for acts in activations])
+    size = tuple(cfg.mlp.dim[:-1])
+    resized_activations = []
+    for feats in activations:
+        feats = feats[sample_idx][None]
+        feats = nn.functional.interpolate(
+            feats, size=size, mode=cfg.ddpm.upsample_mode)
+        resized_activations.append(feats[0])
+
+    return torch.cat(resized_activations, dim=0)
diff --git a/modelscope/models/cv/image_semantic_segmentation/ddpm_seg/pixel_classifier.py b/modelscope/models/cv/image_semantic_segmentation/ddpm_seg/pixel_classifier.py
new file mode 100644
index 00000000..5af3b3c4
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/ddpm_seg/pixel_classifier.py
@@ -0,0 +1,151 @@
+# The implementation here is modified based on ddpm-segmentation,
+# originally Apache 2.0 License and publicly avaialbe at https://github.com/yandex-research/ddpm-segmentation
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+from collections import Counter
+
+import numpy as np
+import torch
+import torch.nn as nn
+from PIL import Image
+from torch.distributions import Categorical
+
+from .data_util import get_class_names, get_palette
+from .utils import colorize_mask, oht_to_scalar
+
+
+# Adopted from https://github.com/nv-tlabs/datasetGAN/train_interpreter.py
+class pixel_classifier(nn.Module):
+
+    def __init__(self, category, dim, **kwargs):
+        super(pixel_classifier, self).__init__()
+        category_cfg = kwargs.get(category, None)
+        assert category_cfg is not None
+        class_num = category_cfg['number_class']
+
+        dim = dim[-1]
+
+        if class_num < 30:
+            self.layers = nn.Sequential(
+                nn.Linear(dim,
+                          128), nn.ReLU(), nn.BatchNorm1d(num_features=128),
+                nn.Linear(128, 32), nn.ReLU(), nn.BatchNorm1d(num_features=32),
+                nn.Linear(32, class_num))
+        else:
+            self.layers = nn.Sequential(
+                nn.Linear(dim,
+                          256), nn.ReLU(), nn.BatchNorm1d(num_features=256),
+                nn.Linear(256, 128), nn.ReLU(),
+                nn.BatchNorm1d(num_features=128), nn.Linear(128, class_num))
+
+    def init_weights(self, init_type='normal', gain=0.02):
+        '''
+        initialize network's weights
+        init_type: normal | xavier | kaiming | orthogonal
+        https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/9451e70673400885567d08a9e97ade2524c700d0/models/networks.py#L39
+        '''
+
+        def init_func(m):
+            classname = m.__class__.__name__
+            if hasattr(m, 'weight') and (classname.find('Conv') != -1
+                                         or classname.find('Linear') != -1):
+                if init_type == 'normal':
+                    nn.init.normal_(m.weight.data, 0.0, gain)
+                elif init_type == 'xavier':
+                    nn.init.xavier_normal_(m.weight.data, gain=gain)
+                elif init_type == 'kaiming':
+                    nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
+                elif init_type == 'orthogonal':
+                    nn.init.orthogonal_(m.weight.data, gain=gain)
+
+                if hasattr(m, 'bias') and m.bias is not None:
+                    nn.init.constant_(m.bias.data, 0.0)
+
+            elif classname.find('BatchNorm2d') != -1:
+                nn.init.normal_(m.weight.data, 1.0, gain)
+                nn.init.constant_(m.bias.data, 0.0)
+
+        self.apply(init_func)
+
+    def forward(self, x):
+        return self.layers(x)
+
+
+def predict_labels(models, features, size):
+    if isinstance(features, np.ndarray):
+        features = torch.from_numpy(features)
+
+    mean_seg = None
+    all_seg = []
+    all_entropy = []
+    seg_mode_ensemble = []
+
+    softmax_f = nn.Softmax(dim=1)
+    with torch.no_grad():
+        for MODEL_NUMBER in range(len(models)):
+            models[MODEL_NUMBER].to(features.device)
+            preds = models[MODEL_NUMBER](features)
+            entropy = Categorical(logits=preds).entropy()
+            all_entropy.append(entropy)
+            all_seg.append(preds)
+
+            if mean_seg is None:
+                mean_seg = softmax_f(preds)
+            else:
+                mean_seg += softmax_f(preds)
+
+            img_seg = oht_to_scalar(preds)
+            img_seg = img_seg.reshape(*size)
+            img_seg = img_seg.cpu().detach()
+
+            seg_mode_ensemble.append(img_seg)
+
+        mean_seg = mean_seg / len(all_seg)
+
+        full_entropy = Categorical(mean_seg).entropy()
+
+        js = full_entropy - torch.mean(torch.stack(all_entropy), 0)
+        top_k = js.sort()[0][-int(js.shape[0] / 10):].mean()
+
+        img_seg_final = torch.stack(seg_mode_ensemble, dim=-1)
+        img_seg_final = torch.mode(img_seg_final, 2)[0]
+    return img_seg_final, top_k
+
+
+def load_ensemble(model_num=1,
+                  model_path='',
+                  device='cpu',
+                  category='ffhq_34',
+                  dim=[256, 256, 8448],
+                  **kwargs):
+    models = []
+
+    for i in range(model_num):
+        per_model_path = os.path.join(model_path, f'model_{i}.pth')
+        state_dict = torch.load(
+            per_model_path, map_location='cpu')['model_state_dict']
+        new_state_dict = {
+            k.replace('module.', ''): v
+            for k, v in state_dict.items()
+        }
+        model = pixel_classifier(category, dim, **kwargs)
+        model.load_state_dict(new_state_dict)
+        models.append(model.eval())
+    return models
+
+
+def save_predictions(preds, category='ffhq_34'):
+    palette = get_palette(category)
+
+    masks = []
+    out_imgs = []
+    for i, pred in enumerate(preds['pred']):
+
+        pred = np.squeeze(pred)
+        masks.append(pred)
+
+        out_img = colorize_mask(pred, palette)
+        out_img = Image.fromarray(out_img)
+        out_imgs.append(out_img)
+    return masks, out_imgs
diff --git a/modelscope/models/cv/image_semantic_segmentation/ddpm_seg/utils.py b/modelscope/models/cv/image_semantic_segmentation/ddpm_seg/utils.py
new file mode 100644
index 00000000..3eeafe96
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/ddpm_seg/utils.py
@@ -0,0 +1,71 @@
+"""
+Copyright (C) 2021 NVIDIA Corporation.  All rights reserved.
+Licensed under The MIT License (MIT)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+import random
+
+import numpy as np
+import torch
+from PIL import Image
+
+
+def multi_acc(y_pred, y_test):
+    y_pred_softmax = torch.log_softmax(y_pred, dim=1)
+    _, y_pred_tags = torch.max(y_pred_softmax, dim=1)
+
+    correct_pred = (y_pred_tags == y_test).float()
+    acc = correct_pred.sum() / len(correct_pred)
+
+    acc = acc * 100
+
+    return acc
+
+
+def oht_to_scalar(y_pred):
+    y_pred_softmax = torch.log_softmax(y_pred, dim=1)
+    _, y_pred_tags = torch.max(y_pred_softmax, dim=1)
+
+    return y_pred_tags
+
+
+def colorize_mask(mask, palette):
+    # mask: numpy array of the mask
+    new_mask = Image.fromarray(mask.astype(np.uint8)).convert('P')
+    new_mask.putpalette(palette)
+    return np.array(new_mask.convert('RGB'))
+
+
+def to_labels(masks, palette):
+    results = np.zeros((len(masks), 256, 256), dtype=np.int32)
+    label = 0
+    for color in palette:
+        idxs = np.where((masks == color).all(-1))
+        results[idxs] = label
+        label += 1
+    return results
+
+
+def setup_seed(seed):
+    print('Seed: ', seed)
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
diff --git a/modelscope/models/cv/image_semantic_segmentation/ddpm_segmentation_model.py b/modelscope/models/cv/image_semantic_segmentation/ddpm_segmentation_model.py
new file mode 100644
index 00000000..ea023e1b
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/ddpm_segmentation_model.py
@@ -0,0 +1,80 @@
+# The implementation here is modified based on ddpm-segmentation,
+# originally Apache 2.0 License and publicly avaialbe at https://github.com/yandex-research/ddpm-segmentation
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os.path as osp
+from typing import Any, Dict
+
+import torch
+from ddpm_guided_diffusion.script_util import model_and_diffusion_defaults
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .ddpm_seg.feature_extractors import (collect_features,
+                                          create_feature_extractor)
+from .ddpm_seg.pixel_classifier import (load_ensemble, pixel_classifier,
+                                        predict_labels, save_predictions)
+
+logger = get_logger()
+
+
+@MODELS.register_module(Tasks.semantic_segmentation, module_name=Models.ddpm)
+class DDPMSegmentationModel(TorchModel):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """str -- model file root."""
+        super().__init__(model_dir, *args, **kwargs)
+
+        config_path = osp.join(model_dir, ModelFile.CONFIGURATION)
+        self.cfg = Config.from_file(config_path)
+        self.cfg.model.mlp.category = self.cfg.pipeline.category
+
+        self.cfg.model.ddpm.model_path = osp.join(model_dir,
+                                                  ModelFile.TORCH_MODEL_FILE)
+        default_ddpm_args = model_and_diffusion_defaults()
+        default_ddpm_args.update(self.cfg.model.ddpm)
+        self.feature_extractor = create_feature_extractor(**default_ddpm_args)
+
+        self.cfg.model.mlp.model_path = osp.join(model_dir,
+                                                 self.cfg.model.mlp.category)
+        self.is_ensemble = kwargs.get('is_pipeline', True)
+        if self.is_ensemble:
+            logger.info('Load ensemble mlp ......')
+            self.seg_model = load_ensemble(**self.cfg.model.mlp)
+        else:
+            logger.info('Load single mlp ......')
+            self.seg_model = pixel_classifier(**self.cfg.model.mlp)
+
+    def forward(self, inputs: Dict[str, Any]) -> Dict[str, torch.Tensor]:
+        re = self.inference(inputs)
+        return re
+
+    def inference(self, batch, seed=0):
+        img = batch['input_img']
+        img = img[None]
+        w, h, c = self.cfg.model.mlp.dim
+
+        if self.cfg.model.ddpm.share_noise:
+            rnd_gen = torch.Generator().manual_seed(seed)
+            noise = torch.randn(1, 3, w, h, generator=rnd_gen)
+            noise = noise.to(img.device)
+        else:
+            noise = None
+
+        features = self.feature_extractor(img, noise=noise)
+        features = collect_features(self.cfg.model, features)
+
+        x = features.view(c, -1).permute(1, 0)
+
+        pred, _ = predict_labels(self.seg_model, x, size=(w, h))
+
+        return {'pred': [pred.numpy()]}
+
+    def postprocess(self, inputs: Dict[str, Any], **kwargs):
+        category = self.cfg.model.mlp.category
+        mask, out_img = save_predictions(inputs, category)
+        return mask, out_img
diff --git a/modelscope/models/cv/image_skychange/preprocessor.py b/modelscope/models/cv/image_skychange/preprocessor.py
index 570fb6be..87edca0e 100644
--- a/modelscope/models/cv/image_skychange/preprocessor.py
+++ b/modelscope/models/cv/image_skychange/preprocessor.py
@@ -62,13 +62,13 @@ class ImageSkyChangePreprocessor(Preprocessor):
         Args:
             data (dict): data dict containing following info:
                 sky_image, scene_image
-                example:
-                    ```python
-                    {
-                        "sky_image": "xxx.jpg" # sky_image path(str)
-                        "scene_image": "xxx.jpg", # scene_image path(str)
-                    }
-                    ```
+
+        Example:
+            >>> {
+            >>>     "sky_image": "xxx.jpg" # sky_image path(str)
+            >>>     "scene_image": "xxx.jpg", # scene_image path(str)
+            >>> }
+
         Returns:
             Dict[str, Any]: the preprocessed data
             {
diff --git a/modelscope/models/cv/motion_generation/__init__.py b/modelscope/models/cv/motion_generation/__init__.py
new file mode 100644
index 00000000..0f8cbad7
--- /dev/null
+++ b/modelscope/models/cv/motion_generation/__init__.py
@@ -0,0 +1,24 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+
+    from .model import create_model, load_model_wo_clip
+    from .modules.cfg_sampler import ClassifierFreeSampleModel
+else:
+    _import_structure = {
+        'model': ['create_model', 'load_model_wo_clip'],
+        'modules.cfg_sampler': ['ClassifierFreeSampleModel']
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/motion_generation/model.py b/modelscope/models/cv/motion_generation/model.py
new file mode 100644
index 00000000..aa944ada
--- /dev/null
+++ b/modelscope/models/cv/motion_generation/model.py
@@ -0,0 +1,65 @@
+# This code is borrowed and modified from Human Motion Diffusion Model,
+# made publicly available under MIT license at https://github.com/GuyTevet/motion-diffusion-model
+
+from .modules import gaussian_diffusion as gd
+from .modules.mdm import MDM
+from .modules.respace import SpacedDiffusion, space_timesteps
+
+
+def load_model_wo_clip(model, state_dict):
+    missing_keys, unexpected_keys = model.load_state_dict(
+        state_dict, strict=False)
+    assert len(unexpected_keys) == 0
+    assert all([k.startswith('clip_model.') for k in missing_keys])
+
+
+def create_model(cfg):
+    model = MDM(
+        '',
+        njoints=263,
+        nfeats=1,
+        num_actions=1,
+        translation=True,
+        pose_rep='rot6d',
+        glob=True,
+        glob_rot=True,
+        latent_dim=512,
+        ff_size=1024,
+        smpl_data_path=cfg.smpl_data_path,
+        data_rep='hml_vec',
+        dataset='humanml',
+        clip_version='ViT-B/32',
+        **{
+            'cond_mode': 'text',
+            'cond_mask_prob': 0.1,
+            'action_emb': 'tensor'
+        })
+
+    predict_xstart = True  # we always predict x_start (a.k.a. x0), that's our deal!
+    steps = cfg.sample_steps
+    scale_beta = 1.  # no scaling
+    timestep_respacing = ''  # can be used for ddim sampling, we don't use it.
+    learn_sigma = False
+    rescale_timesteps = False
+
+    betas = gd.get_named_beta_schedule('cosine', steps, scale_beta)
+    loss_type = gd.LossType.MSE
+
+    if not timestep_respacing:
+        timestep_respacing = [steps]
+
+    diffusion = SpacedDiffusion(
+        use_timesteps=space_timesteps(steps, timestep_respacing),
+        betas=betas,
+        model_mean_type=(gd.ModelMeanType.EPSILON
+                         if not predict_xstart else gd.ModelMeanType.START_X),
+        model_var_type=((gd.ModelVarType.FIXED_LARGE
+                         if not True else gd.ModelVarType.FIXED_SMALL)
+                        if not learn_sigma else gd.ModelVarType.LEARNED_RANGE),
+        loss_type=loss_type,
+        rescale_timesteps=rescale_timesteps,
+        lambda_vel=0.0,
+        lambda_rcxyz=0.0,
+        lambda_fc=0.0,
+    )
+    return model, diffusion
diff --git a/modelscope/models/cv/motion_generation/modules/__init__.py b/modelscope/models/cv/motion_generation/modules/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/motion_generation/modules/cfg_sampler.py b/modelscope/models/cv/motion_generation/modules/cfg_sampler.py
new file mode 100644
index 00000000..cd07362d
--- /dev/null
+++ b/modelscope/models/cv/motion_generation/modules/cfg_sampler.py
@@ -0,0 +1,33 @@
+# This code is borrowed and modified from Human Motion Diffusion Model,
+# made publicly available under MIT license at https://github.com/GuyTevet/motion-diffusion-model
+from copy import deepcopy
+
+import torch.nn as nn
+
+
+# A wrapper model for Classifier-free guidance **SAMPLING** only
+# https://arxiv.org/abs/2207.12598
+class ClassifierFreeSampleModel(nn.Module):
+
+    def __init__(self, model):
+        super().__init__()
+        self.model = model  # model is the actual model to run
+
+        assert self.model.cond_mask_prob > 0
+
+        # pointers to inner model
+        self.rot2xyz = self.model.rot2xyz
+        self.translation = self.model.translation
+        self.njoints = self.model.njoints
+        self.nfeats = self.model.nfeats
+        self.data_rep = self.model.data_rep
+        self.cond_mode = self.model.cond_mode
+
+    def forward(self, x, timesteps, y=None):
+        cond_mode = self.model.cond_mode
+        assert cond_mode in ['text', 'action']
+        y_uncond = deepcopy(y)
+        y_uncond['uncond'] = True
+        out = self.model(x, timesteps, y)
+        out_uncond = self.model(x, timesteps, y_uncond)
+        return out_uncond + (y['scale'].view(-1, 1, 1, 1) * (out - out_uncond))
diff --git a/modelscope/models/cv/motion_generation/modules/gaussian_diffusion.py b/modelscope/models/cv/motion_generation/modules/gaussian_diffusion.py
new file mode 100644
index 00000000..2d283642
--- /dev/null
+++ b/modelscope/models/cv/motion_generation/modules/gaussian_diffusion.py
@@ -0,0 +1,666 @@
+# This code is borrowed and modified from Human Motion Diffusion Model,
+# made publicly available under MIT license at https://github.com/GuyTevet/motion-diffusion-model
+
+import enum
+import math
+from copy import deepcopy
+
+import numpy as np
+import torch as th
+
+
+def get_named_beta_schedule(schedule_name,
+                            num_diffusion_timesteps,
+                            scale_betas=1.):
+    """
+    Get a pre-defined beta schedule for the given name.
+
+    The beta schedule library consists of beta schedules which remain similar
+    in the limit of num_diffusion_timesteps.
+    Beta schedules may be added, but should not be removed or changed once
+    they are committed to maintain backwards compatibility.
+    """
+    if schedule_name == 'linear':
+        # Linear schedule from Ho et al, extended to work for any number of
+        # diffusion steps.
+        scale = scale_betas * 1000 / num_diffusion_timesteps
+        beta_start = scale * 0.0001
+        beta_end = scale * 0.02
+        return np.linspace(
+            beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
+    elif schedule_name == 'cosine':
+        return betas_for_alpha_bar(
+            num_diffusion_timesteps,
+            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2)**2,
+        )
+    else:
+        raise NotImplementedError(f'unknown beta schedule: {schedule_name}')
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function,
+    which defines the cumulative product of (1-beta) over time from t = [0,1].
+
+    :param num_diffusion_timesteps: the number of betas to produce.
+    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
+                      produces the cumulative product of (1-beta) up to that
+                      part of the diffusion process.
+    :param max_beta: the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+    """
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+    return np.array(betas)
+
+
+class ModelMeanType(enum.Enum):
+    """
+    Which type of output the model predicts.
+    """
+
+    PREVIOUS_X = enum.auto()  # the model predicts x_{t-1}
+    START_X = enum.auto()  # the model predicts x_0
+    EPSILON = enum.auto()  # the model predicts epsilon
+
+
+class ModelVarType(enum.Enum):
+    """
+    What is used as the model's output variance.
+
+    The LEARNED_RANGE option has been added to allow the model to predict
+    values between FIXED_SMALL and FIXED_LARGE, making its job easier.
+    """
+
+    LEARNED = enum.auto()
+    FIXED_SMALL = enum.auto()
+    FIXED_LARGE = enum.auto()
+    LEARNED_RANGE = enum.auto()
+
+
+class LossType(enum.Enum):
+    MSE = enum.auto()  # use raw MSE loss (and KL when learning variances)
+    RESCALED_MSE = (
+        enum.auto()
+    )  # use raw MSE loss (with RESCALED_KL when learning variances)
+    KL = enum.auto()  # use the variational lower-bound
+    RESCALED_KL = enum.auto()  # like KL, but rescale to estimate the full VLB
+
+    def is_vb(self):
+        return self == LossType.KL or self == LossType.RESCALED_KL
+
+
+class GaussianDiffusion:
+    """
+    Utilities for training and sampling diffusion models.
+
+    Ported directly from here, and then adapted over time to further experimentation.
+    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
+
+    :param betas: a 1-D numpy array of betas for each diffusion timestep,
+                  starting at T and going to 1.
+    :param model_mean_type: a ModelMeanType determining what the model outputs.
+    :param model_var_type: a ModelVarType determining how variance is output.
+    :param loss_type: a LossType determining the loss function to use.
+    :param rescale_timesteps: if True, pass floating point timesteps into the
+                              model so that they are always scaled like in the
+                              original paper (0 to 1000).
+    """
+
+    def __init__(
+        self,
+        *,
+        betas,
+        model_mean_type,
+        model_var_type,
+        loss_type,
+        rescale_timesteps=False,
+        lambda_rcxyz=0.,
+        lambda_vel=0.,
+        lambda_pose=1.,
+        lambda_orient=1.,
+        lambda_loc=1.,
+        data_rep='rot6d',
+        lambda_root_vel=0.,
+        lambda_vel_rcxyz=0.,
+        lambda_fc=0.,
+    ):
+        self.model_mean_type = model_mean_type
+        self.model_var_type = model_var_type
+        self.loss_type = loss_type
+        self.rescale_timesteps = rescale_timesteps
+        self.data_rep = data_rep
+
+        if data_rep != 'rot_vel' and lambda_pose != 1.:
+            raise ValueError(
+                'lambda_pose is relevant only when training on velocities!')
+        self.lambda_pose = lambda_pose
+        self.lambda_orient = lambda_orient
+        self.lambda_loc = lambda_loc
+
+        self.lambda_rcxyz = lambda_rcxyz
+        self.lambda_vel = lambda_vel
+        self.lambda_root_vel = lambda_root_vel
+        self.lambda_vel_rcxyz = lambda_vel_rcxyz
+        self.lambda_fc = lambda_fc
+
+        if self.lambda_rcxyz > 0. or self.lambda_vel > 0. or self.lambda_root_vel > 0. or \
+                self.lambda_vel_rcxyz > 0. or self.lambda_fc > 0.:
+            assert self.loss_type == LossType.MSE, 'Geometric losses are supported by MSE loss type only!'
+
+        # Use float64 for accuracy.
+        betas = np.array(betas, dtype=np.float64)
+        self.betas = betas
+        assert len(betas.shape) == 1, 'betas must be 1-D'
+        assert (betas > 0).all() and (betas <= 1).all()
+
+        self.num_timesteps = int(betas.shape[0])
+
+        alphas = 1.0 - betas
+        self.alphas_cumprod = np.cumprod(alphas, axis=0)
+        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
+        self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
+        assert self.alphas_cumprod_prev.shape == (self.num_timesteps, )
+
+        # calculations for diffusion q(x_t | x_{t-1}) and others
+        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
+        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
+        self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
+        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
+        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod
+                                                   - 1)
+
+        # calculations for posterior q(x_{t-1} | x_t, x_0)
+        self.posterior_variance = betas * (1.0 - self.alphas_cumprod_prev) / (
+            1.0 - self.alphas_cumprod)
+        # log calculation clipped because the posterior variance is 0 at the
+        # beginning of the diffusion chain.
+        self.posterior_log_variance_clipped = np.log(
+            np.append(self.posterior_variance[1], self.posterior_variance[1:]))
+        self.posterior_mean_coef1 = betas * np.sqrt(
+            self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+        self.posterior_mean_coef2 = (1.0 - self.alphas_cumprod_prev) * np.sqrt(
+            alphas) / (1.0 - self.alphas_cumprod)
+
+        self.l2_loss = lambda a, b: (
+            a - b
+        )**2  # th.nn.MSELoss(reduction='none')  # must be None for handling mask later on.
+
+    def q_mean_variance(self, x_start, t):
+        """
+        Get the distribution q(x_t | x_0).
+
+        :param x_start: the [N x C x ...] tensor of noiseless inputs.
+        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
+        :return: A tuple (mean, variance, log_variance), all of x_start's shape.
+        """
+        mean = (
+            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape)
+            * x_start)
+        variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t,
+                                        x_start.shape)
+        log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod,
+                                            t, x_start.shape)
+        return mean, variance, log_variance
+
+    def q_sample(self, x_start, t, noise=None):
+        """
+        Diffuse the dataset for a given number of diffusion steps.
+
+        In other words, sample from q(x_t | x_0).
+
+        :param x_start: the initial dataset batch.
+        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
+        :param noise: if specified, the split-out normal noise.
+        :return: A noisy version of x_start.
+        """
+        if noise is None:
+            noise = th.randn_like(x_start)
+        assert noise.shape == x_start.shape
+        return (
+            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape)
+            * x_start + _extract_into_tensor(
+                self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise)
+
+    def q_posterior_mean_variance(self, x_start, x_t, t):
+        """
+        Compute the mean and variance of the diffusion posterior:
+
+            q(x_{t-1} | x_t, x_0)
+
+        """
+        assert x_start.shape == x_t.shape
+        posterior_mean = (
+            _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape)
+            * x_start
+            + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape)
+            * x_t)
+        posterior_variance = _extract_into_tensor(self.posterior_variance, t,
+                                                  x_t.shape)
+        posterior_log_variance_clipped = _extract_into_tensor(
+            self.posterior_log_variance_clipped, t, x_t.shape)
+        return posterior_mean, posterior_variance, posterior_log_variance_clipped
+
+    def p_mean_variance(self,
+                        model,
+                        x,
+                        t,
+                        clip_denoised=True,
+                        denoised_fn=None,
+                        model_kwargs=None):
+        """
+        Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
+        the initial x, x_0.
+
+        :param model: the model, which takes a signal and a batch of timesteps
+                      as input.
+        :param x: the [N x C x ...] tensor at time t.
+        :param t: a 1-D Tensor of timesteps.
+        :param clip_denoised: if True, clip the denoised signal into [-1, 1].
+        :param denoised_fn: if not None, a function which applies to the
+            x_start prediction before it is used to sample. Applies before
+            clip_denoised.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :return: a dict with the following keys:
+                 - 'mean': the model mean output.
+                 - 'variance': the model variance output.
+                 - 'log_variance': the log of 'variance'.
+                 - 'pred_xstart': the prediction for x_0.
+        """
+        if model_kwargs is None:
+            model_kwargs = {}
+
+        B, C = x.shape[:2]
+        assert t.shape == (B, )
+        model_output = model(x, self._scale_timesteps(t), **model_kwargs)
+
+        if 'inpainting_mask' in model_kwargs['y'].keys(
+        ) and 'inpainted_motion' in model_kwargs['y'].keys():
+            inpainting_mask, inpainted_motion = model_kwargs['y'][
+                'inpainting_mask'], model_kwargs['y']['inpainted_motion']
+            assert self.model_mean_type == ModelMeanType.START_X, 'This feature supports only X_start pred for mow!'
+            assert model_output.shape == inpainting_mask.shape == inpainted_motion.shape
+            model_output = (model_output * ~inpainting_mask) + (
+                inpainted_motion * inpainting_mask)
+
+        if self.model_var_type in [
+                ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE
+        ]:
+            assert model_output.shape == (B, C * 2, *x.shape[2:])
+            model_output, model_var_values = th.split(model_output, C, dim=1)
+            if self.model_var_type == ModelVarType.LEARNED:
+                model_log_variance = model_var_values
+                model_variance = th.exp(model_log_variance)
+            else:
+                min_log = _extract_into_tensor(
+                    self.posterior_log_variance_clipped, t, x.shape)
+                max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
+                # The model_var_values is [-1, 1] for [min_var, max_var].
+                frac = (model_var_values + 1) / 2
+                model_log_variance = frac * max_log + (1 - frac) * min_log
+                model_variance = th.exp(model_log_variance)
+        else:
+            model_variance, model_log_variance = {
+                ModelVarType.FIXED_LARGE: (
+                    np.append(self.posterior_variance[1], self.betas[1:]),
+                    np.log(
+                        np.append(self.posterior_variance[1], self.betas[1:])),
+                ),
+                ModelVarType.FIXED_SMALL: (
+                    self.posterior_variance,
+                    self.posterior_log_variance_clipped,
+                ),
+            }[self.model_var_type]
+
+            model_variance = _extract_into_tensor(model_variance, t, x.shape)
+            model_log_variance = _extract_into_tensor(model_log_variance, t,
+                                                      x.shape)
+
+        def process_xstart(x):
+            if denoised_fn is not None:
+                x = denoised_fn(x)
+            if clip_denoised:
+                return x.clamp(-1, 1)
+            return x
+
+        if self.model_mean_type == ModelMeanType.PREVIOUS_X:
+            pred_xstart = process_xstart(
+                self._predict_xstart_from_xprev(
+                    x_t=x, t=t, xprev=model_output))
+            model_mean = model_output
+        elif self.model_mean_type in [
+                ModelMeanType.START_X, ModelMeanType.EPSILON
+        ]:  # THIS IS US!
+            if self.model_mean_type == ModelMeanType.START_X:
+                pred_xstart = process_xstart(model_output)
+            else:
+                pred_xstart = process_xstart(
+                    self._predict_xstart_from_eps(
+                        x_t=x, t=t, eps=model_output))
+            model_mean, _, _ = self.q_posterior_mean_variance(
+                x_start=pred_xstart, x_t=x, t=t)
+        else:
+            raise NotImplementedError(self.model_mean_type)
+
+        return {
+            'mean': model_mean,
+            'variance': model_variance,
+            'log_variance': model_log_variance,
+            'pred_xstart': pred_xstart,
+        }
+
+    def _predict_xstart_from_eps(self, x_t, t, eps):
+        assert x_t.shape == eps.shape
+        return (
+            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape)
+            * x_t - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t,
+                                         x_t.shape) * eps)
+
+    def _predict_xstart_from_xprev(self, x_t, t, xprev):
+        assert x_t.shape == xprev.shape
+        return (  # (xprev - coef2*x_t) / coef1
+            _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape)
+            * xprev - _extract_into_tensor(
+                self.posterior_mean_coef2 / self.posterior_mean_coef1, t,
+                x_t.shape) * x_t)
+
+    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
+        return (
+            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape)
+            * x_t - pred_xstart) / _extract_into_tensor(
+                self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
+
+    def _scale_timesteps(self, t):
+        if self.rescale_timesteps:
+            return t.float() * (1000.0 / self.num_timesteps)
+        return t
+
+    def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
+        """
+        Compute the mean for the previous step, given a function cond_fn that
+        computes the gradient of a conditional log probability with respect to
+        x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
+        condition on y.
+
+        This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
+        """
+        gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs)
+        new_mean = (
+            p_mean_var['mean'].float()
+            + p_mean_var['variance'] * gradient.float())
+        return new_mean
+
+    def condition_mean_with_grad(self,
+                                 cond_fn,
+                                 p_mean_var,
+                                 x,
+                                 t,
+                                 model_kwargs=None):
+        """
+        Compute the mean for the previous step, given a function cond_fn that
+        computes the gradient of a conditional log probability with respect to
+        x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
+        condition on y.
+
+        This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
+        """
+        gradient = cond_fn(x, t, p_mean_var, **model_kwargs)
+        new_mean = (
+            p_mean_var['mean'].float()
+            + p_mean_var['variance'] * gradient.float())
+        return new_mean
+
+    def p_sample(
+        self,
+        model,
+        x,
+        t,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        const_noise=False,
+    ):
+        """
+        Sample x_{t-1} from the model at the given timestep.
+
+        :param model: the model to sample from.
+        :param x: the current tensor at x_{t-1}.
+        :param t: the value of t, starting at 0 for the first diffusion step.
+        :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
+        :param denoised_fn: if not None, a function which applies to the
+            x_start prediction before it is used to sample.
+        :param cond_fn: if not None, this is a gradient function that acts
+                        similarly to the model.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :return: a dict containing the following keys:
+                 - 'sample': a random sample from the model.
+                 - 'pred_xstart': a prediction of x_0.
+        """
+        out = self.p_mean_variance(
+            model,
+            x,
+            t,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+        )
+        noise = th.randn_like(x)
+        if const_noise:
+            noise = noise[[0]].repeat(x.shape[0], 1, 1, 1)
+
+        nonzero_mask = ((t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
+                        )  # no noise when t == 0
+        if cond_fn is not None:
+            out['mean'] = self.condition_mean(
+                cond_fn, out, x, t, model_kwargs=model_kwargs)
+        sample = out['mean'] + nonzero_mask * th.exp(
+            0.5 * out['log_variance']) * noise
+        return {'sample': sample, 'pred_xstart': out['pred_xstart']}
+
+    def p_sample_with_grad(
+        self,
+        model,
+        x,
+        t,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+    ):
+        """
+        Sample x_{t-1} from the model at the given timestep.
+
+        :param model: the model to sample from.
+        :param x: the current tensor at x_{t-1}.
+        :param t: the value of t, starting at 0 for the first diffusion step.
+        :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
+        :param denoised_fn: if not None, a function which applies to the
+            x_start prediction before it is used to sample.
+        :param cond_fn: if not None, this is a gradient function that acts
+                        similarly to the model.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :return: a dict containing the following keys:
+                 - 'sample': a random sample from the model.
+                 - 'pred_xstart': a prediction of x_0.
+        """
+        with th.enable_grad():
+            x = x.detach().requires_grad_()
+            out = self.p_mean_variance(
+                model,
+                x,
+                t,
+                clip_denoised=clip_denoised,
+                denoised_fn=denoised_fn,
+                model_kwargs=model_kwargs,
+            )
+            noise = th.randn_like(x)
+            nonzero_mask = ((t != 0).float().view(-1,
+                                                  *([1] * (len(x.shape) - 1)))
+                            )  # no noise when t == 0
+            if cond_fn is not None:
+                out['mean'] = self.condition_mean_with_grad(
+                    cond_fn, out, x, t, model_kwargs=model_kwargs)
+        sample = out['mean'] + nonzero_mask * th.exp(
+            0.5 * out['log_variance']) * noise
+        return {'sample': sample, 'pred_xstart': out['pred_xstart'].detach()}
+
+    def p_sample_loop(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+        skip_timesteps=0,
+        init_image=None,
+        randomize_class=False,
+        cond_fn_with_grad=False,
+        dump_steps=None,
+        const_noise=False,
+    ):
+        """
+        Generate samples from the model.
+
+        :param model: the model module.
+        :param shape: the shape of the samples, (N, C, H, W).
+        :param noise: if specified, the noise from the encoder to sample.
+                      Should be of the same shape as `shape`.
+        :param clip_denoised: if True, clip x_start predictions to [-1, 1].
+        :param denoised_fn: if not None, a function which applies to the
+            x_start prediction before it is used to sample.
+        :param cond_fn: if not None, this is a gradient function that acts
+                        similarly to the model.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :param device: if specified, the device to create the samples on.
+                       If not specified, use a model parameter's device.
+        :param progress: if True, show a tqdm progress bar.
+        :param const_noise: If True, will noise all samples with the same noise throughout sampling
+        :return: a non-differentiable batch of samples.
+        """
+        final = None
+        if dump_steps is not None:
+            dump = []
+
+        for i, sample in enumerate(
+                self.p_sample_loop_progressive(
+                    model,
+                    shape,
+                    noise=noise,
+                    clip_denoised=clip_denoised,
+                    denoised_fn=denoised_fn,
+                    cond_fn=cond_fn,
+                    model_kwargs=model_kwargs,
+                    device=device,
+                    progress=progress,
+                    skip_timesteps=skip_timesteps,
+                    init_image=init_image,
+                    randomize_class=randomize_class,
+                    cond_fn_with_grad=cond_fn_with_grad,
+                    const_noise=const_noise,
+                )):
+            if dump_steps is not None and i in dump_steps:
+                dump.append(deepcopy(sample['sample']))
+            final = sample
+        if dump_steps is not None:
+            return dump
+        return final['sample']
+
+    def p_sample_loop_progressive(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+        skip_timesteps=0,
+        init_image=None,
+        randomize_class=False,
+        cond_fn_with_grad=False,
+        const_noise=False,
+    ):
+        """
+        Generate samples from the model and yield intermediate samples from
+        each timestep of diffusion.
+
+        Arguments are the same as p_sample_loop().
+        Returns a generator over dicts, where each dict is the return value of
+        p_sample().
+        """
+        if device is None:
+            device = next(model.parameters()).device
+        assert isinstance(shape, (tuple, list))
+        if noise is not None:
+            img = noise
+        else:
+            img = th.randn(*shape, device=device)
+
+        if skip_timesteps and init_image is None:
+            init_image = th.zeros_like(img)
+
+        indices = list(range(self.num_timesteps - skip_timesteps))[::-1]
+
+        if init_image is not None:
+            my_t = th.ones([shape[0]], device=device,
+                           dtype=th.long) * indices[0]
+            img = self.q_sample(init_image, my_t, img)
+
+        if progress:
+            # Lazy import so that we don't depend on tqdm.
+            from tqdm.auto import tqdm
+
+            indices = tqdm(indices)
+
+        for i in indices:
+            t = th.tensor([i] * shape[0], device=device)
+            if randomize_class and 'y' in model_kwargs:
+                model_kwargs['y'] = th.randint(
+                    low=0,
+                    high=model.num_classes,
+                    size=model_kwargs['y'].shape,
+                    device=model_kwargs['y'].device)
+            with th.no_grad():
+                sample_fn = self.p_sample_with_grad if cond_fn_with_grad else self.p_sample
+                out = sample_fn(
+                    model,
+                    img,
+                    t,
+                    clip_denoised=clip_denoised,
+                    denoised_fn=denoised_fn,
+                    cond_fn=cond_fn,
+                    model_kwargs=model_kwargs,
+                    const_noise=const_noise,
+                )
+                yield out
+                img = out['sample']
+
+
+def _extract_into_tensor(arr, timesteps, broadcast_shape):
+    """
+    Extract values from a 1-D numpy array for a batch of indices.
+
+    :param arr: the 1-D numpy array.
+    :param timesteps: a tensor of indices into the array to extract.
+    :param broadcast_shape: a larger shape of K dimensions with the batch
+                            dimension equal to the length of timesteps.
+    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
+    """
+    res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
+    while len(res.shape) < len(broadcast_shape):
+        res = res[..., None]
+    return res.expand(broadcast_shape)
diff --git a/modelscope/models/cv/motion_generation/modules/mdm.py b/modelscope/models/cv/motion_generation/modules/mdm.py
new file mode 100644
index 00000000..716acd83
--- /dev/null
+++ b/modelscope/models/cv/motion_generation/modules/mdm.py
@@ -0,0 +1,364 @@
+# This code is borrowed and modified from Human Motion Diffusion Model,
+# made publicly available under MIT license at https://github.com/GuyTevet/motion-diffusion-model
+
+import clip
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .rotation2xyz import Rotation2xyz
+
+
+class MDM(nn.Module):
+
+    def __init__(self,
+                 modeltype,
+                 njoints,
+                 nfeats,
+                 num_actions,
+                 translation,
+                 pose_rep,
+                 glob,
+                 glob_rot,
+                 latent_dim=256,
+                 ff_size=1024,
+                 num_layers=8,
+                 num_heads=4,
+                 dropout=0.1,
+                 smpl_data_path=None,
+                 ablation=None,
+                 activation='gelu',
+                 legacy=False,
+                 data_rep='rot6d',
+                 dataset='amass',
+                 clip_dim=512,
+                 arch='trans_enc',
+                 emb_trans_dec=False,
+                 clip_version=None,
+                 **kargs):
+        super().__init__()
+
+        self.legacy = legacy
+        self.modeltype = modeltype
+        self.njoints = njoints
+        self.nfeats = nfeats
+        self.num_actions = num_actions
+        self.data_rep = data_rep
+        self.dataset = dataset
+
+        self.pose_rep = pose_rep
+        self.glob = glob
+        self.glob_rot = glob_rot
+        self.translation = translation
+
+        self.latent_dim = latent_dim
+
+        self.ff_size = ff_size
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.dropout = dropout
+
+        self.ablation = ablation
+        self.activation = activation
+        self.clip_dim = clip_dim
+        self.action_emb = kargs.get('action_emb', None)
+
+        self.input_feats = self.njoints * self.nfeats
+
+        self.normalize_output = kargs.get('normalize_encoder_output', False)
+
+        self.cond_mode = kargs.get('cond_mode', 'no_cond')
+        self.cond_mask_prob = kargs.get('cond_mask_prob', 0.)
+        self.arch = arch
+        self.gru_emb_dim = self.latent_dim if self.arch == 'gru' else 0
+        self.input_process = InputProcess(self.data_rep,
+                                          self.input_feats + self.gru_emb_dim,
+                                          self.latent_dim)
+
+        self.sequence_pos_encoder = PositionalEncoding(self.latent_dim,
+                                                       self.dropout)
+        self.emb_trans_dec = emb_trans_dec
+
+        if self.arch == 'trans_enc':
+            print('TRANS_ENC init')
+            seqTransEncoderLayer = nn.TransformerEncoderLayer(
+                d_model=self.latent_dim,
+                nhead=self.num_heads,
+                dim_feedforward=self.ff_size,
+                dropout=self.dropout,
+                activation=self.activation)
+
+            self.seqTransEncoder = nn.TransformerEncoder(
+                seqTransEncoderLayer, num_layers=self.num_layers)
+        elif self.arch == 'trans_dec':
+            print('TRANS_DEC init')
+            seqTransDecoderLayer = nn.TransformerDecoderLayer(
+                d_model=self.latent_dim,
+                nhead=self.num_heads,
+                dim_feedforward=self.ff_size,
+                dropout=self.dropout,
+                activation=activation)
+            self.seqTransDecoder = nn.TransformerDecoder(
+                seqTransDecoderLayer, num_layers=self.num_layers)
+        elif self.arch == 'gru':
+            print('GRU init')
+            self.gru = nn.GRU(
+                self.latent_dim,
+                self.latent_dim,
+                num_layers=self.num_layers,
+                batch_first=True)
+        else:
+            raise ValueError(
+                'Please choose correct architecture [trans_enc, trans_dec, gru]'
+            )
+
+        self.embed_timestep = TimestepEmbedder(self.latent_dim,
+                                               self.sequence_pos_encoder)
+
+        if self.cond_mode != 'no_cond':
+            if 'text' in self.cond_mode:
+                self.embed_text = nn.Linear(self.clip_dim, self.latent_dim)
+                print('EMBED TEXT')
+                print('Loading CLIP...')
+                self.clip_version = clip_version
+                self.clip_model = self.load_and_freeze_clip(clip_version)
+            if 'action' in self.cond_mode:
+                self.embed_action = EmbedAction(self.num_actions,
+                                                self.latent_dim)
+                print('EMBED ACTION')
+
+        self.output_process = OutputProcess(self.data_rep, self.input_feats,
+                                            self.latent_dim, self.njoints,
+                                            self.nfeats)
+
+        self.rot2xyz = Rotation2xyz(
+            device='cpu', smpl_data_path=smpl_data_path, dataset=self.dataset)
+
+    def parameters_wo_clip(self):
+        return [
+            p for name, p in self.named_parameters()
+            if not name.startswith('clip_model.')
+        ]
+
+    def load_and_freeze_clip(self, clip_version):
+        clip_model, clip_preprocess = clip.load(
+            clip_version, device='cpu',
+            jit=False)  # Must set jit=False for training
+        # clip.model.convert_weights(
+        #     clip_model)  # Actually this line is unnecessary since clip by default already on float16
+
+        # Freeze CLIP weights
+        clip_model.eval()
+        for p in clip_model.parameters():
+            p.requires_grad = False
+
+        return clip_model
+
+    def mask_cond(self, cond, force_mask=False):
+        bs, d = cond.shape
+        if force_mask:
+            return torch.zeros_like(cond)
+        elif self.training and self.cond_mask_prob > 0.:
+            mask = torch.bernoulli(
+                torch.ones(bs, device=cond.device) * self.cond_mask_prob).view(
+                    bs, 1)  # 1-> use null_cond, 0-> use real cond
+            return cond * (1. - mask)
+        else:
+            return cond
+
+    def encode_text(self, raw_text):
+        device = next(self.parameters()).device
+        max_text_len = 20 if self.dataset in [
+            'humanml', 'kit'
+        ] else None  # Specific hardcoding for humanml dataset
+        if max_text_len is not None:
+            default_context_length = 77
+            context_length = max_text_len + 2  # start_token + 20 + end_token
+            assert context_length < default_context_length
+            texts = clip.tokenize(
+                raw_text, context_length=context_length,
+                truncate=True).to(device)
+            zero_pad = torch.zeros(
+                [texts.shape[0], default_context_length - context_length],
+                dtype=texts.dtype,
+                device=texts.device)
+            texts = torch.cat([texts, zero_pad], dim=1)
+        else:
+            texts = clip.tokenize(raw_text, truncate=True).to(device)
+        return self.clip_model.encode_text(texts).float()
+
+    def forward(self, x, timesteps, y=None):
+        """
+        x: [batch_size, njoints, nfeats, max_frames], denoted x_t in the paper
+        timesteps: [batch_size] (int)
+        """
+        bs, njoints, nfeats, nframes = x.shape
+        emb = self.embed_timestep(timesteps)  # [1, bs, d]
+
+        force_mask = y.get('uncond', False)
+        if 'text' in self.cond_mode:
+            enc_text = self.encode_text(y['text'])
+            emb += self.embed_text(
+                self.mask_cond(enc_text, force_mask=force_mask))
+        if 'action' in self.cond_mode:
+            action_emb = self.embed_action(y['action'])
+            emb += self.mask_cond(action_emb, force_mask=force_mask)
+
+        if self.arch == 'gru':
+            x_reshaped = x.reshape(bs, njoints * nfeats, 1, nframes)
+            emb_gru = emb.repeat(nframes, 1, 1)  # [#frames, bs, d]
+            emb_gru = emb_gru.permute(1, 2, 0)  # [bs, d, #frames]
+            emb_gru = emb_gru.reshape(bs, self.latent_dim, 1,
+                                      nframes)  # [bs, d, 1, #frames]
+            x = torch.cat((x_reshaped, emb_gru),
+                          axis=1)  # [bs, d+joints*feat, 1, #frames]
+
+        x = self.input_process(x)
+
+        if self.arch == 'trans_enc':
+            # adding the timestep embed
+            xseq = torch.cat((emb, x), axis=0)  # [seqlen+1, bs, d]
+            xseq = self.sequence_pos_encoder(xseq)  # [seqlen+1, bs, d]
+            output = self.seqTransEncoder(xseq)[
+                1:]  # , src_key_padding_mask=~maskseq)  # [seqlen, bs, d]
+
+        elif self.arch == 'trans_dec':
+            if self.emb_trans_dec:
+                xseq = torch.cat((emb, x), axis=0)
+            else:
+                xseq = x
+            xseq = self.sequence_pos_encoder(xseq)  # [seqlen+1, bs, d]
+            if self.emb_trans_dec:
+                output = self.seqTransDecoder(
+                    tgt=xseq, memory=emb
+                )[1:]  # [seqlen, bs, d] # FIXME - maybe add a causal mask
+            else:
+                output = self.seqTransDecoder(tgt=xseq, memory=emb)
+        elif self.arch == 'gru':
+            xseq = x
+            xseq = self.sequence_pos_encoder(xseq)  # [seqlen, bs, d]
+            output, _ = self.gru(xseq)
+
+        output = self.output_process(output)  # [bs, njoints, nfeats, nframes]
+        return output
+
+    def _apply(self, fn):
+        super()._apply(fn)
+        self.rot2xyz.smpl_model._apply(fn)
+
+    def train(self, *args, **kwargs):
+        super().train(*args, **kwargs)
+        self.rot2xyz.smpl_model.train(*args, **kwargs)
+
+
+class PositionalEncoding(nn.Module):
+
+    def __init__(self, d_model, dropout=0.1, max_len=5000):
+        super(PositionalEncoding, self).__init__()
+        self.dropout = nn.Dropout(p=dropout)
+
+        pe = torch.zeros(max_len, d_model)
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0).transpose(0, 1)
+
+        self.register_buffer('pe', pe)
+
+    def forward(self, x):
+        # not used in the final model
+        x = x + self.pe[:x.shape[0], :]
+        return self.dropout(x)
+
+
+class TimestepEmbedder(nn.Module):
+
+    def __init__(self, latent_dim, sequence_pos_encoder):
+        super().__init__()
+        self.latent_dim = latent_dim
+        self.sequence_pos_encoder = sequence_pos_encoder
+
+        time_embed_dim = self.latent_dim
+        self.time_embed = nn.Sequential(
+            nn.Linear(self.latent_dim, time_embed_dim),
+            nn.SiLU(),
+            nn.Linear(time_embed_dim, time_embed_dim),
+        )
+
+    def forward(self, timesteps):
+        return self.time_embed(
+            self.sequence_pos_encoder.pe[timesteps]).permute(1, 0, 2)
+
+
+class InputProcess(nn.Module):
+
+    def __init__(self, data_rep, input_feats, latent_dim):
+        super().__init__()
+        self.data_rep = data_rep
+        self.input_feats = input_feats
+        self.latent_dim = latent_dim
+        self.poseEmbedding = nn.Linear(self.input_feats, self.latent_dim)
+        if self.data_rep == 'rot_vel':
+            self.velEmbedding = nn.Linear(self.input_feats, self.latent_dim)
+
+    def forward(self, x):
+        bs, njoints, nfeats, nframes = x.shape
+        x = x.permute((3, 0, 1, 2)).reshape(nframes, bs, njoints * nfeats)
+
+        if self.data_rep in ['rot6d', 'xyz', 'hml_vec']:
+            x = self.poseEmbedding(x)  # [seqlen, bs, d]
+            return x
+        elif self.data_rep == 'rot_vel':
+            first_pose = x[[0]]  # [1, bs, 150]
+            first_pose = self.poseEmbedding(first_pose)  # [1, bs, d]
+            vel = x[1:]  # [seqlen-1, bs, 150]
+            vel = self.velEmbedding(vel)  # [seqlen-1, bs, d]
+            return torch.cat((first_pose, vel), axis=0)  # [seqlen, bs, d]
+        else:
+            raise ValueError
+
+
+class OutputProcess(nn.Module):
+
+    def __init__(self, data_rep, input_feats, latent_dim, njoints, nfeats):
+        super().__init__()
+        self.data_rep = data_rep
+        self.input_feats = input_feats
+        self.latent_dim = latent_dim
+        self.njoints = njoints
+        self.nfeats = nfeats
+        self.poseFinal = nn.Linear(self.latent_dim, self.input_feats)
+        if self.data_rep == 'rot_vel':
+            self.velFinal = nn.Linear(self.latent_dim, self.input_feats)
+
+    def forward(self, output):
+        nframes, bs, d = output.shape
+        if self.data_rep in ['rot6d', 'xyz', 'hml_vec']:
+            output = self.poseFinal(output)  # [seqlen, bs, 150]
+        elif self.data_rep == 'rot_vel':
+            first_pose = output[[0]]  # [1, bs, d]
+            first_pose = self.poseFinal(first_pose)  # [1, bs, 150]
+            vel = output[1:]  # [seqlen-1, bs, d]
+            vel = self.velFinal(vel)  # [seqlen-1, bs, 150]
+            output = torch.cat((first_pose, vel), axis=0)  # [seqlen, bs, 150]
+        else:
+            raise ValueError
+        output = output.reshape(nframes, bs, self.njoints, self.nfeats)
+        output = output.permute(1, 2, 3, 0)  # [bs, njoints, nfeats, nframes]
+        return output
+
+
+class EmbedAction(nn.Module):
+
+    def __init__(self, num_actions, latent_dim):
+        super().__init__()
+        self.action_embedding = nn.Parameter(
+            torch.randn(num_actions, latent_dim))
+
+    def forward(self, input):
+        idx = input[:, 0].to(torch.long)  # an index array must be long
+        output = self.action_embedding[idx]
+        return output
diff --git a/modelscope/models/cv/motion_generation/modules/respace.py b/modelscope/models/cv/motion_generation/modules/respace.py
new file mode 100644
index 00000000..45c37ee6
--- /dev/null
+++ b/modelscope/models/cv/motion_generation/modules/respace.py
@@ -0,0 +1,132 @@
+# This code is borrowed and modified from Human Motion Diffusion Model,
+# made publicly available under MIT license at https://github.com/GuyTevet/motion-diffusion-model
+
+import numpy as np
+import torch as th
+
+from .gaussian_diffusion import GaussianDiffusion
+
+
+def space_timesteps(num_timesteps, section_counts):
+    """
+    Create a list of timesteps to use from an original diffusion process,
+    given the number of timesteps we want to take from equally-sized portions
+    of the original process.
+
+    For example, if there's 300 timesteps and the section counts are [10,15,20]
+    then the first 100 timesteps are strided to be 10 timesteps, the second 100
+    are strided to be 15 timesteps, and the final 100 are strided to be 20.
+
+    If the stride is a string starting with "ddim", then the fixed striding
+    from the DDIM paper is used, and only one section is allowed.
+
+    :param num_timesteps: the number of diffusion steps in the original
+                          process to divide up.
+    :param section_counts: either a list of numbers, or a string containing
+                           comma-separated numbers, indicating the step count
+                           per section. As a special case, use "ddimN" where N
+                           is a number of steps to use the striding from the
+                           DDIM paper.
+    :return: a set of diffusion steps from the original process to use.
+    """
+    if isinstance(section_counts, str):
+        if section_counts.startswith('ddim'):
+            desired_count = int(section_counts[len('ddim'):])
+            for i in range(1, num_timesteps):
+                if len(range(0, num_timesteps, i)) == desired_count:
+                    return set(range(0, num_timesteps, i))
+            raise ValueError(
+                f'cannot create exactly {num_timesteps} steps with an integer stride'
+            )
+        section_counts = [int(x) for x in section_counts.split(',')]
+    size_per = num_timesteps // len(section_counts)
+    extra = num_timesteps % len(section_counts)
+    start_idx = 0
+    all_steps = []
+    for i, section_count in enumerate(section_counts):
+        size = size_per + (1 if i < extra else 0)
+        if size < section_count:
+            raise ValueError(
+                f'cannot divide section of {size} steps into {section_count}')
+        if section_count <= 1:
+            frac_stride = 1
+        else:
+            frac_stride = (size - 1) / (section_count - 1)
+        cur_idx = 0.0
+        taken_steps = []
+        for _ in range(section_count):
+            taken_steps.append(start_idx + round(cur_idx))
+            cur_idx += frac_stride
+        all_steps += taken_steps
+        start_idx += size
+    return set(all_steps)
+
+
+class SpacedDiffusion(GaussianDiffusion):
+    """
+    A diffusion process which can skip steps in a base diffusion process.
+
+    :param use_timesteps: a collection (sequence or set) of timesteps from the
+                          original diffusion process to retain.
+    :param kwargs: the kwargs to create the base diffusion process.
+    """
+
+    def __init__(self, use_timesteps, **kwargs):
+        self.use_timesteps = set(use_timesteps)
+        self.timestep_map = []
+        self.original_num_steps = len(kwargs['betas'])
+
+        base_diffusion = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
+        last_alpha_cumprod = 1.0
+        new_betas = []
+        for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
+            if i in self.use_timesteps:
+                new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
+                last_alpha_cumprod = alpha_cumprod
+                self.timestep_map.append(i)
+        kwargs['betas'] = np.array(new_betas)
+        super().__init__(**kwargs)
+
+    def p_mean_variance(self, model, *args, **kwargs):  # pylint: disable=signature-differs
+        return super().p_mean_variance(
+            self._wrap_model(model), *args, **kwargs)
+
+    def training_losses(self, model, *args, **kwargs):  # pylint: disable=signature-differs
+        return super().training_losses(
+            self._wrap_model(model), *args, **kwargs)
+
+    def condition_mean(self, cond_fn, *args, **kwargs):
+        return super().condition_mean(
+            self._wrap_model(cond_fn), *args, **kwargs)
+
+    def condition_score(self, cond_fn, *args, **kwargs):
+        return super().condition_score(
+            self._wrap_model(cond_fn), *args, **kwargs)
+
+    def _wrap_model(self, model):
+        if isinstance(model, _WrappedModel):
+            return model
+        return _WrappedModel(model, self.timestep_map, self.rescale_timesteps,
+                             self.original_num_steps)
+
+    def _scale_timesteps(self, t):
+        # Scaling is done by the wrapped model.
+        return t
+
+
+class _WrappedModel:
+
+    def __init__(self, model, timestep_map, rescale_timesteps,
+                 original_num_steps):
+        self.model = model
+        self.timestep_map = timestep_map
+        self.rescale_timesteps = rescale_timesteps
+        self.original_num_steps = original_num_steps
+
+    def __call__(self, x, ts, **kwargs):
+        map_tensor = th.tensor(
+            self.timestep_map, device=ts.device, dtype=ts.dtype)
+        new_ts = map_tensor[ts]
+        if self.rescale_timesteps:
+            new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
+        return self.model(x, new_ts, **kwargs)
diff --git a/modelscope/models/cv/motion_generation/modules/rotation2xyz.py b/modelscope/models/cv/motion_generation/modules/rotation2xyz.py
new file mode 100644
index 00000000..0b00015e
--- /dev/null
+++ b/modelscope/models/cv/motion_generation/modules/rotation2xyz.py
@@ -0,0 +1,112 @@
+# This code is borrowed and modified from Human Motion Diffusion Model,
+# made publicly available under MIT license at https://github.com/GuyTevet/motion-diffusion-model
+
+import torch
+
+from modelscope.utils.cv.motion_utils import rotation_conversions as geometry
+from .smpl import JOINTSTYPE_ROOT, SMPL
+
+JOINTSTYPES = ['a2m', 'a2mpl', 'smpl', 'vibe', 'vertices']
+
+
+class Rotation2xyz:
+
+    def __init__(self, device, smpl_data_path, dataset='amass'):
+        self.device = device
+        self.dataset = dataset
+        self.smpl_model = SMPL(smpl_data_path).eval().to(device)
+
+    def __call__(self,
+                 x,
+                 mask,
+                 pose_rep,
+                 translation,
+                 glob,
+                 jointstype,
+                 vertstrans,
+                 betas=None,
+                 beta=0,
+                 glob_rot=None,
+                 get_rotations_back=False,
+                 **kwargs):
+        if pose_rep == 'xyz':
+            return x
+
+        if mask is None:
+            mask = torch.ones((x.shape[0], x.shape[-1]),
+                              dtype=bool,
+                              device=x.device)
+
+        if not glob and glob_rot is None:
+            raise TypeError(
+                'You must specify global rotation if glob is False')
+
+        if jointstype not in JOINTSTYPES:
+            raise NotImplementedError('This jointstype is not implemented.')
+
+        if translation:
+            x_translations = x[:, -1, :3]
+            x_rotations = x[:, :-1]
+        else:
+            x_rotations = x
+
+        x_rotations = x_rotations.permute(0, 3, 1, 2)
+        nsamples, time, njoints, feats = x_rotations.shape
+
+        # Compute rotations (convert only masked sequences output)
+        if pose_rep == 'rotvec':
+            rotations = geometry.axis_angle_to_matrix(x_rotations[mask])
+        elif pose_rep == 'rotmat':
+            rotations = x_rotations[mask].view(-1, njoints, 3, 3)
+        elif pose_rep == 'rotquat':
+            rotations = geometry.quaternion_to_matrix(x_rotations[mask])
+        elif pose_rep == 'rot6d':
+            rotations = geometry.rotation_6d_to_matrix(x_rotations[mask])
+        else:
+            raise NotImplementedError('No geometry for this one.')
+
+        if not glob:
+            global_orient = torch.tensor(glob_rot, device=x.device)
+            global_orient = geometry.axis_angle_to_matrix(global_orient).view(
+                1, 1, 3, 3)
+            global_orient = global_orient.repeat(len(rotations), 1, 1, 1)
+        else:
+            global_orient = rotations[:, 0]
+            rotations = rotations[:, 1:]
+
+        if betas is None:
+            betas = torch.zeros(
+                [rotations.shape[0], self.smpl_model.num_betas],
+                dtype=rotations.dtype,
+                device=rotations.device)
+            betas[:, 1] = beta
+            # import ipdb; ipdb.set_trace()
+        out = self.smpl_model(
+            body_pose=rotations, global_orient=global_orient, betas=betas)
+
+        # get the desirable joints
+        joints = out[jointstype]
+
+        x_xyz = torch.empty(
+            nsamples, time, joints.shape[1], 3, device=x.device, dtype=x.dtype)
+        x_xyz[~mask] = 0
+        x_xyz[mask] = joints
+
+        x_xyz = x_xyz.permute(0, 2, 3, 1).contiguous()
+
+        # the first translation root at the origin on the prediction
+        if jointstype != 'vertices':
+            rootindex = JOINTSTYPE_ROOT[jointstype]
+            x_xyz = x_xyz - x_xyz[:, [rootindex], :, :]
+
+        if translation and vertstrans:
+            # the first translation root at the origin
+            x_translations = x_translations - x_translations[:, :, [0]]
+
+            # add the translation to all the joints
+            x_xyz = x_xyz + x_translations[:, None, :, :]
+
+        if get_rotations_back:
+            return x_xyz, rotations, global_orient
+        else:
+            return x_xyz
diff --git a/modelscope/models/cv/motion_generation/modules/smpl.py b/modelscope/models/cv/motion_generation/modules/smpl.py
new file mode 100644
index 00000000..60b027de
--- /dev/null
+++ b/modelscope/models/cv/motion_generation/modules/smpl.py
@@ -0,0 +1,117 @@
+# This code is borrowed and modified from Human Motion Diffusion Model,
+# made publicly available under MIT license at https://github.com/GuyTevet/motion-diffusion-model
+
+import contextlib
+import os.path as osp
+
+import numpy as np
+import torch
+from smplx import SMPLLayer as _SMPLLayer
+from smplx.lbs import vertices2joints
+
+action2motion_joints = [
+    8, 1, 2, 3, 4, 5, 6, 7, 0, 9, 10, 11, 12, 13, 14, 21, 24, 38
+]
+
+JOINTSTYPE_ROOT = {
+    'a2m': 0,  # action2motion
+    'smpl': 0,
+    'a2mpl': 0,  # set(smpl, a2m)
+    'vibe': 8
+}  # 0 is the 8 position: OP MidHip below
+
+JOINT_MAP = {
+    'OP Nose': 24,
+    'OP Neck': 12,
+    'OP RShoulder': 17,
+    'OP RElbow': 19,
+    'OP RWrist': 21,
+    'OP LShoulder': 16,
+    'OP LElbow': 18,
+    'OP LWrist': 20,
+    'OP MidHip': 0,
+    'OP RHip': 2,
+    'OP RKnee': 5,
+    'OP RAnkle': 8,
+    'OP LHip': 1,
+    'OP LKnee': 4,
+    'OP LAnkle': 7,
+    'OP REye': 25,
+    'OP LEye': 26,
+    'OP REar': 27,
+    'OP LEar': 28,
+    'OP LBigToe': 29,
+    'OP LSmallToe': 30,
+    'OP LHeel': 31,
+    'OP RBigToe': 32,
+    'OP RSmallToe': 33,
+    'OP RHeel': 34,
+    'Right Ankle': 8,
+    'Right Knee': 5,
+    'Right Hip': 45,
+    'Left Hip': 46,
+    'Left Knee': 4,
+    'Left Ankle': 7,
+    'Right Wrist': 21,
+    'Right Elbow': 19,
+    'Right Shoulder': 17,
+    'Left Shoulder': 16,
+    'Left Elbow': 18,
+    'Left Wrist': 20,
+    'Neck (LSP)': 47,
+    'Top of Head (LSP)': 48,
+    'Pelvis (MPII)': 49,
+    'Thorax (MPII)': 50,
+    'Spine (H36M)': 51,
+    'Jaw (H36M)': 52,
+    'Head (H36M)': 53,
+    'Nose': 24,
+    'Left Eye': 26,
+    'Right Eye': 25,
+    'Left Ear': 28,
+    'Right Ear': 27
+}
+
+JOINT_NAMES = list(JOINT_MAP.keys())
+
+
+class SMPL(_SMPLLayer):
+    """ Extension of the official SMPL implementation to support more joints """
+
+    def __init__(self, smpl_data_path, **kwargs):
+        kwargs['model_path'] = osp.join(smpl_data_path, 'SMPL_NEUTRAL.pkl')
+
+        # remove the verbosity for the 10-shapes beta parameters
+        with contextlib.redirect_stdout(None):
+            super(SMPL, self).__init__(**kwargs)
+
+        J_regressor_extra = np.load(
+            osp.join(smpl_data_path, 'J_regressor_extra.npy'))
+        self.register_buffer(
+            'J_regressor_extra',
+            torch.tensor(J_regressor_extra, dtype=torch.float32))
+        vibe_indexes = np.array([JOINT_MAP[i] for i in JOINT_NAMES])
+        a2m_indexes = vibe_indexes[action2motion_joints]
+        smpl_indexes = np.arange(24)
+        a2mpl_indexes = np.unique(np.r_[smpl_indexes, a2m_indexes])
+
+        self.maps = {
+            'vibe': vibe_indexes,
+            'a2m': a2m_indexes,
+            'smpl': smpl_indexes,
+            'a2mpl': a2mpl_indexes
+        }
+
+    def forward(self, *args, **kwargs):
+        smpl_output = super(SMPL, self).forward(*args, **kwargs)
+
+        extra_joints = vertices2joints(self.J_regressor_extra,
+                                       smpl_output.vertices)
+        all_joints = torch.cat([smpl_output.joints, extra_joints], dim=1)
+
+        output = {'vertices': smpl_output.vertices}
+
+        for joinstype, indexes in self.maps.items():
+            output[joinstype] = all_joints[:, indexes]
+
+        return output
diff --git a/modelscope/models/cv/nerf_recon_acc/__init__.py b/modelscope/models/cv/nerf_recon_acc/__init__.py
new file mode 100644
index 00000000..95ff8357
--- /dev/null
+++ b/modelscope/models/cv/nerf_recon_acc/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .nerf_recon_acc import NeRFReconAcc
+    from .nerf_preprocess import NeRFReconPreprocessor
+
+else:
+    _import_structure = {'nerf_recon_acc': ['NeRFReconAcc']}
+    _import_structure = {'nerf_preprocess': ['NeRFReconPreprocessor']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/nerf_recon_acc/dataloader/__init__.py b/modelscope/models/cv/nerf_recon_acc/dataloader/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/nerf_recon_acc/dataloader/nerf_dataset.py b/modelscope/models/cv/nerf_recon_acc/dataloader/nerf_dataset.py
new file mode 100644
index 00000000..8ff14774
--- /dev/null
+++ b/modelscope/models/cv/nerf_recon_acc/dataloader/nerf_dataset.py
@@ -0,0 +1,518 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import math
+import os
+
+import json
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torchvision.transforms.functional as TF
+from PIL import Image
+from torch.utils.data import Dataset
+
+from .read_write_model import (read_cameras_binary, read_images_binary,
+                               read_points3D_binary)
+
+
+def get_rays(directions, c2w, keepdim=False):
+    assert directions.shape[-1] == 3
+
+    if directions.ndim == 2:
+        assert c2w.ndim == 3
+        rays_d = (directions[:, None, :] * c2w[:, :3, :3]).sum(-1)
+        rays_o = c2w[:, :, 3].expand(rays_d.shape)
+    elif directions.ndim == 3:
+        if c2w.ndim == 2:
+            rays_d = (directions[:, :, None, :]
+                      * c2w[None, None, :3, :3]).sum(-1)
+            rays_o = c2w[None, None, :, 3].expand(rays_d.shape)
+        elif c2w.ndim == 3:
+            rays_d = (directions[None, :, :, None, :]
+                      * c2w[:, None, None, :3, :3]).sum(-1)
+            rays_o = c2w[:, None, None, :, 3].expand(rays_d.shape)
+
+    if not keepdim:
+        rays_o, rays_d = rays_o.reshape(-1, 3), rays_d.reshape(-1, 3)
+
+    return rays_o, rays_d
+
+
+def get_ray_directions(W, H, fx, fy, cx, cy, use_pixel_centers=True):
+    pixel_center = 0.5 if use_pixel_centers else 0
+    i, j = np.meshgrid(
+        np.arange(W, dtype=np.float32) + pixel_center,
+        np.arange(H, dtype=np.float32) + pixel_center,
+        indexing='xy')
+    i, j = torch.from_numpy(i), torch.from_numpy(j)
+
+    directions = torch.stack(
+        [(i - cx) / fx, -(j - cy) / fy, -torch.ones_like(i)], -1)  # (H, W, 3)
+
+    return directions
+
+
+def get_center(pts):
+    center = pts.mean(0)
+    dis = (pts - center[None, :]).norm(p=2, dim=-1)
+    mean, std = dis.mean(), dis.std()
+    q25, q75 = torch.quantile(dis, 0.25), torch.quantile(dis, 0.75)
+    valid = (dis > mean - 1.5 * std) & (dis < mean + 1.5 * std) & (
+        dis > mean - (q75 - q25) * 1.5) & (
+            dis < mean + (q75 - q25) * 1.5)
+    pts = pts[valid]
+    center = pts.mean(0)
+    return center, pts
+
+
+def normalize_poses(poses, pts):
+    center, pts = get_center(pts)
+
+    z = F.normalize((poses[..., 3] - center).mean(0), dim=0)
+    y_ = torch.as_tensor([z[1], -z[0], 0.])
+    x = F.normalize(y_.cross(z), dim=0)
+    y = z.cross(x)
+
+    Rc = torch.stack([x, y, z], dim=1)
+    tc = center.reshape(3, 1)
+
+    R, t = Rc.T, -Rc.T @ tc
+
+    pose_last = torch.as_tensor([[[0., 0., 0.,
+                                   1.]]]).expand(poses.shape[0], -1, -1)
+    poses_homo = torch.cat([poses, pose_last], dim=1)
+    inv_trans = torch.cat(
+        [torch.cat([R, t], dim=1),
+         torch.as_tensor([[0., 0., 0., 1.]])], dim=0)
+
+    poses_norm = (inv_trans @ poses_homo)[:, :3]  # (N_images, 4, 4)
+    scale = poses_norm[..., 3].norm(p=2, dim=-1).min()
+    poses_norm[..., 3] /= scale
+
+    pts = (inv_trans @ torch.cat([pts, torch.ones_like(pts[:, 0:1])],
+                                 dim=-1)[..., None])[:, :3, 0]
+    pts = pts / scale
+
+    return poses_norm, pts
+
+
+def create_spheric_poses(cameras, n_steps=120):
+    center = torch.as_tensor([0., 0., 0.],
+                             dtype=cameras.dtype,
+                             device=cameras.device)
+    mean_d = (cameras - center[None, :]).norm(p=2, dim=-1).mean()
+    mean_h = cameras[:, 2].mean()
+    r = (mean_d**2 - mean_h**2).sqrt()
+    up = torch.as_tensor([0., 0., 1.],
+                         dtype=center.dtype,
+                         device=center.device)
+
+    all_c2w = []
+    for theta in torch.linspace(0, 2 * math.pi, n_steps):
+        cam_pos = torch.stack([r * theta.cos(), r * theta.sin(), mean_h])
+        h = F.normalize(center - cam_pos, p=2, dim=0)
+        s = F.normalize(h.cross(up), p=2, dim=0)
+        u = F.normalize(s.cross(h), p=2, dim=0)
+        concat = torch.stack([s, u, -h], dim=1)
+        c2w = torch.cat([concat, cam_pos[:, None]], axis=1)
+        all_c2w.append(c2w)
+
+    all_c2w = torch.stack(all_c2w, dim=0)
+
+    return all_c2w
+
+
+def to4x4(pose):
+    constants = torch.zeros_like(pose[..., :1, :], device=pose.device)
+    constants[..., :, 3] = 1
+    return torch.cat([pose, constants], dim=-2)
+
+
+def get_spiral_path(cameras,
+                    fx,
+                    fy,
+                    n_steps=120,
+                    radius=0.1,
+                    rots=2,
+                    zrate=0.5):
+    up = cameras[0, :3, 2]
+    fx = torch.tensor(fx, dtype=torch.float32)
+    fy = torch.tensor(fy, dtype=torch.float32)
+    focal = torch.min(fx, fy)
+    target = torch.tensor(
+        [0, 0, -focal],
+        device=cameras.device)  # camera looking in -z direction
+    rad = torch.tensor([radius] * 3, device=cameras.device)
+    c2w = cameras[0]
+    c2wh_global = to4x4(c2w)
+
+    local_c2whs = []
+    for theta in torch.linspace(0.0, 2.0 * torch.pi * rots, n_steps + 1)[:-1]:
+        theta_list = [
+            torch.cos(theta), -torch.sin(theta), -torch.sin(theta * zrate)
+        ]
+        center = (torch.tensor(theta_list, device=cameras.device) * rad)
+        lookat = center - target
+        vec2 = F.normalize(lookat, p=2, dim=0)
+        vec1_avg = F.normalize(up, p=2, dim=0)
+        vec0 = F.normalize(torch.cross(vec1_avg, vec2), p=2, dim=0)
+        vec1 = F.normalize(torch.cross(vec2, vec0), p=2, dim=0)
+        c2w = torch.stack([vec0, vec1, vec2, center], 1)
+        c2wh = to4x4(c2w)
+        local_c2whs.append(c2wh)
+
+    new_c2ws = []
+    for local_c2wh in local_c2whs:
+        c2wh = torch.matmul(c2wh_global, local_c2wh)
+        new_c2ws.append(c2wh[:3, :4])
+    new_c2ws = torch.stack(new_c2ws, dim=0)
+    return new_c2ws
+
+
+class BlenderDataset(Dataset):
+    """Single subject data loader for training and evaluation."""
+
+    def __init__(
+        self,
+        root_fp,
+        split,
+        img_wh=(800, 800),
+        max_size=None,
+        num_rays=None,
+        color_bkgd_aug='white',
+        near=2.0,
+        far=6.0,
+        batch_over_images=True,
+    ):
+        super().__init__()
+        self.root_fp = root_fp
+        self.split = split
+        self.max_size = max_size
+        self.num_rays = num_rays
+        self.near = near
+        self.far = far
+        self.training = (num_rays is not None) and (split == 'train')
+        self.color_bkgd_aug = color_bkgd_aug
+        self.batch_over_images = batch_over_images
+
+        with open(
+                os.path.join(self.root_fp, f'transforms_{self.split}.json'),
+                'r') as f:
+            meta = json.load(f)
+
+        if 'w' in meta and 'h' in meta:
+            W, H = int(meta['w']), int(meta['h'])
+        else:
+            W, H = img_wh
+
+        self.w, self.h = W, H
+        self.image_wh = (self.w, self.h)
+        self.focal = 0.5 * self.w / math.tan(0.5 * meta['camera_angle_x'])
+        self.directions = get_ray_directions(self.w, self.h, self.focal,
+                                             self.focal, self.w // 2,
+                                             self.h // 2).cuda()
+
+        self.all_c2w, self.all_images, self.all_fg_masks = [], [], []
+
+        for i, frame in enumerate(meta['frames']):
+            c2w = torch.from_numpy(np.array(frame['transform_matrix'])[:3, :4])
+            self.all_c2w.append(c2w)
+
+            img_path = os.path.join(self.root_fp, f"{frame['file_path']}.png")
+            img = Image.open(img_path)
+            img = TF.to_tensor(img).permute(1, 2, 0)  # (4, h, w) => (h, w, 4)
+
+            self.all_fg_masks.append(img[..., -1])  # (h, w)
+            self.all_images.append(img[..., :3])
+
+        self.all_c2w, self.all_images, self.all_fg_masks = \
+            torch.stack(self.all_c2w, dim=0).float().cuda(), \
+            torch.stack(self.all_images, dim=0).float().cuda(), \
+            torch.stack(self.all_fg_masks, dim=0).float().cuda()
+
+    def __len__(self):
+        return len(self.all_images)
+
+    @torch.no_grad()
+    def __getitem__(self, index):
+        data = self.fetch_data(index)
+        return data
+
+    def update_num_rays(self, num_rays):
+        self.num_rays = num_rays
+
+    def fetch_data(self, index):
+        """Fetch the data (it maybe cached for multiple batches)."""
+        num_rays = self.num_rays
+        if self.training:
+            if self.batch_over_images:
+                index = torch.randint(
+                    0,
+                    len(self.all_images),
+                    size=(num_rays, ),
+                    device=self.all_images.device)
+
+            else:
+                index = torch.randint(
+                    0,
+                    len(self.all_images),
+                    size=(1, ),
+                    device=self.all_images.device)
+
+            x = torch.randint(
+                0, self.w, size=(num_rays, ), device=self.all_images.device)
+            y = torch.randint(
+                0, self.h, size=(num_rays, ), device=self.all_images.device)
+            c2w = self.all_c2w[index]
+            directions = self.directions[y, x]
+            rays_o, rays_d = get_rays(directions, c2w)
+            rgb = self.all_images[index, y, x].view(-1,
+                                                    self.all_images.shape[-1])
+            fg_mask = self.all_fg_masks[index, y, x].view(-1)
+
+        else:
+            c2w = self.all_c2w[index]
+            directions = self.directions
+            rays_o, rays_d = get_rays(directions, c2w)
+            rgb = self.all_images[index].view(-1, self.all_images.shape[-1])
+            fg_mask = self.all_fg_masks[index].view(-1)
+
+        rays = torch.cat([rays_o, rays_d], dim=-1)
+
+        if self.training:
+            if self.color_bkgd_aug == 'random':
+                color_bkgd = torch.rand(3, device=self.all_images.device)
+            elif self.color_bkgd_aug == 'white':
+                color_bkgd = torch.ones(3, device=self.all_images.device)
+            elif self.color_bkgd_aug == 'black':
+                color_bkgd = torch.zeros(3, device=self.all_images.device)
+        else:
+            # just use white during inference
+            color_bkgd = torch.ones(3, device=self.all_images.device)
+
+        rgb = rgb * fg_mask[..., None] + color_bkgd * (1 - fg_mask[..., None])
+
+        return {
+            'pixels': rgb,  # [h*w, 4] or [num_rays, 4]
+            'rays': rays,  # [h*w, 6] or [num_rays, 6]
+            'fg_mask': fg_mask,
+            'image_wh': self.image_wh,
+        }
+
+
+class ColmapDataset(Dataset):
+    """data loader for training and evaluation."""
+
+    def __init__(
+        self,
+        root_fp,
+        split,
+        img_wh,
+        max_size=1200,
+        num_rays=None,
+        use_mask=True,
+        color_bkgd_aug='random',
+        batch_over_images=True,
+        n_test_traj_steps=120,
+    ):
+        super().__init__()
+        if os.path.exists(os.path.join(root_fp, 'preprocess')):
+            self.root_fp = os.path.join(root_fp, 'preprocess')
+            self.distort = True
+        else:
+            self.root_fp = root_fp
+            self.distort = False
+        self.split = split
+        self.num_rays = num_rays
+        self.use_mask = use_mask
+        self.training = (num_rays is not None) and (split == 'train')
+        self.color_bkgd_aug = color_bkgd_aug
+        self.batch_over_images = batch_over_images
+        self.n_test_traj_steps = n_test_traj_steps
+
+        if self.distort:
+            camdata = read_cameras_binary(
+                os.path.join(self.root_fp, 'sparse/cameras.bin'))
+        else:
+            camdata = read_cameras_binary(
+                os.path.join(self.root_fp, 'sparse/0/cameras.bin'))
+        H, W = int(camdata[1].height), int(camdata[1].width)
+
+        if img_wh is not None:
+            w, h = img_wh
+            self.width, self.height = w, h
+            self.factor = w / W
+        else:
+            if H <= max_size and W <= max_size:
+                self.height = H
+                self.width = W
+                self.factor = 1
+            else:
+                if H > W:
+                    self.height = max_size
+                    self.width = round(max_size * W / H)
+                    self.factor = max_size / H
+                else:
+                    self.width = max_size
+                    self.height = round(max_size * H / W)
+                    self.factor = max_size / W
+        self.image_wh = (self.width, self.height)
+        print('process image width and height: {}'.format(self.image_wh))
+
+        print(camdata[1].model)
+        if camdata[1].model == 'SIMPLE_RADIAL':
+            fx = fy = camdata[1].params[0] * self.factor
+            cx = camdata[1].params[1] * self.factor
+            cy = camdata[1].params[2] * self.factor
+        elif camdata[1].model in ['PINHOLE', 'OPENCV']:
+            fx = camdata[1].params[0] * self.factor
+            fy = camdata[1].params[1] * self.factor
+            cx = camdata[1].params[2] * self.factor
+            cy = camdata[1].params[3] * self.factor
+        else:
+            raise ValueError(
+                f'Please parse the intrinsics for camera model {camdata[1].model}!'
+            )
+
+        self.directions = get_ray_directions(self.width, self.height, fx, fy,
+                                             cx, cy).cuda()
+        if self.distort:
+            imdata = read_images_binary(
+                os.path.join(self.root_fp, 'sparse/images.bin'))
+        else:
+            imdata = read_images_binary(
+                os.path.join(self.root_fp, 'sparse/0/images.bin'))
+
+        mask_dir = os.path.join(self.root_fp, 'masks')
+        self.use_mask = os.path.exists(mask_dir) and self.use_mask
+
+        self.all_c2w, self.all_images, self.all_fg_masks = [], [], []
+        for i, d in enumerate(imdata.values()):
+            R = d.qvec2rotmat()
+            t = d.tvec.reshape(3, 1)
+            c2w = torch.from_numpy(np.concatenate([R.T, -R.T @ t],
+                                                  axis=1)).float()
+            c2w[:, 1:3] *= -1.
+            self.all_c2w.append(c2w)
+            if self.split in ['train', 'val']:
+                img_path = os.path.join(self.root_fp, 'images', d.name)
+                img = Image.open(img_path)
+                img = img.resize(self.image_wh, Image.BICUBIC)
+                img = TF.to_tensor(img).permute(1, 2, 0)[..., :3]
+                if self.use_mask:
+                    mask_path = os.path.join(mask_dir, d.name)
+                    mask = Image.open(mask_path).convert('L')
+                    mask = mask.resize(self.image_wh, Image.BICUBIC)
+                    mask = TF.to_tensor(mask)[0]
+                else:
+                    mask = torch.ones_like(img[..., 0])
+                self.all_fg_masks.append(mask)
+                self.all_images.append(img)
+
+        self.all_c2w = torch.stack(self.all_c2w, dim=0)
+
+        if self.distort:
+            pts3d = read_points3D_binary(
+                os.path.join(self.root_fp, 'sparse/points3D.bin'))
+        else:
+            pts3d = read_points3D_binary(
+                os.path.join(self.root_fp, 'sparse/0/points3D.bin'))
+        pts3d = torch.from_numpy(np.array([pts3d[k].xyz
+                                           for k in pts3d])).float()
+
+        self.all_c2w, pts3d = normalize_poses(self.all_c2w, pts3d)
+
+        if self.split == 'test':
+            # self.all_c2w = get_spiral_path(
+            #     self.all_c2w, fx, fy, n_steps=self.n_test_traj_steps)
+            self.all_c2w = create_spheric_poses(
+                self.all_c2w[:, :, 3], n_steps=self.n_test_traj_steps)
+            self.all_images = torch.zeros(
+                (self.n_test_traj_steps, self.height, self.width, 3),
+                dtype=torch.float32)
+            self.all_fg_masks = torch.zeros(
+                (self.n_test_traj_steps, self.height, self.width),
+                dtype=torch.float32)
+        else:
+            self.all_images, self.all_fg_masks = torch.stack(
+                self.all_images, dim=0), torch.stack(
+                    self.all_fg_masks, dim=0)
+
+        self.all_c2w, self.all_images, self.all_fg_masks = \
+            self.all_c2w.float().cuda(), \
+            self.all_images.float().cuda(), \
+            self.all_fg_masks.float().cuda()
+
+    def __len__(self):
+        return len(self.all_images)
+
+    @torch.no_grad()
+    def __getitem__(self, index):
+        data = self.fetch_data(index)
+        return data
+
+    def update_num_rays(self, num_rays):
+        self.num_rays = num_rays
+
+    def fetch_data(self, index):
+        """Fetch the data (it maybe cached for multiple batches)."""
+        num_rays = self.num_rays
+        if self.training:
+            if self.batch_over_images:
+                index = torch.randint(
+                    0,
+                    len(self.all_images),
+                    size=(num_rays, ),
+                    device=self.all_images.device)
+
+            else:
+                index = torch.randint(
+                    0,
+                    len(self.all_images),
+                    size=(1, ),
+                    device=self.all_images.device)
+
+            x = torch.randint(
+                0,
+                self.width,
+                size=(num_rays, ),
+                device=self.all_images.device)
+            y = torch.randint(
+                0,
+                self.height,
+                size=(num_rays, ),
+                device=self.all_images.device)
+            c2w = self.all_c2w[index]
+            directions = self.directions[y, x]
+            rays_o, rays_d = get_rays(directions, c2w)
+            rgb = self.all_images[index, y, x].view(-1,
+                                                    self.all_images.shape[-1])
+            fg_mask = self.all_fg_masks[index, y, x].view(-1)
+
+        else:
+            c2w = self.all_c2w[index]
+            directions = self.directions
+            rays_o, rays_d = get_rays(directions, c2w)
+            rgb = self.all_images[index].view(-1, self.all_images.shape[-1])
+            fg_mask = self.all_fg_masks[index].view(-1)
+
+        rays = torch.cat([rays_o, F.normalize(rays_d, p=2, dim=-1)], dim=-1)
+
+        if self.training:
+            if self.color_bkgd_aug == 'random':
+                color_bkgd = torch.rand(3, device=self.all_images.device)
+            elif self.color_bkgd_aug == 'white':
+                color_bkgd = torch.ones(3, device=self.all_images.device)
+            elif self.color_bkgd_aug == 'black':
+                color_bkgd = torch.zeros(3, device=self.all_images.device)
+        else:
+            # just use white during inference
+            color_bkgd = torch.ones(3, device=self.all_images.device)
+
+        rgb = rgb * fg_mask[..., None] + color_bkgd * (1 - fg_mask[..., None])
+
+        return {
+            'pixels': rgb,  # [h*w, 4] or [num_rays, 4]
+            'rays': rays,  # [h*w, 6] or [num_rays, 6]
+            'fg_mask': fg_mask,
+            'image_wh': self.image_wh,
+        }
diff --git a/modelscope/models/cv/nerf_recon_acc/dataloader/read_write_model.py b/modelscope/models/cv/nerf_recon_acc/dataloader/read_write_model.py
new file mode 100644
index 00000000..cec00a52
--- /dev/null
+++ b/modelscope/models/cv/nerf_recon_acc/dataloader/read_write_model.py
@@ -0,0 +1,500 @@
+# Copyright (c) 2023, ETH Zurich and UNC Chapel Hill.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#
+#     * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of
+#       its contributors may be used to endorse or promote products derived
+#       from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Johannes L. Schoenberger (jsch-at-demuc-dot-de)
+
+import argparse
+import collections
+import os
+import struct
+
+import numpy as np
+
+CameraModel = collections.namedtuple('CameraModel',
+                                     ['model_id', 'model_name', 'num_params'])
+Camera = collections.namedtuple('Camera',
+                                ['id', 'model', 'width', 'height', 'params'])
+BaseImage = collections.namedtuple(
+    'Image', ['id', 'qvec', 'tvec', 'camera_id', 'name', 'xys', 'point3D_ids'])
+Point3D = collections.namedtuple(
+    'Point3D', ['id', 'xyz', 'rgb', 'error', 'image_ids', 'point2D_idxs'])
+
+
+class Image(BaseImage):
+
+    def qvec2rotmat(self):
+        return qvec2rotmat(self.qvec)
+
+
+CAMERA_MODELS = {
+    CameraModel(model_id=0, model_name='SIMPLE_PINHOLE', num_params=3),
+    CameraModel(model_id=1, model_name='PINHOLE', num_params=4),
+    CameraModel(model_id=2, model_name='SIMPLE_RADIAL', num_params=4),
+    CameraModel(model_id=3, model_name='RADIAL', num_params=5),
+    CameraModel(model_id=4, model_name='OPENCV', num_params=8),
+    CameraModel(model_id=5, model_name='OPENCV_FISHEYE', num_params=8),
+    CameraModel(model_id=6, model_name='FULL_OPENCV', num_params=12),
+    CameraModel(model_id=7, model_name='FOV', num_params=5),
+    CameraModel(model_id=8, model_name='SIMPLE_RADIAL_FISHEYE', num_params=4),
+    CameraModel(model_id=9, model_name='RADIAL_FISHEYE', num_params=5),
+    CameraModel(model_id=10, model_name='THIN_PRISM_FISHEYE', num_params=12)
+}
+CAMERA_MODEL_IDS = dict([(camera_model.model_id, camera_model)
+                         for camera_model in CAMERA_MODELS])
+CAMERA_MODEL_NAMES = dict([(camera_model.model_name, camera_model)
+                           for camera_model in CAMERA_MODELS])
+
+
+def read_next_bytes(fid,
+                    num_bytes,
+                    format_char_sequence,
+                    endian_character='<'):
+    """Read and unpack the next bytes from a binary file.
+    :param fid:
+    :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc.
+    :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}.
+    :param endian_character: Any of {@, =, <, >, !}
+    :return: Tuple of read and unpacked values.
+    """
+    data = fid.read(num_bytes)
+    return struct.unpack(endian_character + format_char_sequence, data)
+
+
+def write_next_bytes(fid, data, format_char_sequence, endian_character='<'):
+    """pack and write to a binary file.
+    :param fid:
+    :param data: data to send, if multiple elements are sent at the same time,
+    they should be encapsuled either in a list or a tuple
+    :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}.
+    should be the same length as the data list or tuple
+    :param endian_character: Any of {@, =, <, >, !}
+    """
+    if isinstance(data, (list, tuple)):
+        bytes = struct.pack(endian_character + format_char_sequence, *data)
+    else:
+        bytes = struct.pack(endian_character + format_char_sequence, data)
+    fid.write(bytes)
+
+
+def read_cameras_text(path):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::WriteCamerasText(const std::string& path)
+        void Reconstruction::ReadCamerasText(const std::string& path)
+    """
+    cameras = {}
+    with open(path, 'r') as fid:
+        while True:
+            line = fid.readline()
+            if not line:
+                break
+            line = line.strip()
+            if len(line) > 0 and line[0] != '#':
+                elems = line.split()
+                camera_id = int(elems[0])
+                model = elems[1]
+                width = int(elems[2])
+                height = int(elems[3])
+                params = np.array(tuple(map(float, elems[4:])))
+                cameras[camera_id] = Camera(
+                    id=camera_id,
+                    model=model,
+                    width=width,
+                    height=height,
+                    params=params)
+    return cameras
+
+
+def read_cameras_binary(path_to_model_file):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::WriteCamerasBinary(const std::string& path)
+        void Reconstruction::ReadCamerasBinary(const std::string& path)
+    """
+    cameras = {}
+    with open(path_to_model_file, 'rb') as fid:
+        num_cameras = read_next_bytes(fid, 8, 'Q')[0]
+        for _ in range(num_cameras):
+            camera_properties = read_next_bytes(
+                fid, num_bytes=24, format_char_sequence='iiQQ')
+            camera_id = camera_properties[0]
+            model_id = camera_properties[1]
+            model_name = CAMERA_MODEL_IDS[camera_properties[1]].model_name
+            width = camera_properties[2]
+            height = camera_properties[3]
+            num_params = CAMERA_MODEL_IDS[model_id].num_params
+            params = read_next_bytes(
+                fid,
+                num_bytes=8 * num_params,
+                format_char_sequence='d' * num_params)
+            cameras[camera_id] = Camera(
+                id=camera_id,
+                model=model_name,
+                width=width,
+                height=height,
+                params=np.array(params))
+        assert len(cameras) == num_cameras
+    return cameras
+
+
+def write_cameras_text(cameras, path):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::WriteCamerasText(const std::string& path)
+        void Reconstruction::ReadCamerasText(const std::string& path)
+    """
+    HEADER = '# Camera list with one line of data per camera:\n' + \
+             '#   CAMERA_ID, MODEL, WIDTH, HEIGHT, PARAMS[]\n' + \
+             '# Number of cameras: {}\n'.format(len(cameras))
+    with open(path, 'w') as fid:
+        fid.write(HEADER)
+        for _, cam in cameras.items():
+            to_write = [cam.id, cam.model, cam.width, cam.height, *cam.params]
+            line = ' '.join([str(elem) for elem in to_write])
+            fid.write(line + '\n')
+
+
+def write_cameras_binary(cameras, path_to_model_file):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::WriteCamerasBinary(const std::string& path)
+        void Reconstruction::ReadCamerasBinary(const std::string& path)
+    """
+    with open(path_to_model_file, 'wb') as fid:
+        write_next_bytes(fid, len(cameras), 'Q')
+        for _, cam in cameras.items():
+            model_id = CAMERA_MODEL_NAMES[cam.model].model_id
+            camera_properties = [cam.id, model_id, cam.width, cam.height]
+            write_next_bytes(fid, camera_properties, 'iiQQ')
+            for p in cam.params:
+                write_next_bytes(fid, float(p), 'd')
+    return cameras
+
+
+def read_images_text(path):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadImagesText(const std::string& path)
+        void Reconstruction::WriteImagesText(const std::string& path)
+    """
+    images = {}
+    with open(path, 'r') as fid:
+        while True:
+            line = fid.readline()
+            if not line:
+                break
+            line = line.strip()
+            if len(line) > 0 and line[0] != '#':
+                elems = line.split()
+                image_id = int(elems[0])
+                qvec = np.array(tuple(map(float, elems[1:5])))
+                tvec = np.array(tuple(map(float, elems[5:8])))
+                camera_id = int(elems[8])
+                image_name = elems[9]
+                elems = fid.readline().split()
+                xys = np.column_stack([
+                    tuple(map(float, elems[0::3])),
+                    tuple(map(float, elems[1::3]))
+                ])
+                point3D_ids = np.array(tuple(map(int, elems[2::3])))
+                images[image_id] = Image(
+                    id=image_id,
+                    qvec=qvec,
+                    tvec=tvec,
+                    camera_id=camera_id,
+                    name=image_name,
+                    xys=xys,
+                    point3D_ids=point3D_ids)
+    return images
+
+
+def read_images_binary(path_to_model_file):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadImagesBinary(const std::string& path)
+        void Reconstruction::WriteImagesBinary(const std::string& path)
+    """
+    images = {}
+    with open(path_to_model_file, 'rb') as fid:
+        num_reg_images = read_next_bytes(fid, 8, 'Q')[0]
+        for _ in range(num_reg_images):
+            binary_image_properties = read_next_bytes(
+                fid, num_bytes=64, format_char_sequence='idddddddi')
+            image_id = binary_image_properties[0]
+            qvec = np.array(binary_image_properties[1:5])
+            tvec = np.array(binary_image_properties[5:8])
+            camera_id = binary_image_properties[8]
+            image_name = ''
+            current_char = read_next_bytes(fid, 1, 'c')[0]
+            while current_char != b'\x00':  # look for the ASCII 0 entry
+                image_name += current_char.decode('utf-8')
+                current_char = read_next_bytes(fid, 1, 'c')[0]
+            num_points2D = read_next_bytes(
+                fid, num_bytes=8, format_char_sequence='Q')[0]
+            x_y_id_s = read_next_bytes(
+                fid,
+                num_bytes=24 * num_points2D,
+                format_char_sequence='ddq' * num_points2D)
+            xys = np.column_stack([
+                tuple(map(float, x_y_id_s[0::3])),
+                tuple(map(float, x_y_id_s[1::3]))
+            ])
+            point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3])))
+            images[image_id] = Image(
+                id=image_id,
+                qvec=qvec,
+                tvec=tvec,
+                camera_id=camera_id,
+                name=image_name,
+                xys=xys,
+                point3D_ids=point3D_ids)
+    return images
+
+
+def write_images_text(images, path):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadImagesText(const std::string& path)
+        void Reconstruction::WriteImagesText(const std::string& path)
+    """
+    if len(images) == 0:
+        mean_observations = 0
+    else:
+        mean_observations = sum(
+            (len(img.point3D_ids) for _, img in images.items())) / len(images)
+    HEADER = '# Image list with two lines of data per image:\n' + \
+             '#   IMAGE_ID, QW, QX, QY, QZ, TX, TY, TZ, CAMERA_ID, NAME\n' + \
+             '#   POINTS2D[] as (X, Y, POINT3D_ID)\n' + \
+             '# Number of images: {}, mean observations per image: {}\n'.format(len(images), mean_observations)
+
+    with open(path, 'w') as fid:
+        fid.write(HEADER)
+        for _, img in images.items():
+            image_header = [
+                img.id, *img.qvec, *img.tvec, img.camera_id, img.name
+            ]
+            first_line = ' '.join(map(str, image_header))
+            fid.write(first_line + '\n')
+
+            points_strings = []
+            for xy, point3D_id in zip(img.xys, img.point3D_ids):
+                points_strings.append(' '.join(map(str, [*xy, point3D_id])))
+            fid.write(' '.join(points_strings) + '\n')
+
+
+def write_images_binary(images, path_to_model_file):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadImagesBinary(const std::string& path)
+        void Reconstruction::WriteImagesBinary(const std::string& path)
+    """
+    with open(path_to_model_file, 'wb') as fid:
+        write_next_bytes(fid, len(images), 'Q')
+        for _, img in images.items():
+            write_next_bytes(fid, img.id, 'i')
+            write_next_bytes(fid, img.qvec.tolist(), 'dddd')
+            write_next_bytes(fid, img.tvec.tolist(), 'ddd')
+            write_next_bytes(fid, img.camera_id, 'i')
+            for char in img.name:
+                write_next_bytes(fid, char.encode('utf-8'), 'c')
+            write_next_bytes(fid, b'\x00', 'c')
+            write_next_bytes(fid, len(img.point3D_ids), 'Q')
+            for xy, p3d_id in zip(img.xys, img.point3D_ids):
+                write_next_bytes(fid, [*xy, p3d_id], 'ddq')
+
+
+def read_points3D_text(path):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadPoints3DText(const std::string& path)
+        void Reconstruction::WritePoints3DText(const std::string& path)
+    """
+    points3D = {}
+    with open(path, 'r') as fid:
+        while True:
+            line = fid.readline()
+            if not line:
+                break
+            line = line.strip()
+            if len(line) > 0 and line[0] != '#':
+                elems = line.split()
+                point3D_id = int(elems[0])
+                xyz = np.array(tuple(map(float, elems[1:4])))
+                rgb = np.array(tuple(map(int, elems[4:7])))
+                error = float(elems[7])
+                image_ids = np.array(tuple(map(int, elems[8::2])))
+                point2D_idxs = np.array(tuple(map(int, elems[9::2])))
+                points3D[point3D_id] = Point3D(
+                    id=point3D_id,
+                    xyz=xyz,
+                    rgb=rgb,
+                    error=error,
+                    image_ids=image_ids,
+                    point2D_idxs=point2D_idxs)
+    return points3D
+
+
+def read_points3D_binary(path_to_model_file):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadPoints3DBinary(const std::string& path)
+        void Reconstruction::WritePoints3DBinary(const std::string& path)
+    """
+    points3D = {}
+    with open(path_to_model_file, 'rb') as fid:
+        num_points = read_next_bytes(fid, 8, 'Q')[0]
+        for _ in range(num_points):
+            binary_point_line_properties = read_next_bytes(
+                fid, num_bytes=43, format_char_sequence='QdddBBBd')
+            point3D_id = binary_point_line_properties[0]
+            xyz = np.array(binary_point_line_properties[1:4])
+            rgb = np.array(binary_point_line_properties[4:7])
+            error = np.array(binary_point_line_properties[7])
+            track_length = read_next_bytes(
+                fid, num_bytes=8, format_char_sequence='Q')[0]
+            track_elems = read_next_bytes(
+                fid,
+                num_bytes=8 * track_length,
+                format_char_sequence='ii' * track_length)
+            image_ids = np.array(tuple(map(int, track_elems[0::2])))
+            point2D_idxs = np.array(tuple(map(int, track_elems[1::2])))
+            points3D[point3D_id] = Point3D(
+                id=point3D_id,
+                xyz=xyz,
+                rgb=rgb,
+                error=error,
+                image_ids=image_ids,
+                point2D_idxs=point2D_idxs)
+    return points3D
+
+
+def write_points3D_text(points3D, path):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadPoints3DText(const std::string& path)
+        void Reconstruction::WritePoints3DText(const std::string& path)
+    """
+    if len(points3D) == 0:
+        mean_track_length = 0
+    else:
+        mean_track_length = sum(
+            (len(pt.image_ids) for _, pt in points3D.items())) / len(points3D)
+    HEADER = '# 3D point list with one line of data per point:\n' + \
+             '#   POINT3D_ID, X, Y, Z, R, G, B, ERROR, TRACK[] as (IMAGE_ID, POINT2D_IDX)\n' + \
+             '# Number of points: {}, mean track length: {}\n'.format(len(points3D), mean_track_length)
+
+    with open(path, 'w') as fid:
+        fid.write(HEADER)
+        for _, pt in points3D.items():
+            point_header = [pt.id, *pt.xyz, *pt.rgb, pt.error]
+            fid.write(' '.join(map(str, point_header)) + ' ')
+            track_strings = []
+            for image_id, point2D in zip(pt.image_ids, pt.point2D_idxs):
+                track_strings.append(' '.join(map(str, [image_id, point2D])))
+            fid.write(' '.join(track_strings) + '\n')
+
+
+def write_points3D_binary(points3D, path_to_model_file):
+    """
+    see: src/base/reconstruction.cc
+        void Reconstruction::ReadPoints3DBinary(const std::string& path)
+        void Reconstruction::WritePoints3DBinary(const std::string& path)
+    """
+    with open(path_to_model_file, 'wb') as fid:
+        write_next_bytes(fid, len(points3D), 'Q')
+        for _, pt in points3D.items():
+            write_next_bytes(fid, pt.id, 'Q')
+            write_next_bytes(fid, pt.xyz.tolist(), 'ddd')
+            write_next_bytes(fid, pt.rgb.tolist(), 'BBB')
+            write_next_bytes(fid, pt.error, 'd')
+            track_length = pt.image_ids.shape[0]
+            write_next_bytes(fid, track_length, 'Q')
+            for image_id, point2D_id in zip(pt.image_ids, pt.point2D_idxs):
+                write_next_bytes(fid, [image_id, point2D_id], 'ii')
+
+
+def detect_model_format(path, ext):
+    if os.path.isfile(os.path.join(path, 'cameras' + ext)) and \
+       os.path.isfile(os.path.join(path, 'images' + ext)) and \
+       os.path.isfile(os.path.join(path, 'points3D' + ext)):
+        print("Detected model format: '" + ext + "'")
+        return True
+
+    return False
+
+
+def read_model(path, ext=''):
+    # try to detect the extension automatically
+    if ext == '':
+        if detect_model_format(path, '.bin'):
+            ext = '.bin'
+        elif detect_model_format(path, '.txt'):
+            ext = '.txt'
+        else:
+            print("Provide model format: '.bin' or '.txt'")
+            return
+
+    if ext == '.txt':
+        cameras = read_cameras_text(os.path.join(path, 'cameras' + ext))
+        images = read_images_text(os.path.join(path, 'images' + ext))
+        points3D = read_points3D_text(os.path.join(path, 'points3D') + ext)
+    else:
+        cameras = read_cameras_binary(os.path.join(path, 'cameras' + ext))
+        images = read_images_binary(os.path.join(path, 'images' + ext))
+        points3D = read_points3D_binary(os.path.join(path, 'points3D') + ext)
+    return cameras, images, points3D
+
+
+def write_model(cameras, images, points3D, path, ext='.bin'):
+    if ext == '.txt':
+        write_cameras_text(cameras, os.path.join(path, 'cameras' + ext))
+        write_images_text(images, os.path.join(path, 'images' + ext))
+        write_points3D_text(points3D, os.path.join(path, 'points3D') + ext)
+    else:
+        write_cameras_binary(cameras, os.path.join(path, 'cameras' + ext))
+        write_images_binary(images, os.path.join(path, 'images' + ext))
+        write_points3D_binary(points3D, os.path.join(path, 'points3D') + ext)
+    return cameras, images, points3D
+
+
+def qvec2rotmat(qvec):
+    array_10 = 1 - 2 * qvec[2]**2 - 2 * qvec[3]**2
+    array_11 = 2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3]
+    array_12 = 2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2]
+    array_1 = [array_10, array_11, array_12]
+    array_20 = 2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3]
+    array_21 = 1 - 2 * qvec[1]**2 - 2 * qvec[3]**2
+    array_22 = 2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1]
+    array_2 = [array_20, array_21, array_22]
+    array_30 = 2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2]
+    array_31 = 2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1]
+    array_32 = 1 - 2 * qvec[1]**2 - 2 * qvec[2]**2
+    array_3 = [array_30, array_31, array_32]
+
+    return np.array([array_1, array_2, array_3])
diff --git a/modelscope/models/cv/nerf_recon_acc/nerf_preprocess.py b/modelscope/models/cv/nerf_recon_acc/nerf_preprocess.py
new file mode 100644
index 00000000..1fd90d49
--- /dev/null
+++ b/modelscope/models/cv/nerf_recon_acc/nerf_preprocess.py
@@ -0,0 +1,204 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import glob
+import os
+import subprocess
+from typing import Any, Dict, Union
+
+import cv2
+import numpy as np
+import tensorflow as tf
+
+from modelscope.metainfo import Preprocessors
+from modelscope.preprocessors import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
+from modelscope.utils.constant import Fields, ModeKeys
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PREPROCESSORS.register_module(
+    Fields.cv, module_name=Preprocessors.nerf_recon_acc_preprocessor)
+class NeRFReconPreprocessor(Preprocessor):
+
+    def __init__(self,
+                 mode=ModeKeys.INFERENCE,
+                 data_type='colmap',
+                 use_mask=True,
+                 match_type='exhaustive_matcher',
+                 frame_count=60,
+                 use_distortion=False,
+                 *args,
+                 **kwargs):
+
+        super().__init__(mode)
+
+        # set preprocessor info
+        self.data_type = data_type
+        self.use_mask = use_mask
+
+        self.match_type = match_type
+        if match_type != 'exhaustive_matcher' and match_type != 'sequential_matcher':
+            raise Exception('matcher type {} is not valid'.format(match_type))
+        self.frame_count = frame_count
+        self.use_distortion = use_distortion
+
+    def __call__(self, data: Union[str, Dict], **kwargs) -> Dict[str, Any]:
+
+        if self.data_type != 'blender' and self.data_type != 'colmap':
+            raise Exception('data type {} is not support currently'.format(
+                self.data_type))
+
+        if 'data_dir' not in data.keys():
+            raise Exception('Do not specify the data dir')
+
+        data_dir = data['data_dir']
+        os.makedirs(data_dir, exist_ok=True)
+        if self.data_type == 'blender':
+            transform_file = os.path.join(data_dir, 'transforms_train.json')
+            if not os.path.exists(transform_file):
+                raise Exception('Blender dataset is not found')
+
+        if self.data_type == 'colmap':
+            if 'video_input_path' not in data.keys():
+                raise Exception('Do not specify the video path')
+            video_path = data['video_input_path']
+            self.split_frames(video_path, data_dir, self.frame_count)
+            self.gen_poses(data_dir, self.match_type, self.use_distortion)
+            files_needed = [
+                '{}.bin'.format(f) for f in ['cameras', 'images', 'points3D']
+            ]
+            if self.use_distortion:
+                colmap_dir = os.path.join(data_dir, 'preprocess/sparse')
+                files_had = os.listdir(colmap_dir)
+            else:
+                colmap_dir = os.path.join(data_dir, 'sparse/0')
+                files_had = os.listdir(colmap_dir)
+            if not all([f in files_had for f in files_needed]):
+                raise Exception('colmap run failed')
+
+        data = {}
+        data['data_dir'] = data_dir
+        return data
+
+    def split_frames(self, video_path, basedir, frame_count=60):
+        cap = cv2.VideoCapture(video_path)
+        fps = round(cap.get(cv2.CAP_PROP_FPS))
+        frame_total = round(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+        if not os.path.exists(os.path.join(basedir, 'images')):
+            logger.info('Need to run ffmpeg')
+            image_dir = os.path.join(basedir, 'images')
+            os.makedirs(image_dir, exist_ok=True)
+            fps = int(frame_count * fps / frame_total)
+            cmd = f"ffmpeg -i {video_path} -qscale:v 1 -qmin 1 -vf \"fps={fps}\" {image_dir}/%04d.png"
+            os.system(cmd)
+            logger.info('split frames done')
+        else:
+            logger.info('Don\'t need to run ffmpeg')
+
+    def run_colmap(self, basedir, match_type, use_distortion):
+        logfile_name = os.path.join(basedir, 'colmap_output.txt')
+        logfile = open(logfile_name, 'w')
+
+        feature_extractor_args = [
+            'colmap', 'feature_extractor', '--database_path',
+            os.path.join(basedir, 'database.db'), '--image_path',
+            os.path.join(basedir, 'images'), '--ImageReader.single_camera', '1'
+        ]
+        feat_output = (
+            subprocess.check_output(
+                feature_extractor_args, universal_newlines=True))
+        logfile.write(feat_output)
+        logger.info('Features extracted done')
+
+        exhaustive_matcher_args = [
+            'colmap',
+            match_type,
+            '--database_path',
+            os.path.join(basedir, 'database.db'),
+        ]
+
+        match_output = (
+            subprocess.check_output(
+                exhaustive_matcher_args, universal_newlines=True))
+        logfile.write(match_output)
+        logger.info('Features matched done')
+
+        p = os.path.join(basedir, 'sparse')
+        if not os.path.exists(p):
+            os.makedirs(p)
+
+        mapper_args = [
+            'colmap',
+            'mapper',
+            '--database_path',
+            os.path.join(basedir, 'database.db'),
+            '--image_path',
+            os.path.join(basedir, 'images'),
+            '--output_path',
+            os.path.join(
+                basedir, 'sparse'
+            ),  # --export_path changed to --output_path in colmap 3.6
+            '--Mapper.num_threads',
+            '16',
+            '--Mapper.init_min_tri_angle',
+            '4',
+            '--Mapper.multiple_models',
+            '0',
+            '--Mapper.extract_colors',
+            '0',
+        ]
+
+        map_output = (
+            subprocess.check_output(mapper_args, universal_newlines=True))
+        logfile.write(map_output)
+        logger.info('Sparse map created done.')
+
+        bundle_adjuster_cmd = [
+            'colmap',
+            'bundle_adjuster',
+            '--input_path',
+            os.path.join(basedir, 'sparse/0'),
+            '--output_path',
+            os.path.join(basedir, 'sparse/0'),
+            '--BundleAdjustment.refine_principal_point',
+            '1',
+        ]
+        map_output = (
+            subprocess.check_output(
+                bundle_adjuster_cmd, universal_newlines=True))
+        logfile.write(map_output)
+        logger.info('Refining intrinsics done.')
+
+        if use_distortion:
+            os.makedirs(os.path.join(basedir, 'preprocess'), exist_ok=True)
+            distort_cmd = [
+                'colmap', 'image_undistorter', '--image_path',
+                os.path.join(basedir, 'images'), '--input_path',
+                os.path.join(basedir, 'sparse/0'), '--output_path',
+                os.path.join(basedir, 'preprocess'), '--output_type', 'COLMAP'
+            ]
+            map_output = (
+                subprocess.check_output(distort_cmd, universal_newlines=True))
+            logfile.write(map_output)
+            logger.info('Image distortion done.')
+
+        logfile.close()
+        logger.info(
+            'Finished running COLMAP, see {} for logs'.format(logfile_name))
+
+    def gen_poses(self, basedir, match_type, use_distortion):
+        files_needed = [
+            '{}.bin'.format(f) for f in ['cameras', 'images', 'points3D']
+        ]
+        if os.path.exists(os.path.join(basedir, 'sparse/0')):
+            files_had = os.listdir(os.path.join(basedir, 'sparse/0'))
+        else:
+            files_had = []
+        if not all([f in files_had for f in files_needed]):
+            logger.info('Need to run COLMAP')
+            self.run_colmap(basedir, match_type, use_distortion)
+        else:
+            logger.info('Don\'t need to run COLMAP')
diff --git a/modelscope/models/cv/nerf_recon_acc/nerf_recon_acc.py b/modelscope/models/cv/nerf_recon_acc/nerf_recon_acc.py
new file mode 100644
index 00000000..d16aa4fe
--- /dev/null
+++ b/modelscope/models/cv/nerf_recon_acc/nerf_recon_acc.py
@@ -0,0 +1,319 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
+import os
+import random
+import re
+import time
+from collections import OrderedDict
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import torch
+import torch.nn.functional as F
+import tqdm
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .dataloader.nerf_dataset import BlenderDataset, ColmapDataset
+from .network.nerf import NeRFModel
+from .network.segmenter import ObjectSegmenter
+from .network.utils import PSNR
+
+logger = get_logger()
+
+__all__ = ['NeRFReconAcc']
+
+
+@MODELS.register_module(
+    Tasks.nerf_recon_acc, module_name=Models.nerf_recon_acc)
+class NeRFReconAcc(TorchModel):
+
+    def __init__(self, model_dir, data_type, use_mask, network_cfg, **kwargs):
+        """initialize the acceleration version of nerf reconstruction model for object.
+
+        Args:
+            model_dir (str): the model path.
+            data_type (str): default is 'colmap'
+            use_mask (bool): whether use mask of objects, default True
+            max_step (int): max train steps, default 30000
+            train_num_rays (int): init number of rays in training, default 256
+            num_samples_per_ray (int): sampling numbers for each ray, default 1024
+            max_train_num_rays (int): max number of rays in training, default 8192
+            test_ray_chunk (int): chunk size for rendering, default 1024
+            dynamic_ray_sampling (bool): whether use dynamic ray sampling when training, default True
+            max_size (int): max size of (width, height) when training, default 800
+            n_test_traj_steps (int): number of testing images, default 120
+            log_every_n_steps (int): print log info every n steps, default 1000
+            save_mesh (bool): whether to save the reconstructed mesh of object, default False
+            save_ckpt (bool): whether to save the checkpoints in data_dir, default False
+            network_cfg (dict): args of network config
+        """
+        super().__init__(model_dir, **kwargs)
+
+        if not torch.cuda.is_available():
+            raise Exception('GPU is required')
+
+        self.data_type = data_type
+        self.use_mask = use_mask
+        self.max_step = kwargs['max_step']
+        self.train_num_rays = kwargs['train_num_rays']
+        self.num_samples_per_ray = kwargs['num_samples_per_ray']
+        self.train_num_samples = self.train_num_rays * self.num_samples_per_ray
+        self.max_train_num_rays = kwargs['max_train_num_rays']
+        self.dynamic_ray_sampling = kwargs['dynamic_ray_sampling']
+
+        self.log_every_n_steps = kwargs['log_every_n_steps']
+        self.save_mesh = kwargs['save_mesh']
+        self.save_ckpt = kwargs['save_ckpt']
+
+        if self.use_mask:
+            segment_path = os.path.join(model_dir, 'matting.pb')
+            self.segmenter = ObjectSegmenter(segment_path)
+
+        if self.data_type == 'blender':
+            self.img_wh = (800, 800)
+            network_cfg['radius'] = 1.5
+            self.background = 'white'
+            network_cfg['background'] = 'white'
+        elif self.data_type == 'colmap':
+            self.img_wh = None
+            self.max_size = kwargs['max_size']
+            self.n_test_traj_steps = kwargs['n_test_traj_steps']
+            network_cfg['radius'] = 0.5
+            if self.use_mask:
+                self.background = 'white'
+                network_cfg['background'] = 'white'
+                logger.info('run nerf with mask data')
+            else:
+                self.background = 'random'
+                network_cfg['background'] = 'random'
+                logger.info('run nerf without mask data')
+        logger.info(network_cfg)
+
+        self.model = NeRFModel(network_cfg, **kwargs).cuda()
+        self.optimizer = torch.optim.Adam(
+            self.model.parameters(), lr=0.01, eps=1e-15)
+        self.grad_scaler = torch.cuda.amp.GradScaler(2**10)
+
+        self.scheduler = torch.optim.lr_scheduler.MultiStepLR(
+            self.optimizer,
+            milestones=[
+                self.max_step // 2, self.max_step * 3 // 4,
+                self.max_step * 9 // 10
+            ],
+            gamma=0.33,
+        )
+        self.criterions = PSNR()
+        self.set_random_seed(42)
+
+    def set_random_seed(self, seed):
+        random.seed(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+
+    @torch.enable_grad()
+    def nerf_reconstruction(self, data_dir):
+        if os.path.exists(os.path.join(data_dir, 'preprocess')):
+            use_distortion = True
+        else:
+            use_distortion = False
+        if self.use_mask:
+            if use_distortion:
+                image_dir = os.path.join(data_dir, 'preprocess/images')
+                save_mask_dir = os.path.join(data_dir, 'preprocess/masks')
+            else:
+                image_dir = os.path.join(data_dir, 'images')
+                save_mask_dir = os.path.join(data_dir, 'masks')
+            os.makedirs(save_mask_dir, exist_ok=True)
+            img_list = glob.glob('{}/*.*g'.format(image_dir)) + glob.glob(
+                '{}/*.*G'.format(image_dir))
+            for img_path in img_list:
+                img = cv2.imread(img_path)
+                mask = self.segmenter.run_mask(img)
+                outpath = os.path.join(save_mask_dir,
+                                       os.path.basename(img_path))
+                cv2.imwrite(outpath, mask)
+            logger.info('segment images done!')
+
+        if self.data_type == 'blender':
+            self.train_dataset = BlenderDataset(
+                root_fp=data_dir,
+                split='train',
+                img_wh=self.img_wh,
+                num_rays=self.train_num_rays,
+                color_bkgd_aug=self.background,
+            )
+
+            self.test_dataset = BlenderDataset(
+                root_fp=data_dir,
+                split='test',
+                img_wh=self.img_wh,
+                num_rays=self.train_num_rays,
+            )
+
+        elif self.data_type == 'colmap':
+            self.train_dataset = ColmapDataset(
+                root_fp=data_dir,
+                split='train',
+                img_wh=self.img_wh,
+                max_size=self.max_size,
+                num_rays=self.train_num_rays,
+                color_bkgd_aug=self.background,
+            )
+
+            self.test_dataset = ColmapDataset(
+                root_fp=data_dir,
+                split='test',
+                img_wh=self.img_wh,
+                max_size=self.max_size,
+                num_rays=self.train_num_rays,
+                n_test_traj_steps=self.n_test_traj_steps,
+            )
+
+        step = 0
+        tic = time.time()
+        while step < self.max_step:
+            for i in range(len(self.train_dataset)):
+                self.model.train()
+                data = self.train_dataset[i]
+                self.model.update_step(step)
+                rays = data['rays'].cuda()
+                pixels = data['pixels'].cuda()
+
+                out = self.model(rays)
+
+                if out['num_samples'] == 0:
+                    continue
+
+                loss = 0.
+
+                if self.dynamic_ray_sampling:
+                    temp = self.train_num_samples / sum(out['num_samples'])
+                    train_num_rays = int(self.train_num_rays * temp)
+                    self.train_num_rays = min(
+                        int(self.train_num_rays * 0.9 + train_num_rays * 0.1),
+                        self.max_train_num_rays)
+
+                self.train_dataset.update_num_rays(self.train_num_rays)
+                loss_rgb = F.smooth_l1_loss(out['comp_rgb'][out['rays_valid']],
+                                            pixels[out['rays_valid']])
+                loss += loss_rgb
+                psnr = self.criterions(out['comp_rgb'], pixels)
+                self.optimizer.zero_grad()
+                self.grad_scaler.scale(loss).backward()
+                self.optimizer.step()
+                self.scheduler.step()
+
+                if step % self.log_every_n_steps == 0:
+
+                    elapsed_time = time.time() - tic
+                    logger.info(
+                        f'elapsed_time={elapsed_time:.2f}s | step={step} | '
+                        f'loss={loss:.4f} | '
+                        f'train/num_rays={self.train_num_rays:d} |'
+                        f'PSNR={psnr:.4f} ')
+
+                step += 1
+
+        save_video_path = os.path.join(data_dir, 'render.mp4')
+        self.render_video(data_dir, save_video_path)
+        if self.save_ckpt:
+            save_ckpt_dir = os.path.join(data_dir, 'ckpt')
+            os.makedirs(save_ckpt_dir, exist_ok=True)
+            torch.save(
+                {
+                    'global_step': self.max_step,
+                    'network_state_dict': self.model.state_dict(),
+                    'optimizer_state_dict': self.optimizer.state_dict(),
+                }, os.path.join(save_ckpt_dir, '{}.ckpt'.format(step)))
+            logger.info('save checkpoints done')
+
+        logger.info('reconstruction finish')
+        return save_video_path
+
+    def render_video(self, data_dir, save_video_path):
+        self.model.eval()
+        with torch.no_grad():
+            psnr = 0
+            for i in tqdm.tqdm(range(len(self.test_dataset))):
+
+                data = self.test_dataset[i]
+                rays = data['rays'].cuda()
+                pixels = data['pixels'].cuda()
+                image_wh = data['image_wh']
+                out = self.model.inference(rays)
+
+                psnr += self.criterions(out['comp_rgb'], pixels)
+
+                W, H = image_wh
+                img = out['comp_rgb'].view(H, W, 3)
+                save_img_dir = os.path.join(data_dir, 'render')
+                os.makedirs(save_img_dir, exist_ok=True)
+                save_img_path = os.path.join(save_img_dir, f'{i:d}.png')
+                self.save_image(save_img_path, img)
+
+            self.save_video(save_video_path, save_img_dir)
+            logger.info('test psnr: {}'.format(psnr / len(self.test_dataset)))
+            logger.info('save render video done.')
+
+            if self.save_mesh:
+                mesh = self.model.isosurface()
+                save_mesh_path = os.path.join(data_dir, 'out.obj')
+                self.save_obj(save_mesh_path, mesh['v_pos'], mesh['t_pos_idx'])
+
+    def save_image(self, filename, img):
+        img = img.clip(0, 1).cpu().numpy()
+        img = (img * 255.).astype(np.uint8)
+        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+        save_dir = os.path.dirname(filename)
+        os.makedirs(save_dir, exist_ok=True)
+        cv2.imwrite(filename, img)
+
+    def save_video(self, filename, img_dir, fps=20):
+        img_paths = glob.glob('{}/*.png'.format(img_dir))
+        img_paths = sorted(
+            img_paths, key=lambda f: int(os.path.basename(f)[:-4]))
+        imgs = [cv2.imread(f) for f in img_paths]
+
+        H, W, _ = imgs[0].shape
+        writer = cv2.VideoWriter(filename, cv2.VideoWriter_fourcc(*'mp4v'),
+                                 fps, (W, H), True)
+        for img in imgs:
+            writer.write(img)
+        writer.release()
+
+    def write_obj(self, filename, v_pos, t_pos_idx, v_tex, t_tex_idx):
+        with open(filename, 'w') as f:
+            for v in v_pos:
+                f.write('v {} {} {} \n'.format(v[0], v[1], v[2]))
+
+            if v_tex is not None:
+                assert (len(t_pos_idx) == len(t_tex_idx))
+                for v in v_tex:
+                    f.write('vt {} {} \n'.format(v[0], 1.0 - v[1]))
+
+            for i in range(len(t_pos_idx)):
+                f.write('f ')
+                for j in range(3):
+                    f.write(
+                        ' %s/%s' %
+                        (str(t_pos_idx[i][j] + 1),
+                         '' if v_tex is None else str(t_tex_idx[i][j] + 1)))
+                f.write('\n')
+
+    def save_obj(self, filename, v_pos, t_pos_idx, v_tex=None, t_tex_idx=None):
+
+        v_pos = v_pos.cpu().numpy()
+        t_pos_idx = t_pos_idx.cpu().numpy()
+        save_dir = os.path.dirname(filename)
+        os.makedirs(save_dir, exist_ok=True)
+        if v_tex is not None and t_tex_idx is not None:
+            v_tex = v_tex.cpu().numpy()
+            t_tex_idx = t_tex_idx.cpu().numpy()
+            self.write_obj(filename, v_pos, t_pos_idx, v_tex, t_tex_idx)
+        else:
+            self.write_obj(filename, v_pos, t_pos_idx, v_tex, t_tex_idx)
diff --git a/modelscope/models/cv/nerf_recon_acc/network/__init__.py b/modelscope/models/cv/nerf_recon_acc/network/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/nerf_recon_acc/network/nerf.py b/modelscope/models/cv/nerf_recon_acc/network/nerf.py
new file mode 100644
index 00000000..972d2d85
--- /dev/null
+++ b/modelscope/models/cv/nerf_recon_acc/network/nerf.py
@@ -0,0 +1,344 @@
+# The implementation is modified from nerfacc, made publicly available under the MIT License
+# at https://github.com/KAIR-BAIR/nerfacc/blob/master/examples/radiance_fields/ngp.py
+import numpy as np
+import tinycudann as tcnn
+import torch
+import torch.nn as nn
+from nerfacc import ContractionType, OccupancyGrid, ray_marching, rendering
+from torch.autograd import Function
+from torch.cuda.amp import custom_bwd, custom_fwd
+
+from .utils import chunk_batch, cleanup, get_activation, normalize
+
+
+class VanillaMLP(nn.Module):
+
+    def __init__(self, dim_in, dim_out, n_neurons, n_hidden_layers,
+                 activation):
+        super().__init__()
+        self.layers = [
+            self.make_linear(dim_in, n_neurons),
+            self.make_activation()
+        ]
+        for i in range(n_hidden_layers - 1):
+            self.layers += [
+                self.make_linear(n_neurons, n_neurons),
+                self.make_activation()
+            ]
+        self.layers += [self.make_linear(n_neurons, dim_out)]
+        self.layers = nn.Sequential(*self.layers)
+        self.output_activation = get_activation(activation)
+
+    def forward(self, x):
+        x = self.layers(x.float())
+        x = self.output_activation(x)
+        return x
+
+    def make_linear(self, dim_in, dim_out):
+        layer = nn.Linear(dim_in, dim_out, bias=True)
+        torch.nn.init.constant_(layer.bias, 0.0)
+        torch.nn.init.kaiming_uniform_(layer.weight, nonlinearity='relu')
+        return layer
+
+    def make_activation(self):
+        return nn.ReLU(inplace=True)
+
+
+class MarchingCubeHelper(nn.Module):
+
+    def __init__(self, resolution, use_torch=True):
+        super().__init__()
+        self.resolution = resolution
+        self.use_torch = use_torch
+        self.points_range = (0, 1)
+        if self.use_torch:
+            import torchmcubes
+            self.mc_func = torchmcubes.marching_cubes
+        else:
+            import mcubes
+            self.mc_func = mcubes.marching_cubes
+        self.verts = None
+
+    def grid_vertices(self):
+        if self.verts is None:
+            x, y, z = torch.linspace(*self.points_range,
+                                     self.resolution), torch.linspace(
+                                         *self.points_range,
+                                         self.resolution), torch.linspace(
+                                             *self.points_range,
+                                             self.resolution)
+            x, y, z = torch.meshgrid(x, y, z)
+            verts = torch.cat(
+                [x.reshape(-1, 1),
+                 y.reshape(-1, 1),
+                 z.reshape(-1, 1)], dim=-1).reshape(-1, 3)
+            self.verts = verts.cuda()
+        return self.verts
+
+    def forward(self, level, threshold=0.):
+        level = level.float().view(self.resolution, self.resolution,
+                                   self.resolution)
+        if self.use_torch:
+            verts, faces = self.mc_func(level.cuda(), threshold)
+            verts, faces = verts.cpu(), faces.cpu().long()
+        else:
+            verts, faces = self.mc_func(-level.numpy(),
+                                        threshold)  # transform to numpy
+            verts, faces = torch.from_numpy(
+                verts.astype(np.float32)), torch.from_numpy(
+                    faces.astype(np.int64))  # transform back to pytorch
+        verts = verts / (self.resolution - 1.)
+        return {'v_pos': verts, 't_pos_idx': faces}
+
+
+class _TruncExp(Function):  # pylint: disable=abstract-method
+    # Implementation from torch-ngp:
+    # https://github.com/ashawkey/torch-ngp/blob/93b08a0d4ec1cc6e69d85df7f0acdfb99603b628/activation.py
+    @staticmethod
+    @custom_fwd(cast_inputs=torch.float32)
+    def forward(ctx, x):  # pylint: disable=arguments-differ
+        ctx.save_for_backward(x)
+        return torch.exp(x)
+
+    @staticmethod
+    @custom_bwd
+    def backward(ctx, g):  # pylint: disable=arguments-differ
+        x = ctx.saved_tensors[0]
+        return g * torch.exp(x.clamp(-15, 15))
+
+
+trunc_exp = _TruncExp.apply
+
+
+class VolumeDensity(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.radius = self.config.radius
+        self.n_input_dims = 3
+        self.n_output_dims = self.config.geometry_feature_dim + 1
+        point_encoding = tcnn.Encoding(
+            n_input_dims=self.n_input_dims,
+            encoding_config={
+                'otype': 'HashGrid',
+                'n_levels': self.config.n_levels,
+                'n_features_per_level': self.config.n_features_per_level,
+                'log2_hashmap_size': self.config.log2_hashmap_size,
+                'base_resolution': self.config.base_resolution,
+                'per_level_scale': self.config.per_level_scale,
+            },
+        )
+        point_network = VanillaMLP(point_encoding.n_output_dims,
+                                   self.n_output_dims, 64, 1, 'none')
+        self.encoding_with_network = torch.nn.Sequential(
+            point_encoding, point_network)
+
+        self.density_activation = trunc_exp
+        self.helper = MarchingCubeHelper(
+            self.config.isosurface_resolution, use_torch=False)
+
+    def forward(self, points):
+        points = normalize(points, (-self.radius, self.radius), (0, 1))
+        out = self.encoding_with_network(points.view(
+            -1, self.n_input_dims)).view(*points.shape[:-1],
+                                         self.n_output_dims).float()
+        density, feature = out[..., 0], out[..., 1:]
+        density = self.density_activation(density - 1)
+        return density, feature
+
+    def forward_level(self, points):
+        points = normalize(points, (-self.radius, self.radius), (0, 1))
+        out = self.encoding_with_network(points.view(
+            -1, self.n_input_dims)).view(*points.shape[:-1],
+                                         self.n_output_dims).float()
+        density, _ = out[..., 0], out[..., 1:]
+        density = self.density_activation(density - 1)
+        return -density
+
+    def isosurface_(self, vmin, vmax):
+        grid_verts = self.helper.grid_vertices()
+        grid_verts_1 = normalize(grid_verts[..., 0], (0, 1),
+                                 (vmin[0], vmax[0]))
+        grid_verts_2 = normalize(grid_verts[..., 1], (0, 1),
+                                 (vmin[1], vmax[1]))
+        grid_verts_3 = normalize(grid_verts[..., 2], (0, 1),
+                                 (vmin[2], vmax[2]))
+        grid_verts = torch.stack([grid_verts_1, grid_verts_2, grid_verts_3],
+                                 dim=-1)
+
+        def batch_func(x):
+            rv = self.forward_level(x).cpu()
+            cleanup()
+            return rv
+
+        level = chunk_batch(batch_func, self.config.isosurface_chunk,
+                            grid_verts)
+        mesh = self.helper(level, threshold=self.config.isosurface_threshold)
+        mesh_1 = normalize(mesh['v_pos'][..., 0], (0, 1), (vmin[0], vmax[0]))
+        mesh_2 = normalize(mesh['v_pos'][..., 1], (0, 1), (vmin[1], vmax[1]))
+        mesh_3 = normalize(mesh['v_pos'][..., 2], (0, 1), (vmin[2], vmax[2]))
+        mesh['v_pos'] = torch.stack([mesh_1, mesh_2, mesh_3], dim=-1)
+        return mesh
+
+    @torch.no_grad()
+    def isosurface(self):
+        mesh_coarse = self.isosurface_(
+            (-self.radius, -self.radius, -self.radius),
+            (self.radius, self.radius, self.radius))
+        vmin, vmax = mesh_coarse['v_pos'].amin(
+            dim=0), mesh_coarse['v_pos'].amax(dim=0)
+        vmin_ = (vmin - (vmax - vmin) * 0.1).clamp(-self.radius, self.radius)
+        vmax_ = (vmax + (vmax - vmin) * 0.1).clamp(-self.radius, self.radius)
+        mesh_fine = self.isosurface_(vmin_, vmax_)
+        return mesh_fine
+
+
+class VolumeRadiance(nn.Module):
+
+    def __init__(self, config):
+        super(VolumeRadiance, self).__init__()
+        self.config = config
+        self.n_dir_dims = 3
+        self.n_output_dims = 3
+        self.direction_encoding = tcnn.Encoding(
+            n_input_dims=self.n_dir_dims,
+            encoding_config={
+                'otype':
+                'Composite',
+                'nested': [
+                    {
+                        'n_dims_to_encode': self.n_dir_dims,
+                        'otype': 'SphericalHarmonics',
+                        'degree': self.config.degree,
+                    },
+                ],
+            },
+        )
+        self.n_input_dims = self.config.geometry_feature_dim + self.direction_encoding.n_output_dims
+        self.network = VanillaMLP(self.n_input_dims, self.n_output_dims, 64, 2,
+                                  'sigmoid')
+
+    def forward(self, features, dirs):
+        dirs = (dirs + 1.) / 2.  # (-1, 1) => (0, 1)
+        dirs_embd = self.direction_encoding(dirs.view(-1, self.n_dir_dims))
+        network_inp = torch.cat(
+            [dirs_embd,
+             features.view(-1, self.config.geometry_feature_dim)],
+            dim=-1)
+        color = self.network(network_inp).view(*features.shape[:-1],
+                                               self.n_output_dims).float()
+        return color
+
+
+class NeRFModel(nn.Module):
+
+    def __init__(self, network_cfg, **kwargs):
+        super().__init__()
+        self.config = network_cfg
+        self.num_samples_per_ray = kwargs['num_samples_per_ray']
+        self.test_ray_chunk = kwargs['test_ray_chunk']
+        self.background = self.config.background
+        self.geometry = VolumeDensity(self.config)
+        self.texture = VolumeRadiance(self.config)
+        radius_list = [
+            -self.config.radius, -self.config.radius, -self.config.radius,
+            self.config.radius, self.config.radius, self.config.radius
+        ]
+        radius_tensor = torch.as_tensor(radius_list, dtype=torch.float32)
+        self.register_buffer('scene_aabb', radius_tensor)
+        self.occupancy_grid = OccupancyGrid(
+            roi_aabb=self.scene_aabb,
+            resolution=128,
+            contraction_type=ContractionType.AABB)
+        self.render_step_size = 1.732 * 2 * self.config.radius / self.num_samples_per_ray
+
+    def update_step(self, global_step):
+        # progressive viewdir PE frequencies
+
+        def occ_eval_fn(x):
+            density, _ = self.geometry(x)
+            # approximate for 1 - torch.exp(-density[...,None] * self.render_step_size) based on taylor series
+            return density[..., None] * self.render_step_size
+
+        self.occupancy_grid.every_n_step(
+            step=global_step, occ_eval_fn=occ_eval_fn)
+
+    def isosurface(self):
+        mesh = self.geometry.isosurface()
+        return mesh
+
+    def forward(self, rays):
+        rays_o, rays_d = rays[:, 0:3], rays[:, 3:6]  # both (N_rays, 3)
+        if self.training:
+            if self.background == 'random':
+                background_color = torch.rand(
+                    3, dtype=torch.float32, device=rays_o.device)
+            elif self.background == 'white':
+                background_color = torch.ones(
+                    3, dtype=torch.float32, device=rays_o.device)
+            elif self.background == 'black':
+                background_color = torch.zeros(
+                    3, dtype=torch.float32, device=rays_o.device)
+        else:
+            background_color = torch.ones(
+                3, dtype=torch.float32, device=rays_o.device)
+
+        def sigma_fn(t_starts, t_ends, ray_indices):
+            ray_indices = ray_indices.long()
+            t_origins = rays_o[ray_indices]
+            t_dirs = rays_d[ray_indices]
+            positions = t_origins + t_dirs * (t_starts + t_ends) / 2.
+            density, _ = self.geometry(positions)
+
+            return density[..., None]
+
+        def rgb_sigma_fn(t_starts, t_ends, ray_indices):
+            ray_indices = ray_indices.long()
+            t_origins = rays_o[ray_indices]
+            t_dirs = rays_d[ray_indices]
+            positions = t_origins + t_dirs * (t_starts + t_ends) / 2.
+            density, feature = self.geometry(positions)
+            rgb = self.texture(feature, t_dirs)
+            return rgb, density[..., None]
+
+        with torch.no_grad():
+            packed_info, t_starts, t_ends = ray_marching(
+                rays_o,
+                rays_d,
+                scene_aabb=self.scene_aabb,
+                grid=self.occupancy_grid,
+                sigma_fn=sigma_fn,
+                near_plane=None,
+                far_plane=None,
+                render_step_size=self.render_step_size,
+                stratified=self.training,
+                cone_angle=0.0,
+                alpha_thre=0.0)
+        rgb, opacity, depth = rendering(
+            packed_info,
+            t_starts,
+            t_ends,
+            rgb_sigma_fn=rgb_sigma_fn,
+            render_bkgd=background_color)
+
+        opacity, depth = opacity.squeeze(-1), depth.squeeze(-1)
+
+        return {
+            'comp_rgb':
+            rgb,
+            'opacity':
+            opacity,
+            'depth':
+            depth,
+            'rays_valid':
+            opacity > 0,
+            'num_samples':
+            torch.as_tensor([len(t_starts)],
+                            dtype=torch.int32,
+                            device=rays.device)
+        }
+
+    def inference(self, rays):
+        out = chunk_batch(self.forward, self.test_ray_chunk, rays)
+        return {**out}
diff --git a/modelscope/models/cv/nerf_recon_acc/network/segmenter.py b/modelscope/models/cv/nerf_recon_acc/network/segmenter.py
new file mode 100644
index 00000000..d71b9f16
--- /dev/null
+++ b/modelscope/models/cv/nerf_recon_acc/network/segmenter.py
@@ -0,0 +1,43 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import numpy as np
+import tensorflow as tf
+
+
+class ObjectSegmenter(object):
+    """use ObjectSegmenter to segment object from input video.
+
+    Args:
+        model_path (str): the segment model path.
+    """
+
+    def __init__(self, model_path):
+        super(ObjectSegmenter, self).__init__()
+        f = tf.gfile.FastGFile(model_path, 'rb')
+        graph_def = tf.GraphDef()
+        graph_def.ParseFromString(f.read())
+        persisted_graph = tf.import_graph_def(graph_def, name='')
+
+        config = tf.ConfigProto()
+        config.gpu_options.per_process_gpu_memory_fraction = 0.3
+        self.sess = tf.InteractiveSession(graph=persisted_graph, config=config)
+
+        self.image_node = self.sess.graph.get_tensor_by_name('input_image:0')
+        self.output_node = self.sess.graph.get_tensor_by_name('output_png:0')
+        self.logits_node = self.sess.graph.get_tensor_by_name('if_person:0')
+
+    def image_preprocess(self, img):
+        if len(img.shape) == 2:
+            img = np.dstack((img, img, img))
+        elif img.shape[2] == 4:
+            img = img[:, :, :3]
+        img = img[:, :, ::-1]
+        img = img.astype(np.float)
+        return img
+
+    def run_mask(self, img):
+        image_feed = self.image_preprocess(img)
+        output_img_value, logits_value = self.sess.run(
+            [self.output_node, self.logits_node],
+            feed_dict={self.image_node: image_feed})
+        mask = output_img_value[:, :, 3:]
+        return mask
diff --git a/modelscope/models/cv/nerf_recon_acc/network/utils.py b/modelscope/models/cv/nerf_recon_acc/network/utils.py
new file mode 100644
index 00000000..aa8c3d66
--- /dev/null
+++ b/modelscope/models/cv/nerf_recon_acc/network/utils.py
@@ -0,0 +1,176 @@
+# The implementation is partly adopted from nerfacc, made publicly available under the MIT License
+# at https://github.com/KAIR-BAIR/nerfacc/blob/master/examples/radiance_fields/ngp.py
+import gc
+from collections import defaultdict
+
+import mcubes
+import numpy as np
+import tinycudann as tcnn
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Function
+from torch.cuda.amp import custom_bwd, custom_fwd
+
+
+class PSNR(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, inputs, targets, valid_mask=None, reduction='mean'):
+        assert reduction in ['mean', 'none']
+        value = (inputs - targets)**2
+        if valid_mask is not None:
+            value = value[valid_mask]
+        if reduction == 'mean':
+            return -10 * torch.log10(torch.mean(value))
+        elif reduction == 'none':
+            return -10 * torch.log10(
+                torch.mean(value, dim=tuple(range(value.ndim)[1:])))
+
+
+def extract_fields(bound_min, bound_max, resolution, query_func):
+    N = 64
+    X = torch.linspace(bound_min[0], bound_max[0], resolution).split(N)
+    Y = torch.linspace(bound_min[1], bound_max[1], resolution).split(N)
+    Z = torch.linspace(bound_min[2], bound_max[2], resolution).split(N)
+
+    u = np.zeros([resolution, resolution, resolution], dtype=np.float32)
+    with torch.no_grad():
+        for xi, xs in enumerate(X):
+            for yi, ys in enumerate(Y):
+                for zi, zs in enumerate(Z):
+                    xx, yy, zz = torch.meshgrid(xs, ys, zs)
+                    xx = xx.reshape(-1, 1)
+                    yy = yy.reshape(-1, 1)
+                    zz = zz.reshape(-1, 1)
+                    pts = torch.cat([xx, yy, zz], dim=-1).cuda()
+                    val = query_func(pts).reshape(
+                        len(xs), len(ys), len(zs)).detach().cpu().numpy()
+                    u[xi * N:xi * N + len(xs), yi * N:yi * N + len(ys),
+                      zi * N:zi * N + len(zs)] = val
+    return u
+
+
+def extract_geometry(bound_min, bound_max, resolution, threshold, query_func):
+    u = extract_fields(bound_min, bound_max, resolution, query_func)
+    vertices, triangles = mcubes.marching_cubes(u, threshold)
+    b_max_np = bound_max.detach().cpu().numpy()
+    b_min_np = bound_min.detach().cpu().numpy()
+
+    vertices = vertices / (resolution - 1.0) * (
+        b_max_np - b_min_np)[None, :] + b_min_np[None, :]
+    return vertices, triangles
+
+
+def chunk_batch(func, chunk_size, *args, **kwargs):
+    B = None
+    for arg in args:
+        if isinstance(arg, torch.Tensor):
+            B = arg.shape[0]
+            break
+    out = defaultdict(list)
+    out_type = None
+    for i in range(0, B, chunk_size):
+        out_chunk = func(
+            *[
+                arg[i:i + chunk_size] if isinstance(arg, torch.Tensor) else arg
+                for arg in args
+            ], **kwargs)
+        if out_chunk is None:
+            continue
+        out_type = type(out_chunk)
+        if isinstance(out_chunk, torch.Tensor):
+            out_chunk = {0: out_chunk}
+        elif isinstance(out_chunk, tuple) or isinstance(out_chunk, list):
+            chunk_length = len(out_chunk)
+            out_chunk = {i: chunk for i, chunk in enumerate(out_chunk)}
+        elif isinstance(out_chunk, dict):
+            pass
+        else:
+            exit(1)
+        for k, v in out_chunk.items():
+            out[k].append(v if torch.is_grad_enabled() else v.detach())
+
+    if out_type is None:
+        return
+
+    out = {k: torch.cat(v, dim=0) for k, v in out.items()}
+    if out_type is torch.Tensor:
+        return out[0]
+    elif out_type in [tuple, list]:
+        return out_type([out[i] for i in range(chunk_length)])
+    elif out_type is dict:
+        return out
+
+
+def get_activation(name):
+    name = name.lower()
+    if name is None or name == 'none':
+        return nn.Identity()
+    elif name.startswith('scale'):
+        scale_factor = float(name[5:])
+        return lambda x: x.clamp(0., scale_factor) / scale_factor
+    elif name.startswith('clamp'):
+        clamp_max = float(name[5:])
+        return lambda x: x.clamp(0., clamp_max)
+    elif name.startswith('mul'):
+        mul_factor = float(name[3:])
+        return lambda x: x * mul_factor
+    elif name == 'trunc_exp':
+        return trunc_exp
+    elif name.startswith('+') or name.startswith('-'):
+        return lambda x: x + float(name)
+    elif name.lower() == 'sigmoid':
+        return lambda x: torch.sigmoid(x)
+    elif name.lower() == 'tanh':
+        return lambda x: torch.tanh(x)
+    else:
+        return getattr(F, name)
+
+
+class _TruncExp(Function):
+    # Implementation from torch-ngp:
+    # https://github.com/ashawkey/torch-ngp/blob/93b08a0d4ec1cc6e69d85df7f0acdfb99603b628/activation.py
+    @staticmethod
+    @custom_fwd(cast_inputs=torch.float32)
+    def forward(ctx, x):  # pylint: disable=arguments-differ
+        ctx.save_for_backward(x)
+        return torch.exp(x)
+
+    @staticmethod
+    @custom_bwd
+    def backward(ctx, g):
+        x = ctx.saved_tensors[0]
+        return g * torch.exp(torch.clamp(x, max=15))
+
+
+trunc_exp = _TruncExp.apply
+
+
+def dot(x, y):
+    return torch.sum(x * y, -1, keepdim=True)
+
+
+def reflect(x, n):
+    return 2 * dot(x, n) * n - x
+
+
+def normalize(dat, inp_scale, tgt_scale):
+    if inp_scale is None:
+        inp_scale = [dat.min(), dat.max()]
+    dat = (dat - inp_scale[0]) / (inp_scale[1] - inp_scale[0])
+    dat = dat * (tgt_scale[1] - tgt_scale[0]) + tgt_scale[0]
+    return dat
+
+
+def cleanup():
+    gc.collect()
+    torch.cuda.empty_cache()
+    tcnn.free_temporary_memory()
+
+
+def update_module_step(m, epoch, global_step):
+    if hasattr(m, 'update_step'):
+        m.update_step(epoch, global_step)
diff --git a/modelscope/models/cv/object_detection/__init__.py b/modelscope/models/cv/object_detection/__init__.py
index 0c782d7b..8728d2f1 100644
--- a/modelscope/models/cv/object_detection/__init__.py
+++ b/modelscope/models/cv/object_detection/__init__.py
@@ -6,11 +6,13 @@ from modelscope.utils.import_utils import LazyImportModule
 if TYPE_CHECKING:
     from .mmdet_model import DetectionModel
     from .yolox_pai import YOLOX
+    from .dino import DINO
 
 else:
     _import_structure = {
         'mmdet_model': ['DetectionModel'],
         'yolox_pai': ['YOLOX'],
+        'dino': ['DINO']
     }
 
     import sys
diff --git a/modelscope/models/cv/object_detection/dino.py b/modelscope/models/cv/object_detection/dino.py
new file mode 100644
index 00000000..e6c652f1
--- /dev/null
+++ b/modelscope/models/cv/object_detection/dino.py
@@ -0,0 +1,16 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.models.detection.detectors import Detection as _Detection
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.easycv_base import EasyCVBaseModel
+from modelscope.utils.constant import Tasks
+
+
+@MODELS.register_module(
+    group_key=Tasks.image_object_detection, module_name=Models.dino)
+class DINO(EasyCVBaseModel, _Detection):
+
+    def __init__(self, model_dir=None, *args, **kwargs):
+        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
+        _Detection.__init__(self, *args, **kwargs)
diff --git a/modelscope/models/cv/object_detection_3d/__init__.py b/modelscope/models/cv/object_detection_3d/__init__.py
new file mode 100644
index 00000000..2477a104
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .depe import ScrfdDetect
+else:
+    _import_structure = {
+        'depe': ['DepeDetect'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/object_detection_3d/depe/__init__.py b/modelscope/models/cv/object_detection_3d/depe/__init__.py
new file mode 100644
index 00000000..1576aa9f
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .depe_detect import DepeDetect
diff --git a/modelscope/models/cv/object_detection_3d/depe/depe_detect.py b/modelscope/models/cv/object_detection_3d/depe/depe_detect.py
new file mode 100644
index 00000000..a6112a16
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/depe_detect.py
@@ -0,0 +1,69 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import os.path as osp
+from typing import Any, Dict, List, Union
+
+import numpy as np
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['DepeDetect']
+
+
+@MODELS.register_module(Tasks.object_detection_3d, module_name=Models.depe)
+class DepeDetect(TorchModel):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """DEPE is a simple and pure DETR-like 3D detector with transformers,
+        for more information please refer to:
+        https://www.modelscope.cn/models/damo/cv_object-detection-3d_depe/summary
+
+        initialize the 3d object detection model from the `model_dir` path.
+        Args:
+            model_dir (str): the model path.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        from mmcv.runner import load_checkpoint
+        import modelscope.models.cv.object_detection_3d.depe.mmdet3d_plugin
+        from modelscope.models.cv.object_detection_3d.depe.mmdet3d_plugin.models.detectors import Petr3D
+
+        # build model and load checkpoint
+        config_path = osp.join(model_dir, ModelFile.CONFIGURATION)
+        config = Config.from_file(config_path)
+        detector = Petr3D(**config.model.network_param)
+        model_file = kwargs.get('model_file', ModelFile.TORCH_MODEL_BIN_FILE)
+        ckpt_path = osp.join(model_dir, model_file)
+        logger.info(f'loading model from {ckpt_path}')
+        load_checkpoint(detector, ckpt_path, map_location='cpu')
+        detector.eval()
+        self.detector = detector
+        logger.info('load model done')
+
+    def forward(self, img: Union[torch.Tensor, List[torch.Tensor]],
+                img_metas: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """
+        Args:
+            img (`torch.Tensor`): batched image tensor or list of batched image tensor,
+                shape of each tensor is [B, N, C, H, W], N is 12 for 6 views from current
+                and history frame.
+            img_metas (` List[Dict[str, Any]`): image meta info.
+        Return:
+            result (`List[Dict[str, Any]]`): list of detection results.
+        """
+
+        if isinstance(img, torch.Tensor):
+            img = [img]
+            img_metas = [img_metas]
+
+        result = self.detector(
+            return_loss=False, rescale=True, img=img, img_metas=img_metas)
+        assert result is not None
+        return result
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/__init__.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/__init__.py
new file mode 100644
index 00000000..9406c975
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/__init__.py
@@ -0,0 +1,13 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin
+"""
+from .core.bbox.assigners import HungarianAssigner3D
+from .core.bbox.coders import NMSFreeCoder
+from .core.bbox.match_costs import BBox3DL1Cost
+from .datasets import CustomNuScenesDataset
+from .datasets.pipelines import NormalizeMultiviewImage, PadMultiViewImage
+from .models.backbones import VoVNet
+from .models.dense_heads import PETRv2DEDNHead
+from .models.detectors import Petr3D
+from .models.necks import CPFPN
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/__init__.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/__init__.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/assigners/__init__.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/assigners/__init__.py
new file mode 100644
index 00000000..50554be6
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/assigners/__init__.py
@@ -0,0 +1,7 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/core/bbox/assigners
+"""
+from .hungarian_assigner_3d import HungarianAssigner3D
+
+__all__ = ['HungarianAssigner3D']
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py
new file mode 100644
index 00000000..c47a4d6b
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py
@@ -0,0 +1,139 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/core/bbox/assigners
+"""
+import torch
+from mmdet.core.bbox.assigners import AssignResult, BaseAssigner
+from mmdet.core.bbox.builder import BBOX_ASSIGNERS
+from mmdet.core.bbox.match_costs import build_match_cost
+from mmdet.models.utils.transformer import inverse_sigmoid
+
+from modelscope.models.cv.object_detection_3d.depe.mmdet3d_plugin.core.bbox.util import \
+    normalize_bbox
+
+try:
+    from scipy.optimize import linear_sum_assignment
+except ImportError:
+    linear_sum_assignment = None
+
+
+@BBOX_ASSIGNERS.register_module()
+class HungarianAssigner3D(BaseAssigner):
+    """Computes one-to-one matching between predictions and ground truth.
+    This class computes an assignment between the targets and the predictions
+    based on the costs. The costs are weighted sum of three components:
+    classification cost, regression L1 cost and regression iou cost. The
+    targets don't include the no_object, so generally there are more
+    predictions than targets. After the one-to-one matching, the un-matched
+    are treated as backgrounds. Thus each query prediction will be assigned
+    with `0` or a positive integer indicating the ground truth index:
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+    Args:
+        cls_weight (int | float, optional): The scale factor for classification
+            cost. Default 1.0.
+        bbox_weight (int | float, optional): The scale factor for regression
+            L1 cost. Default 1.0.
+        iou_weight (int | float, optional): The scale factor for regression
+            iou cost. Default 1.0.
+        iou_calculator (dict | optional): The config for the iou calculation.
+            Default type `BboxOverlaps2D`.
+        iou_mode (str | optional): "iou" (intersection over union), "iof"
+                (intersection over foreground), or "giou" (generalized
+                intersection over union). Default "giou".
+    """
+
+    def __init__(self,
+                 cls_cost=dict(type='ClassificationCost', weight=1.),
+                 reg_cost=dict(type='BBoxL1Cost', weight=1.0),
+                 iou_cost=dict(type='IoUCost', weight=0.0),
+                 pc_range=None):
+        self.cls_cost = build_match_cost(cls_cost)
+        self.reg_cost = build_match_cost(reg_cost)
+        self.iou_cost = build_match_cost(iou_cost)
+        self.pc_range = pc_range
+
+    def assign(self,
+               bbox_pred,
+               cls_pred,
+               gt_bboxes,
+               gt_labels,
+               gt_bboxes_ignore=None,
+               eps=1e-7):
+        """Computes one-to-one matching based on the weighted costs.
+        This method assign each query prediction to a ground truth or
+        background. The `assigned_gt_inds` with -1 means don't care,
+        0 means negative sample, and positive number is the index (1-based)
+        of assigned gt.
+        The assignment is done in the following steps, the order matters.
+        1. assign every prediction to -1
+        2. compute the weighted costs
+        3. do Hungarian matching on CPU based on the costs
+        4. assign all to 0 (background) first, then for each matched pair
+           between predictions and gts, treat this prediction as foreground
+           and assign the corresponding gt index (plus 1) to it.
+        Args:
+            bbox_pred (Tensor): Predicted boxes with normalized coordinates
+                (cx, cy, w, h), which are all in range [0, 1]. Shape
+                [num_query, 4].
+            cls_pred (Tensor): Predicted classification logits, shape
+                [num_query, num_class].
+            gt_bboxes (Tensor): Ground truth boxes with unnormalized
+                coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
+            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
+                labelled as `ignored`. Default None.
+            eps (int | float, optional): A value added to the denominator for
+                numerical stability. Default 1e-7.
+        Returns:
+            :obj:`AssignResult`: The assigned result.
+        """
+        assert gt_bboxes_ignore is None, \
+            'Only case when gt_bboxes_ignore is None is supported.'
+        num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)
+
+        # 1. assign -1 by default
+        assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
+                                              -1,
+                                              dtype=torch.long)
+        assigned_labels = bbox_pred.new_full((num_bboxes, ),
+                                             -1,
+                                             dtype=torch.long)
+        if num_gts == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            if num_gts == 0:
+                # No ground truth, assign all to background
+                assigned_gt_inds[:] = 0
+            return AssignResult(
+                num_gts, assigned_gt_inds, None, labels=assigned_labels)
+
+        # 2. compute the weighted costs
+        # classification and bboxcost.
+        cls_cost = self.cls_cost(cls_pred, gt_labels)
+        # regression L1 cost
+        normalized_gt_bboxes = normalize_bbox(gt_bboxes, self.pc_range)
+        reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8])
+
+        # weighted sum of above two costs
+        cost = cls_cost + reg_cost
+
+        # 3. do Hungarian matching on CPU using linear_sum_assignment
+        cost = cost.detach().cpu()
+        if linear_sum_assignment is None:
+            raise ImportError('Please run "pip install scipy" '
+                              'to install scipy first.')
+        cost = torch.nan_to_num(cost, nan=100.0, posinf=100.0, neginf=-100.0)
+        matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
+        matched_row_inds = torch.from_numpy(matched_row_inds).to(
+            bbox_pred.device)
+        matched_col_inds = torch.from_numpy(matched_col_inds).to(
+            bbox_pred.device)
+
+        # 4. assign backgrounds and foregrounds
+        # assign all indices to backgrounds first
+        assigned_gt_inds[:] = 0
+        # assign foregrounds based on matching results
+        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
+        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
+        return AssignResult(
+            num_gts, assigned_gt_inds, None, labels=assigned_labels)
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/coders/__init__.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/coders/__init__.py
new file mode 100644
index 00000000..3fe7191f
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/coders/__init__.py
@@ -0,0 +1,9 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/core/bbox/coders
+"""
+from .nms_free_coder import NMSFreeCoder
+
+__all__ = [
+    'NMSFreeCoder',
+]
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py
new file mode 100644
index 00000000..c51e3945
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py
@@ -0,0 +1,118 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/core/bbox/coders
+"""
+import torch
+import torch.nn.functional as F
+from mmdet.core.bbox import BaseBBoxCoder
+from mmdet.core.bbox.builder import BBOX_CODERS
+
+from modelscope.models.cv.object_detection_3d.depe.mmdet3d_plugin.core.bbox.util import \
+    denormalize_bbox
+
+
+@BBOX_CODERS.register_module()
+class NMSFreeCoder(BaseBBoxCoder):
+    """Bbox coder for NMS-free detector.
+    Args:
+        pc_range (list[float]): Range of point cloud.
+        post_center_range (list[float]): Limit of the center.
+            Default: None.
+        max_num (int): Max number to be kept. Default: 100.
+        score_threshold (float): Threshold to filter boxes based on score.
+            Default: None.
+        code_size (int): Code size of bboxes. Default: 9
+    """
+
+    def __init__(self,
+                 pc_range,
+                 voxel_size=None,
+                 post_center_range=None,
+                 max_num=100,
+                 score_threshold=None,
+                 num_classes=10):
+
+        self.pc_range = pc_range
+        self.voxel_size = voxel_size
+        self.post_center_range = post_center_range
+        self.max_num = max_num
+        self.score_threshold = score_threshold
+        self.num_classes = num_classes
+
+    def encode(self):
+        pass
+
+    def decode_single(self, cls_scores, bbox_preds):
+        """Decode bboxes.
+        Args:
+            cls_scores (Tensor): Outputs from the classification head, \
+                shape [num_query, cls_out_channels]. Note \
+                cls_out_channels should includes background.
+            bbox_preds (Tensor): Outputs from the regression \
+                head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
+                Shape [num_query, 9].
+        Returns:
+            list[dict]: Decoded boxes.
+        """
+        max_num = self.max_num
+
+        cls_scores = cls_scores.sigmoid()
+        scores, indexs = cls_scores.view(-1).topk(max_num)
+        labels = indexs % self.num_classes
+        bbox_index = torch.div(indexs, self.num_classes, rounding_mode='trunc')
+        bbox_preds = bbox_preds[bbox_index]
+
+        final_box_preds = denormalize_bbox(bbox_preds, self.pc_range)
+        final_scores = scores
+        final_preds = labels
+
+        # use score threshold
+        if self.score_threshold is not None:
+            thresh_mask = final_scores > self.score_threshold
+        if self.post_center_range is not None:
+            self.post_center_range = torch.tensor(
+                self.post_center_range, device=scores.device)
+            tmp = final_box_preds[..., :3] >= self.post_center_range[:3]
+            mask = (tmp).all(1)
+            tmp = final_box_preds[..., :3] <= self.post_center_range[3:]
+            mask &= (tmp).all(1)
+
+            if self.score_threshold:
+                mask &= thresh_mask
+
+            boxes3d = final_box_preds[mask]
+            scores = final_scores[mask]
+            labels = final_preds[mask]
+            predictions_dict = {
+                'bboxes': boxes3d,
+                'scores': scores,
+                'labels': labels
+            }
+
+        else:
+            raise NotImplementedError(
+                'Need to reorganize output as a batch, only '
+                'support post_center_range is not None for now!')
+        return predictions_dict
+
+    def decode(self, preds_dicts):
+        """Decode bboxes.
+        Args:
+            all_cls_scores (Tensor): Outputs from the classification head, \
+                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
+                cls_out_channels should includes background.
+            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
+                head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
+                Shape [nb_dec, bs, num_query, 9].
+        Returns:
+            list[dict]: Decoded boxes.
+        """
+        all_cls_scores = preds_dicts['all_cls_scores'][-1]
+        all_bbox_preds = preds_dicts['all_bbox_preds'][-1]
+
+        batch_size = all_cls_scores.size()[0]
+        predictions_list = []
+        for i in range(batch_size):
+            predictions_list.append(
+                self.decode_single(all_cls_scores[i], all_bbox_preds[i]))
+        return predictions_list
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/match_costs/__init__.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/match_costs/__init__.py
new file mode 100644
index 00000000..d8630b21
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/match_costs/__init__.py
@@ -0,0 +1,7 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/core/bbox/match_costs
+"""
+from .match_cost import BBox3DL1Cost
+
+__all__ = ['BBox3DL1Cost']
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/match_costs/match_cost.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/match_costs/match_cost.py
new file mode 100644
index 00000000..c8faa270
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/match_costs/match_cost.py
@@ -0,0 +1,31 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/core/bbox/match_costs
+"""
+import torch
+from mmdet.core.bbox.match_costs.builder import MATCH_COST
+
+
+@MATCH_COST.register_module()
+class BBox3DL1Cost(object):
+    """BBox3DL1Cost.
+     Args:
+         weight (int | float, optional): loss_weight
+    """
+
+    def __init__(self, weight=1.):
+        self.weight = weight
+
+    def __call__(self, bbox_pred, gt_bboxes):
+        """
+        Args:
+            bbox_pred (Tensor): Predicted boxes with normalized coordinates
+                (cx, cy, w, h), which are all in range [0, 1]. Shape
+                [num_query, 4].
+            gt_bboxes (Tensor): Ground truth boxes with normalized
+                coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
+        Returns:
+            torch.Tensor: bbox_cost value with weight
+        """
+        bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1)
+        return bbox_cost * self.weight
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/util.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/util.py
new file mode 100644
index 00000000..d36a4517
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/core/bbox/util.py
@@ -0,0 +1,67 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/core/bbox
+"""
+import mmdet3d
+import numpy as np
+import torch
+
+
+def normalize_bbox(bboxes, pc_range):
+    cx = bboxes[..., 0:1]
+    cy = bboxes[..., 1:2]
+    cz = bboxes[..., 2:3]
+    w = bboxes[..., 3:4].log()
+    _l = bboxes[..., 4:5].log()
+    h = bboxes[..., 5:6].log()
+
+    rot = bboxes[..., 6:7]
+    if bboxes.size(-1) > 7:
+        vx = bboxes[..., 7:8]
+        vy = bboxes[..., 8:9]
+        normalized_bboxes = torch.cat(
+            (cx, cy, w, _l, cz, h, rot.sin(), rot.cos(), vx, vy), dim=-1)
+    else:
+        normalized_bboxes = torch.cat(
+            (cx, cy, w, _l, cz, h, rot.sin(), rot.cos()), dim=-1)
+    return normalized_bboxes
+
+
+def denormalize_bbox(normalized_bboxes, pc_range):
+    nan = normalized_bboxes.isnan().sum()
+    if nan > 0:
+        print(f'found {nan} nan!')
+        normalized_bboxes = torch.nan_to_num(normalized_bboxes, 0.1)
+    # rotation
+    rot_sine = normalized_bboxes[..., 6:7]
+    rot_cosine = normalized_bboxes[..., 7:8]
+    rot = torch.atan2(rot_sine, rot_cosine)
+
+    # center in the bev
+    cx = normalized_bboxes[..., 0:1]
+    cy = normalized_bboxes[..., 1:2]
+    cz = normalized_bboxes[..., 4:5]
+
+    # size
+    w = normalized_bboxes[..., 2:3]
+    _l = normalized_bboxes[..., 3:4]
+    h = normalized_bboxes[..., 5:6]
+
+    w = w.exp()
+    _l = _l.exp()
+    h = h.exp()
+    # check mmdet3d version
+    if int(mmdet3d.__version__.split('.')[0]) > 0:
+        tmp = w
+        w = _l
+        _l = tmp
+        rot = -rot - 0.5 * np.pi
+    if normalized_bboxes.size(-1) > 8:
+        # velocity
+        vx = normalized_bboxes[:, 8:9]
+        vy = normalized_bboxes[:, 9:10]
+        denormalized_bboxes = torch.cat([cx, cy, cz, w, _l, h, rot, vx, vy],
+                                        dim=-1)
+    else:
+        denormalized_bboxes = torch.cat([cx, cy, cz, w, _l, h, rot], dim=-1)
+    return denormalized_bboxes
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/datasets/__init__.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/datasets/__init__.py
new file mode 100644
index 00000000..222115dd
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/datasets/__init__.py
@@ -0,0 +1,9 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/datasets
+"""
+from .nuscenes_dataset import CustomNuScenesDataset
+
+__all__ = [
+    'CustomNuScenesDataset',
+]
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/datasets/nuscenes_dataset.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/datasets/nuscenes_dataset.py
new file mode 100644
index 00000000..60e72914
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/datasets/nuscenes_dataset.py
@@ -0,0 +1,77 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/datasets
+"""
+import numpy as np
+from mmdet3d.datasets import NuScenesDataset
+from mmdet.datasets import DATASETS
+
+
+@DATASETS.register_module()
+class CustomNuScenesDataset(NuScenesDataset):
+    r"""NuScenes Dataset.
+    This datset only add camera intrinsics and extrinsics to the results.
+    """
+
+    def __init__(self, idx_range=None, **kwargs):
+        self.idx_range = idx_range
+        super().__init__(**kwargs)
+        if idx_range is not None:
+            assert isinstance(idx_range, (tuple, list))
+            assert len(idx_range) == 2
+            assert idx_range[0] < idx_range[1]
+            assert idx_range[1] <= len(
+                self.data_infos
+            ), f'the idx_range {idx_range} exceeds total number of dataset:{len(self.data_infos)}'
+            self.data_infos = self.data_infos[idx_range[0]:idx_range[1]]
+
+    def get_data_info(self, index):
+        info = self.data_infos[index]
+        # standard protocal modified from SECOND.Pytorch
+        input_dict = dict(
+            sample_idx=info['token'],
+            pts_filename=[info['lidar_path']],
+            sweeps=info['sweeps'],
+            timestamp=info['timestamp'] / 1e6,
+        )
+
+        if self.modality['use_camera']:
+            image_paths = []
+            lidar2img_rts = []
+            intrinsics = []
+            extrinsics = []
+            extrinsics_sweep = []
+            img_timestamp = []
+            for cam_type, cam_info in info['cams'].items():
+                img_timestamp.append(cam_info['timestamp'] / 1e6)
+                image_paths.append(cam_info['data_path'])
+                # obtain lidar to image transformation matrix
+                lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
+                lidar2cam_t = cam_info[
+                    'sensor2lidar_translation'] @ lidar2cam_r.T
+                lidar2cam_rt = np.eye(4)
+                lidar2cam_rt[:3, :3] = lidar2cam_r.T
+                lidar2cam_rt[3, :3] = -lidar2cam_t
+                intrinsic = cam_info['cam_intrinsic']
+                viewpad = np.eye(4)
+                viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
+                lidar2img_rt = (viewpad @ lidar2cam_rt.T)
+                intrinsics.append(viewpad)
+                extrinsics.append(lidar2cam_rt)
+                lidar2img_rts.append(lidar2img_rt)
+                extrinsics_sweep.append(None)  # placeholder for sweeps
+
+            input_dict.update(
+                dict(
+                    img_timestamp=img_timestamp,
+                    img_filename=image_paths,
+                    lidar2img=lidar2img_rts,
+                    intrinsics=intrinsics,
+                    extrinsics=extrinsics,
+                    extrinsics_sweep=extrinsics_sweep,
+                ))
+
+        if not self.test_mode:
+            annos = self.get_ann_info(index)
+            input_dict['ann_info'] = annos
+        return input_dict
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/datasets/pipelines/__init__.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/datasets/pipelines/__init__.py
new file mode 100644
index 00000000..33c3736b
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/datasets/pipelines/__init__.py
@@ -0,0 +1,12 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/datasets/pipelines
+"""
+from .loading import LoadMultiViewImageFromMultiSweepsFiles
+from .transform_3d import (NormalizeMultiviewImage, PadMultiViewImage,
+                           ResizeCropFlipImage)
+
+__all__ = [
+    'PadMultiViewImage', 'NormalizeMultiviewImage',
+    'LoadMultiViewImageFromMultiSweepsFiles', 'ResizeCropFlipImage'
+]
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/datasets/pipelines/loading.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/datasets/pipelines/loading.py
new file mode 100644
index 00000000..6a40eb5c
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/datasets/pipelines/loading.py
@@ -0,0 +1,177 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/datasets/pipelines
+"""
+import mmcv
+import numpy as np
+from mmdet.datasets.builder import PIPELINES
+
+
+@PIPELINES.register_module()
+class LoadMultiViewImageFromMultiSweepsFiles(object):
+    """Load multi channel images from a list of separate channel files.
+    Expects results['img_filename'] to be a list of filenames.
+    Args:
+        to_float32 (bool): Whether to convert the img to float32.
+            Defaults to False.
+        color_type (str): Color type of the file. Defaults to 'unchanged'.
+    """
+
+    def __init__(
+        self,
+        sweeps_num=5,
+        to_float32=False,
+        file_client_args=dict(backend='disk'),
+        pad_empty_sweeps=False,
+        sweep_range=[3, 27],
+        sweeps_id=None,
+        color_type='unchanged',
+        sensors=[
+            'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK',
+            'CAM_BACK_LEFT', 'CAM_BACK_RIGHT'
+        ],
+        test_mode=True,
+        prob=1.0,
+    ):
+
+        self.sweeps_num = sweeps_num
+        self.to_float32 = to_float32
+        self.color_type = color_type
+        self.file_client_args = file_client_args.copy()
+        self.file_client = None
+        self.pad_empty_sweeps = pad_empty_sweeps
+        self.sensors = sensors
+        self.test_mode = test_mode
+        self.sweeps_id = sweeps_id
+        self.sweep_range = sweep_range
+        self.prob = prob
+        if self.sweeps_id:
+            assert len(self.sweeps_id) == self.sweeps_num
+
+    def __call__(self, results):
+        """Call function to load multi-view image from files.
+        Args:
+            results (dict): Result dict containing multi-view image filenames.
+        Returns:
+            dict: The result dict containing the multi-view image data. \
+                Added keys and values are described below.
+                - filename (str): Multi-view image filenames.
+                - img (np.ndarray): Multi-view image arrays.
+                - img_shape (tuple[int]): Shape of multi-view image arrays.
+                - ori_shape (tuple[int]): Shape of original image arrays.
+                - pad_shape (tuple[int]): Shape of padded image arrays.
+                - scale_factor (float): Scale factor.
+                - img_norm_cfg (dict): Normalization configuration of images.
+        """
+        sweep_imgs_list = []
+        timestamp_imgs_list = []
+        imgs = results['img']
+        img_timestamp = results['img_timestamp']
+        lidar_timestamp = results['timestamp']
+        img_timestamp = [
+            lidar_timestamp - timestamp for timestamp in img_timestamp
+        ]
+        sweep_imgs_list.extend(imgs)
+        timestamp_imgs_list.extend(img_timestamp)
+        nums = len(imgs)
+        if self.pad_empty_sweeps and len(results['sweeps']) == 0:
+            for i in range(self.sweeps_num):
+                sweep_imgs_list.extend(imgs)
+                results['pts_filename'] += [results['pts_filename'][0]]
+                mean_time = (self.sweep_range[0]
+                             + self.sweep_range[1]) / 2.0 * 0.083
+                timestamp_imgs_list.extend(
+                    [time + mean_time for time in img_timestamp])
+                for j in range(nums):
+                    results['filename'].append(results['filename'][j])
+                    results['lidar2img'].append(
+                        np.copy(results['lidar2img'][j]))
+                    results['intrinsics'].append(
+                        np.copy(results['intrinsics'][j]))
+                    results['extrinsics'].append(
+                        np.copy(results['extrinsics'][j]))
+        else:
+            if self.sweeps_id:
+                choices = self.sweeps_id
+            elif len(results['sweeps']) <= self.sweeps_num:
+                bin_size = len(results['sweeps']) / (self.sweeps_num + 1)
+                choices = [
+                    int(np.floor((i + 1) * bin_size))
+                    for i in range(self.sweeps_num)
+                ]
+            elif self.test_mode:
+                if self.sweep_range[1] <= len(results['sweeps']):
+                    sweep_range = list(
+                        range(self.sweep_range[0], self.sweep_range[1]))
+                elif self.sweep_range[0] >= len(results['sweeps']):
+                    sweep_range = list(range(0, len(results['sweeps'])))
+                else:
+                    sweep_range = list(
+                        range(self.sweep_range[0], len(results['sweeps'])))
+                    if len(sweep_range) <= self.sweeps_num:
+                        sweep_range = list(range(0, len(results['sweeps'])))
+                bin_size = len(sweep_range) / (self.sweeps_num + 1)
+                choices = [
+                    sweep_range[0] + int(np.floor((i + 1) * bin_size))
+                    for i in range(self.sweeps_num)
+                ]
+            else:
+                if np.random.random() < self.prob:
+                    if self.sweep_range[1] <= len(results['sweeps']):
+                        sweep_range = list(
+                            range(self.sweep_range[0], self.sweep_range[1]))
+                    elif self.sweep_range[0] >= len(results['sweeps']):
+                        sweep_range = list(range(0, len(results['sweeps'])))
+                    else:
+                        sweep_range = list(
+                            range(self.sweep_range[0], len(results['sweeps'])))
+                        if len(sweep_range) <= self.sweeps_num:
+                            sweep_range = list(
+                                range(0, len(results['sweeps'])))
+                    choices = np.random.choice(
+                        sweep_range, self.sweeps_num, replace=False)
+                else:
+                    bin_size = len(results['sweeps']) / (self.sweeps_num + 1)
+                    choices = [
+                        int(np.floor((i + 1) * bin_size))
+                        for i in range(self.sweeps_num)
+                    ]
+            choices = sorted(choices)
+            for idx in choices:
+                sweep_idx = min(idx, len(results['sweeps']) - 1)
+                sweep = results['sweeps'][sweep_idx]
+                if 'lidar_path' in sweep:
+                    results['pts_filename'] += [sweep['lidar_path']]
+                if len(sweep.keys()) < len(self.sensors):
+                    sweep = results['sweeps'][sweep_idx - 1]
+                results['filename'].extend(
+                    [sweep[sensor]['data_path'] for sensor in self.sensors])
+                tmp = [
+                    mmcv.imread(sweep[sensor]['data_path'], self.color_type)
+                    for sensor in self.sensors
+                ]
+                img = np.stack(tmp, axis=-1)
+                if self.to_float32:
+                    img = img.astype(np.float32)
+                img = [img[..., i] for i in range(img.shape[-1])]
+                sweep_imgs_list.extend(img)
+                sweep_ts = [
+                    lidar_timestamp - sweep[sensor]['timestamp'] / 1e6
+                    for sensor in self.sensors
+                ]
+                timestamp_imgs_list.extend(sweep_ts)
+                for sensor in self.sensors:
+                    results['lidar2img'].append(sweep[sensor]['lidar2img'])
+                    results['intrinsics'].append(sweep[sensor]['intrinsics'])
+                    results['extrinsics'].append(sweep[sensor]['extrinsics'])
+        results['img'] = sweep_imgs_list
+        results['timestamp'] = timestamp_imgs_list
+
+        return results
+
+    def __repr__(self):
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(to_float32={self.to_float32}, '
+        repr_str += f"color_type='{self.color_type}')"
+        return repr_str
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/datasets/pipelines/transform_3d.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/datasets/pipelines/transform_3d.py
new file mode 100644
index 00000000..23db18ce
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/datasets/pipelines/transform_3d.py
@@ -0,0 +1,234 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/datasets/pipelines
+"""
+import copy
+
+import mmcv
+import mmdet3d
+import numpy as np
+import torch
+from mmdet.datasets.builder import PIPELINES
+from numpy import random
+from PIL import Image
+
+
+@PIPELINES.register_module()
+class PadMultiViewImage(object):
+    """Pad the multi-view image.
+    There are two padding modes: (1) pad to a fixed size and (2) pad to the
+    minimum size that is divisible by some number.
+    Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor",
+    Args:
+        size (tuple, optional): Fixed padding size.
+        size_divisor (int, optional): The divisor of padded size.
+        pad_val (float, optional): Padding value, 0 by default.
+    """
+
+    def __init__(self, size=None, size_divisor=None, pad_val=0):
+        self.size = size
+        self.size_divisor = size_divisor
+        self.pad_val = pad_val
+        # only one of size and size_divisor should be valid
+        assert size is not None or size_divisor is not None
+        assert size is None or size_divisor is None
+
+    def _pad_img(self, results):
+        """Pad images according to ``self.size``."""
+        if self.size is not None:
+            padded_img = [
+                mmcv.impad(img, shape=self.size, pad_val=self.pad_val)
+                for img in results['img']
+            ]
+        elif self.size_divisor is not None:
+            padded_img = [
+                mmcv.impad_to_multiple(
+                    img, self.size_divisor, pad_val=self.pad_val)
+                for img in results['img']
+            ]
+        results['img_shape'] = [img.shape for img in results['img']]
+        results['img'] = padded_img
+        results['pad_shape'] = [img.shape for img in padded_img]
+        results['pad_fixed_size'] = self.size
+        results['pad_size_divisor'] = self.size_divisor
+
+    def __call__(self, results):
+        """Call function to pad images, masks, semantic segmentation maps.
+        Args:
+            results (dict): Result dict from loading pipeline.
+        Returns:
+            dict: Updated result dict.
+        """
+        self._pad_img(results)
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(size={self.size}, '
+        repr_str += f'size_divisor={self.size_divisor}, '
+        repr_str += f'pad_val={self.pad_val})'
+        return repr_str
+
+
+@PIPELINES.register_module()
+class NormalizeMultiviewImage(object):
+    """Normalize the image.
+    Added key is "img_norm_cfg".
+    Args:
+        mean (sequence): Mean values of 3 channels.
+        std (sequence): Std values of 3 channels.
+        to_rgb (bool): Whether to convert the image from BGR to RGB,
+            default is true.
+    """
+
+    def __init__(self, mean, std, to_rgb=True):
+        self.mean = np.array(mean, dtype=np.float32)
+        self.std = np.array(std, dtype=np.float32)
+        self.to_rgb = to_rgb
+
+    def __call__(self, results):
+        """Call function to normalize images.
+        Args:
+            results (dict): Result dict from loading pipeline.
+        Returns:
+            dict: Normalized results, 'img_norm_cfg' key is added into
+                result dict.
+        """
+        results['img'] = [
+            mmcv.imnormalize(img, self.mean, self.std, self.to_rgb)
+            for img in results['img']
+        ]
+        results['img_norm_cfg'] = dict(
+            mean=self.mean, std=self.std, to_rgb=self.to_rgb)
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})'
+        return repr_str
+
+
+@PIPELINES.register_module()
+class ResizeCropFlipImage(object):
+    """Random resize, Crop and flip the image
+    Args:
+        size (tuple, optional): Fixed padding size.
+    """
+
+    def __init__(self, data_aug_conf=None, training=True, diff_aug=False):
+        self.data_aug_conf = data_aug_conf
+        self.training = training
+        self.diff_aug = diff_aug  # whether use different aug in all views
+
+    def __call__(self, results):
+        """Call function to pad images, masks, semantic segmentation maps.
+        Args:
+            results (dict): Result dict from loading pipeline.
+        Returns:
+            dict: Updated result dict.
+        """
+
+        imgs = results['img']
+        N = len(imgs)
+        new_imgs = []
+        results['intrin_ori'] = []
+        results['extrin_ori'] = []
+        results['ida_mat'] = []
+        results['bda_mat'] = [np.eye(4)] * len(
+            results['lidar2img'])  # no bda when test
+        if not self.diff_aug:
+            resize, resize_dims, crop, flip, rotate = self._sample_augmentation(
+            )
+        for i in range(N):
+            img = Image.fromarray(np.uint8(imgs[i]))
+            # augmentation (resize, crop, horizontal flip, rotate)
+            if self.diff_aug:
+                resize, resize_dims, crop, flip, rotate = self._sample_augmentation(
+                )  # different view use different aug (BEV Det)
+            img, ida_mat = self._img_transform(
+                img,
+                resize=resize,
+                resize_dims=resize_dims,
+                crop=crop,
+                flip=flip,
+                rotate=rotate,
+            )
+            new_imgs.append(np.array(img).astype(np.float32))
+            # --save intrin&extrin for depthnet input_mlp--
+            results['intrin_ori'] += [results['intrinsics'][i][:3, :3].copy()]
+            # results['extrin_ori'] += [results['extrinsics'][i % 6].T.copy()]  # all sweeps use keyframe's lidar2cam
+            results['extrin_ori'] += [results['extrinsics'][i].T.copy()]
+            results['ida_mat'] += [ida_mat.cpu().numpy()]
+            # --save intrin&extrin for depthnet input_mlp--
+            results['intrinsics'][
+                i][:3, :3] = ida_mat @ results['intrinsics'][i][:3, :3]
+        results['img'] = new_imgs
+        results['lidar2img'] = [
+            results['intrinsics'][i] @ results['extrinsics'][i].T
+            for i in range(len(results['extrinsics']))
+        ]
+
+        return results
+
+    def _get_rot(self, h):
+
+        return torch.Tensor([
+            [np.cos(h), np.sin(h)],
+            [-np.sin(h), np.cos(h)],
+        ])
+
+    def _img_transform(self, img, resize, resize_dims, crop, flip, rotate):
+        ida_rot = torch.eye(2)
+        ida_tran = torch.zeros(2)
+        # adjust image
+        img = img.resize(resize_dims)
+        img = img.crop(crop)
+        if flip:
+            img = img.transpose(method=Image.FLIP_LEFT_RIGHT)
+        img = img.rotate(rotate)
+
+        # post-homography transformation
+        ida_rot *= resize
+        ida_tran -= torch.Tensor(crop[:2])
+        if flip:
+            A = torch.Tensor([[-1, 0], [0, 1]])
+            b = torch.Tensor([crop[2] - crop[0], 0])
+            ida_rot = A.matmul(ida_rot)
+            ida_tran = A.matmul(ida_tran) + b
+        A = self._get_rot(rotate / 180 * np.pi)
+        b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2
+        b = A.matmul(-b) + b
+        ida_rot = A.matmul(ida_rot)
+        ida_tran = A.matmul(ida_tran) + b
+        ida_mat = torch.eye(3)
+        ida_mat[:2, :2] = ida_rot
+        ida_mat[:2, 2] = ida_tran
+        return img, ida_mat
+
+    def _sample_augmentation(self):
+        H, W = self.data_aug_conf['H'], self.data_aug_conf['W']
+        fH, fW = self.data_aug_conf['final_dim']
+        if self.training:
+            resize = np.random.uniform(*self.data_aug_conf['resize_lim'])
+            resize_dims = (int(W * resize), int(H * resize))
+            newW, newH = resize_dims
+            crop_h = int(
+                (1 - np.random.uniform(*self.data_aug_conf['bot_pct_lim']))
+                * newH) - fH
+            crop_w = int(np.random.uniform(0, max(0, newW - fW)))
+            crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
+            flip = False
+            if self.data_aug_conf['rand_flip'] and np.random.choice([0, 1]):
+                flip = True
+            rotate = np.random.uniform(*self.data_aug_conf['rot_lim'])
+        else:
+            resize = max(fH / H, fW / W)
+            resize_dims = (int(W * resize), int(H * resize))
+            newW, newH = resize_dims
+            crop_h = int(
+                (1 - np.mean(self.data_aug_conf['bot_pct_lim'])) * newH) - fH
+            crop_w = int(max(0, newW - fW) / 2)
+            crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
+            flip = False
+            rotate = 0
+        return resize, resize_dims, crop, flip, rotate
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/__init__.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/backbones/__init__.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/backbones/__init__.py
new file mode 100644
index 00000000..5d10ef76
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/backbones/__init__.py
@@ -0,0 +1,7 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/models/backbones
+"""
+from .vovnet import VoVNet
+
+__all__ = ['VoVNet']
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/backbones/vovnet.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/backbones/vovnet.py
new file mode 100644
index 00000000..44cdd2a1
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/backbones/vovnet.py
@@ -0,0 +1,441 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/models/backbones
+"""
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.runner import BaseModule
+from mmdet.models.builder import BACKBONES
+from torch.nn.modules.batchnorm import _BatchNorm
+
+VoVNet19_slim_dw_eSE = {
+    'stem': [64, 64, 64],
+    'stage_conv_ch': [64, 80, 96, 112],
+    'stage_out_ch': [112, 256, 384, 512],
+    'layer_per_block': 3,
+    'block_per_stage': [1, 1, 1, 1],
+    'eSE': True,
+    'dw': True
+}
+
+VoVNet19_dw_eSE = {
+    'stem': [64, 64, 64],
+    'stage_conv_ch': [128, 160, 192, 224],
+    'stage_out_ch': [256, 512, 768, 1024],
+    'layer_per_block': 3,
+    'block_per_stage': [1, 1, 1, 1],
+    'eSE': True,
+    'dw': True
+}
+
+VoVNet19_slim_eSE = {
+    'stem': [64, 64, 128],
+    'stage_conv_ch': [64, 80, 96, 112],
+    'stage_out_ch': [112, 256, 384, 512],
+    'layer_per_block': 3,
+    'block_per_stage': [1, 1, 1, 1],
+    'eSE': True,
+    'dw': False
+}
+
+VoVNet19_eSE = {
+    'stem': [64, 64, 128],
+    'stage_conv_ch': [128, 160, 192, 224],
+    'stage_out_ch': [256, 512, 768, 1024],
+    'layer_per_block': 3,
+    'block_per_stage': [1, 1, 1, 1],
+    'eSE': True,
+    'dw': False
+}
+
+VoVNet39_eSE = {
+    'stem': [64, 64, 128],
+    'stage_conv_ch': [128, 160, 192, 224],
+    'stage_out_ch': [256, 512, 768, 1024],
+    'layer_per_block': 5,
+    'block_per_stage': [1, 1, 2, 2],
+    'eSE': True,
+    'dw': False
+}
+
+VoVNet57_eSE = {
+    'stem': [64, 64, 128],
+    'stage_conv_ch': [128, 160, 192, 224],
+    'stage_out_ch': [256, 512, 768, 1024],
+    'layer_per_block': 5,
+    'block_per_stage': [1, 1, 4, 3],
+    'eSE': True,
+    'dw': False
+}
+
+VoVNet99_eSE = {
+    'stem': [64, 64, 128],
+    'stage_conv_ch': [128, 160, 192, 224],
+    'stage_out_ch': [256, 512, 768, 1024],
+    'layer_per_block': 5,
+    'block_per_stage': [1, 3, 9, 3],
+    'eSE': True,
+    'dw': False
+}
+
+_STAGE_SPECS = {
+    'V-19-slim-dw-eSE': VoVNet19_slim_dw_eSE,
+    'V-19-dw-eSE': VoVNet19_dw_eSE,
+    'V-19-slim-eSE': VoVNet19_slim_eSE,
+    'V-19-eSE': VoVNet19_eSE,
+    'V-39-eSE': VoVNet39_eSE,
+    'V-57-eSE': VoVNet57_eSE,
+    'V-99-eSE': VoVNet99_eSE,
+}
+
+
+def dw_conv3x3(in_channels,
+               out_channels,
+               module_name,
+               postfix,
+               stride=1,
+               kernel_size=3,
+               padding=1):
+    """3x3 convolution with padding"""
+    return [
+        ('{}_{}/dw_conv3x3'.format(module_name, postfix),
+         nn.Conv2d(
+             in_channels,
+             out_channels,
+             kernel_size=kernel_size,
+             stride=stride,
+             padding=padding,
+             groups=out_channels,
+             bias=False)),
+        ('{}_{}/pw_conv1x1'.format(module_name, postfix),
+         nn.Conv2d(
+             in_channels,
+             out_channels,
+             kernel_size=1,
+             stride=1,
+             padding=0,
+             groups=1,
+             bias=False)),
+        ('{}_{}/pw_norm'.format(module_name,
+                                postfix), nn.BatchNorm2d(out_channels)),
+        ('{}_{}/pw_relu'.format(module_name, postfix), nn.ReLU(inplace=True)),
+    ]
+
+
+def conv3x3(in_channels,
+            out_channels,
+            module_name,
+            postfix,
+            stride=1,
+            groups=1,
+            kernel_size=3,
+            padding=1):
+    """3x3 convolution with padding"""
+    return [
+        (
+            f'{module_name}_{postfix}/conv',
+            nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                groups=groups,
+                bias=False,
+            ),
+        ),
+        (f'{module_name}_{postfix}/norm', nn.BatchNorm2d(out_channels)),
+        (f'{module_name}_{postfix}/relu', nn.ReLU(inplace=True)),
+    ]
+
+
+def conv1x1(in_channels,
+            out_channels,
+            module_name,
+            postfix,
+            stride=1,
+            groups=1,
+            kernel_size=1,
+            padding=0):
+    """1x1 convolution with padding"""
+    return [
+        (
+            f'{module_name}_{postfix}/conv',
+            nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                groups=groups,
+                bias=False,
+            ),
+        ),
+        (f'{module_name}_{postfix}/norm', nn.BatchNorm2d(out_channels)),
+        (f'{module_name}_{postfix}/relu', nn.ReLU(inplace=True)),
+    ]
+
+
+class Hsigmoid(nn.Module):
+
+    def __init__(self, inplace=True):
+        super(Hsigmoid, self).__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        return F.relu6(x + 3.0, inplace=self.inplace) / 6.0
+
+
+class eSEModule(nn.Module):
+
+    def __init__(self, channel, reduction=4):
+        super(eSEModule, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0)
+        self.hsigmoid = Hsigmoid()
+
+    def forward(self, x):
+        input = x
+        x = self.avg_pool(x)
+        x = self.fc(x)
+        x = self.hsigmoid(x)
+        return input * x
+
+
+class _OSA_module(nn.Module):
+
+    def __init__(self,
+                 in_ch,
+                 stage_ch,
+                 concat_ch,
+                 layer_per_block,
+                 module_name,
+                 SE=False,
+                 identity=False,
+                 depthwise=False):
+
+        super(_OSA_module, self).__init__()
+
+        self.identity = identity
+        self.depthwise = depthwise
+        self.isReduced = False
+        self.layers = nn.ModuleList()
+        in_channel = in_ch
+        if self.depthwise and in_channel != stage_ch:
+            self.isReduced = True
+            self.conv_reduction = nn.Sequential(
+                OrderedDict(
+                    conv1x1(in_channel, stage_ch,
+                            '{}_reduction'.format(module_name), '0')))
+        for i in range(layer_per_block):
+            if self.depthwise:
+                self.layers.append(
+                    nn.Sequential(
+                        OrderedDict(
+                            dw_conv3x3(stage_ch, stage_ch, module_name, i))))
+            else:
+                self.layers.append(
+                    nn.Sequential(
+                        OrderedDict(
+                            conv3x3(in_channel, stage_ch, module_name, i))))
+            in_channel = stage_ch
+
+        # feature aggregation
+        in_channel = in_ch + layer_per_block * stage_ch
+        self.concat = nn.Sequential(
+            OrderedDict(conv1x1(in_channel, concat_ch, module_name, 'concat')))
+
+        self.ese = eSEModule(concat_ch)
+
+    def forward(self, x):
+
+        identity_feat = x
+
+        output = []
+        output.append(x)
+        if self.depthwise and self.isReduced:
+            x = self.conv_reduction(x)
+        for layer in self.layers:
+            x = layer(x)
+            output.append(x)
+
+        x = torch.cat(output, dim=1)
+        xt = self.concat(x)
+
+        xt = self.ese(xt)
+
+        if self.identity:
+            xt = xt + identity_feat
+
+        return xt
+
+
+class _OSA_stage(nn.Sequential):
+
+    def __init__(self,
+                 in_ch,
+                 stage_ch,
+                 concat_ch,
+                 block_per_stage,
+                 layer_per_block,
+                 stage_num,
+                 SE=False,
+                 depthwise=False):
+
+        super(_OSA_stage, self).__init__()
+
+        if not stage_num == 2:
+            self.add_module(
+                'Pooling',
+                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True))
+
+        if block_per_stage != 1:
+            SE = False
+        module_name = f'OSA{stage_num}_1'
+        self.add_module(
+            module_name,
+            _OSA_module(
+                in_ch,
+                stage_ch,
+                concat_ch,
+                layer_per_block,
+                module_name,
+                SE,
+                depthwise=depthwise))
+        for i in range(block_per_stage - 1):
+            if i != block_per_stage - 2:  # last block
+                SE = False
+            module_name = f'OSA{stage_num}_{i + 2}'
+            self.add_module(
+                module_name,
+                _OSA_module(
+                    concat_ch,
+                    stage_ch,
+                    concat_ch,
+                    layer_per_block,
+                    module_name,
+                    SE,
+                    identity=True,
+                    depthwise=depthwise),
+            )
+
+
+@BACKBONES.register_module()
+class VoVNet(BaseModule):
+
+    def __init__(self,
+                 spec_name,
+                 input_ch=3,
+                 out_features=None,
+                 frozen_stages=-1,
+                 norm_eval=True,
+                 pretrained=None,
+                 init_cfg=None):
+        """
+        Args:
+            input_ch(int) : the number of input channel
+            out_features (list[str]): name of the layers whose outputs should
+                be returned in forward. Can be anything in "stem", "stage2" ...
+        """
+        super(VoVNet, self).__init__(init_cfg)
+        self.frozen_stages = frozen_stages
+        self.norm_eval = norm_eval
+
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        stage_specs = _STAGE_SPECS[spec_name]
+
+        stem_ch = stage_specs['stem']
+        config_stage_ch = stage_specs['stage_conv_ch']
+        config_concat_ch = stage_specs['stage_out_ch']
+        block_per_stage = stage_specs['block_per_stage']
+        layer_per_block = stage_specs['layer_per_block']
+        SE = stage_specs['eSE']
+        depthwise = stage_specs['dw']
+
+        self._out_features = out_features
+
+        # Stem module
+        conv_type = dw_conv3x3 if depthwise else conv3x3
+        stem = conv3x3(input_ch, stem_ch[0], 'stem', '1', 2)
+        stem += conv_type(stem_ch[0], stem_ch[1], 'stem', '2', 1)
+        stem += conv_type(stem_ch[1], stem_ch[2], 'stem', '3', 2)
+        self.add_module('stem', nn.Sequential((OrderedDict(stem))))
+        current_stirde = 4
+        self._out_feature_strides = {
+            'stem': current_stirde,
+            'stage2': current_stirde
+        }
+        self._out_feature_channels = {'stem': stem_ch[2]}
+
+        stem_out_ch = [stem_ch[2]]
+        in_ch_list = stem_out_ch + config_concat_ch[:-1]
+        # OSA stages
+        self.stage_names = []
+        for i in range(4):  # num_stages
+            name = 'stage%d' % (i + 2)  # stage 2 ... stage 5
+            self.stage_names.append(name)
+            self.add_module(
+                name,
+                _OSA_stage(
+                    in_ch_list[i],
+                    config_stage_ch[i],
+                    config_concat_ch[i],
+                    block_per_stage[i],
+                    layer_per_block,
+                    i + 2,
+                    SE,
+                    depthwise,
+                ),
+            )
+
+            self._out_feature_channels[name] = config_concat_ch[i]
+            if not i == 0:
+                self._out_feature_strides[name] = current_stirde = int(
+                    current_stirde * 2)
+
+    def _initialize_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight)
+
+    def forward(self, x):
+        outputs = {}
+        x = self.stem(x)
+        if 'stem' in self._out_features:
+            outputs['stem'] = x
+        for name in self.stage_names:
+            x = getattr(self, name)(x)
+            if name in self._out_features:
+                outputs[name] = x
+
+        return outputs
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            m = getattr(self, 'stem')
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+        for i in range(1, self.frozen_stages + 1):
+            m = getattr(self, f'stage{i+1}')
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keep normalization layer
+        freezed."""
+        super(VoVNet, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                # trick: eval have effect on BatchNorm only
+                if isinstance(m, _BatchNorm):
+                    m.eval()
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/dense_heads/__init__.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/dense_heads/__init__.py
new file mode 100644
index 00000000..3ab9b185
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/dense_heads/__init__.py
@@ -0,0 +1,7 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/models/dense_heads
+"""
+from .petrv2_dednhead import PETRv2DEDNHead
+
+__all__ = ['PETRv2DEDNHead']
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/dense_heads/depth_net.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/dense_heads/depth_net.py
new file mode 100644
index 00000000..1b39d33b
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/dense_heads/depth_net.py
@@ -0,0 +1,234 @@
+"""
+The implementation here is modified based on BEVDepth, originally MIT license and publicly avaialbe at
+https://github.com/Megvii-BaseDetection/BEVDepth/blob/main/bevdepth/layers/backbones/base_lss_fpn.py
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import build_conv_layer
+from mmdet.models.backbones.resnet import BasicBlock
+
+
+class _ASPPModule(nn.Module):
+
+    def __init__(self, inplanes, planes, kernel_size, padding, dilation,
+                 BatchNorm):
+        super(_ASPPModule, self).__init__()
+        self.atrous_conv = nn.Conv2d(
+            inplanes,
+            planes,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=padding,
+            dilation=dilation,
+            bias=False)
+        self.bn = BatchNorm(planes)
+        self.relu = nn.ReLU()
+
+        self._init_weight()
+
+    def forward(self, x):
+        x = self.atrous_conv(x)
+        x = self.bn(x)
+
+        return self.relu(x)
+
+    def _init_weight(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                torch.nn.init.kaiming_normal_(m.weight)
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+
+class ASPP(nn.Module):
+
+    def __init__(self, inplanes, mid_channels=256, BatchNorm=nn.BatchNorm2d):
+        super(ASPP, self).__init__()
+
+        dilations = [1, 6, 12, 18]
+
+        self.aspp1 = _ASPPModule(
+            inplanes,
+            mid_channels,
+            1,
+            padding=0,
+            dilation=dilations[0],
+            BatchNorm=BatchNorm)
+        self.aspp2 = _ASPPModule(
+            inplanes,
+            mid_channels,
+            3,
+            padding=dilations[1],
+            dilation=dilations[1],
+            BatchNorm=BatchNorm)
+        self.aspp3 = _ASPPModule(
+            inplanes,
+            mid_channels,
+            3,
+            padding=dilations[2],
+            dilation=dilations[2],
+            BatchNorm=BatchNorm)
+        self.aspp4 = _ASPPModule(
+            inplanes,
+            mid_channels,
+            3,
+            padding=dilations[3],
+            dilation=dilations[3],
+            BatchNorm=BatchNorm)
+
+        self.global_avg_pool = nn.Sequential(
+            nn.AdaptiveAvgPool2d((1, 1)),
+            nn.Conv2d(inplanes, mid_channels, 1, stride=1, bias=False),
+            BatchNorm(mid_channels),
+            nn.ReLU(),
+        )
+        self.conv1 = nn.Conv2d(
+            int(mid_channels * 5), mid_channels, 1, bias=False)
+        self.bn1 = BatchNorm(mid_channels)
+        self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(0.5)
+        self._init_weight()
+
+    def forward(self, x):
+        x1 = self.aspp1(x)
+        x2 = self.aspp2(x)
+        x3 = self.aspp3(x)
+        x4 = self.aspp4(x)
+        x5 = self.global_avg_pool(x)
+        x5 = F.interpolate(
+            x5, size=x4.size()[2:], mode='bilinear', align_corners=True)
+        x = torch.cat((x1, x2, x3, x4, x5), dim=1)
+
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+
+        return self.dropout(x)
+
+    def _init_weight(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                torch.nn.init.kaiming_normal_(m.weight)
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+
+class Mlp(nn.Module):
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.ReLU,
+                 drop=0.0):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.drop1 = nn.Dropout(drop)
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop2 = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop1(x)
+        x = self.fc2(x)
+        x = self.drop2(x)
+        return x
+
+
+class SELayer(nn.Module):
+
+    def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid):
+        super().__init__()
+        self.conv_reduce = nn.Conv2d(channels, channels, 1, bias=True)
+        self.act1 = act_layer()
+        self.conv_expand = nn.Conv2d(channels, channels, 1, bias=True)
+        self.gate = gate_layer()
+
+    def forward(self, x, x_se):
+        x_se = self.conv_reduce(x_se)
+        x_se = self.act1(x_se)
+        x_se = self.conv_expand(x_se)
+        return x * self.gate(x_se)
+
+
+class DepthNet(nn.Module):
+
+    def __init__(self,
+                 in_channels,
+                 mid_channels,
+                 context_channels,
+                 depth_channels,
+                 use_dcn=True,
+                 use_aspp=True,
+                 use_mlp=False):
+        super(DepthNet, self).__init__()
+        self.use_mlp = use_mlp
+        if use_mlp:
+            self.reduce_conv = nn.Sequential(
+                nn.Conv2d(
+                    in_channels,
+                    mid_channels,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1),
+                nn.BatchNorm2d(mid_channels),
+                nn.ReLU(inplace=True),
+            )
+            self.bn = nn.BatchNorm1d(12)
+            self.depth_mlp = Mlp(12, mid_channels, mid_channels)
+            self.depth_se = SELayer(mid_channels)  # NOTE: add camera-aware
+            self.context_mlp = Mlp(12, mid_channels, mid_channels)
+            self.context_se = SELayer(mid_channels)  # NOTE: add camera-aware
+
+        self.context_conv = nn.Conv2d(
+            mid_channels, context_channels, kernel_size=1, stride=1, padding=0)
+        depth_conv_list = [
+            BasicBlock(mid_channels, mid_channels),
+            BasicBlock(mid_channels, mid_channels),
+            BasicBlock(mid_channels, mid_channels),
+        ]
+        if use_aspp:
+            depth_conv_list.append(ASPP(mid_channels, mid_channels))
+        if use_dcn:
+            depth_conv_list.append(
+                build_conv_layer(
+                    cfg=dict(
+                        type='DCN',
+                        in_channels=mid_channels,
+                        out_channels=mid_channels,
+                        kernel_size=3,
+                        padding=1,
+                        groups=4,
+                        im2col_step=128,
+                    )))
+        depth_conv_list.append(
+            nn.Conv2d(
+                mid_channels,
+                depth_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0))
+        self.depth_conv = nn.Sequential(*depth_conv_list)
+
+    def forward(self, x, mlp_input):
+        if self.use_mlp:
+            mlp_input = self.bn(mlp_input.reshape(-1, mlp_input.shape[-1]))
+            x = self.reduce_conv(x)
+            context_se = self.context_mlp(mlp_input)[..., None, None]
+            context = self.context_se(x, context_se)
+            context = self.context_conv(context)
+            depth_se = self.depth_mlp(mlp_input)[..., None, None]
+            depth = self.depth_se(x, depth_se)
+            depth = self.depth_conv(depth)
+        else:
+            context = self.context_conv(x)
+            depth = self.depth_conv(x)
+
+        return torch.cat([depth, context], dim=1)
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/dense_heads/petrv2_dednhead.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/dense_heads/petrv2_dednhead.py
new file mode 100644
index 00000000..52aaba2c
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/dense_heads/petrv2_dednhead.py
@@ -0,0 +1,1307 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/models/dense_heads
+"""
+import copy
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import Conv2d, Linear, bias_init_with_prob
+from mmcv.cnn.bricks.transformer import FFN, build_positional_encoding
+from mmcv.runner import force_fp32
+from mmdet3d.core.bbox.coders import build_bbox_coder
+from mmdet.core import (bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh,
+                        build_assigner, build_sampler, multi_apply,
+                        reduce_mean)
+from mmdet.models import HEADS, build_loss
+from mmdet.models.dense_heads.anchor_free_head import AnchorFreeHead
+from mmdet.models.utils import NormedLinear, build_transformer
+from mmdet.models.utils.transformer import inverse_sigmoid
+from torch.cuda.amp.autocast_mode import autocast
+
+from modelscope.models.cv.object_detection_3d.depe.mmdet3d_plugin.core.bbox.util import \
+    normalize_bbox
+from .depth_net import DepthNet
+
+
+def pos2posemb3d(pos, num_pos_feats=128, temperature=10000):
+    scale = 2 * math.pi
+    pos = pos * scale
+    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device)
+    dim_t = temperature**(2 * torch.div(dim_t, 2, rounding_mode='floor')
+                          / num_pos_feats)
+    pos_x = pos[..., 0, None] / dim_t
+    pos_y = pos[..., 1, None] / dim_t
+    pos_z = pos[..., 2, None] / dim_t
+    pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()),
+                        dim=-1).flatten(-2)
+    pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()),
+                        dim=-1).flatten(-2)
+    pos_z = torch.stack((pos_z[..., 0::2].sin(), pos_z[..., 1::2].cos()),
+                        dim=-1).flatten(-2)
+    posemb = torch.cat((pos_y, pos_x, pos_z), dim=-1)
+    return posemb
+
+
+class SELayer(nn.Module):
+
+    def __init__(self,
+                 se_channels,
+                 x_channels,
+                 act_layer=nn.ReLU,
+                 gate_layer=nn.Sigmoid):
+        super().__init__()
+        self.conv_reduce = nn.Conv2d(se_channels, x_channels, 1, bias=True)
+        self.act1 = act_layer()
+        self.conv_expand = nn.Conv2d(x_channels, x_channels, 1, bias=True)
+        self.gate = gate_layer()
+
+    def forward(self, x, x_se):
+        x_se = self.conv_reduce(x_se)
+        x_se = self.act1(x_se)
+        x_se = self.conv_expand(x_se)
+        return x * self.gate(x_se)
+
+
+class RegLayer(nn.Module):
+
+    def __init__(
+            self,
+            embed_dims=256,
+            shared_reg_fcs=2,
+            group_reg_dims=(2, 2, 2, 2, 2),  # xy, wl, zh, rot, velo
+            act_layer=nn.ReLU,
+            drop=0.0):
+        super().__init__()
+
+        reg_branch = []
+        for _ in range(shared_reg_fcs):
+            reg_branch.append(Linear(embed_dims, embed_dims))
+            reg_branch.append(act_layer())
+            reg_branch.append(nn.Dropout(drop))
+        self.reg_branch = nn.Sequential(*reg_branch)
+
+        self.task_heads = nn.ModuleList()
+        for reg_dim in group_reg_dims:
+            task_head = nn.Sequential(
+                Linear(embed_dims, embed_dims), act_layer(),
+                Linear(embed_dims, reg_dim))
+            self.task_heads.append(task_head)
+
+    def forward(self, x):
+        reg_feat = self.reg_branch(x)
+        outs = []
+        for task_head in self.task_heads:
+            out = task_head(reg_feat.clone())
+            outs.append(out)
+        outs = torch.cat(outs, -1)
+        return outs
+
+
+@HEADS.register_module()
+class PETRv2DEDNHead(AnchorFreeHead):
+    """Implements the DETR transformer head.
+    See `paper: End-to-End Object Detection with Transformers
+    <https://arxiv.org/pdf/2005.12872>`_ for details.
+    Args:
+        num_classes (int): Number of categories excluding the background.
+        in_channels (int): Number of channels in the input feature map.
+        num_query (int): Number of query in Transformer.
+        num_reg_fcs (int, optional): Number of fully-connected layers used in
+            `FFN`, which is then used for the regression head. Default 2.
+        transformer (obj:`mmcv.ConfigDict`|dict): Config for transformer.
+            Default: None.
+        sync_cls_avg_factor (bool): Whether to sync the avg_factor of
+            all ranks. Default to False.
+        positional_encoding (obj:`mmcv.ConfigDict`|dict):
+            Config for position encoding.
+        loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the
+            classification loss. Default `CrossEntropyLoss`.
+        loss_bbox (obj:`mmcv.ConfigDict`|dict): Config of the
+            regression loss. Default `L1Loss`.
+        loss_iou (obj:`mmcv.ConfigDict`|dict): Config of the
+            regression iou loss. Default `GIoULoss`.
+        tran_cfg (obj:`mmcv.ConfigDict`|dict): Training config of
+            transformer head.
+        test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of
+            transformer head.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(
+            self,
+            num_classes,
+            in_channels,
+            num_query=100,
+            num_reg_fcs=2,
+            transformer=None,
+            sync_cls_avg_factor=False,
+            positional_encoding=dict(
+                type='SinePositionalEncoding', num_feats=128, normalize=True),
+            code_weights=None,
+            bbox_coder=None,
+            loss_cls=dict(
+                type='CrossEntropyLoss',
+                bg_cls_weight=0.1,
+                use_sigmoid=False,
+                loss_weight=1.0,
+                class_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+            loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+            train_cfg=dict(
+                assigner=dict(
+                    type='HungarianAssigner',
+                    cls_cost=dict(type='ClassificationCost', weight=1.),
+                    reg_cost=dict(type='BBoxL1Cost', weight=5.0),
+                    iou_cost=dict(type='IoUCost', iou_mode='giou',
+                                  weight=2.0))),
+            test_cfg=dict(max_per_img=100),
+            with_position=True,
+            with_multiview=False,
+            depth_step=0.8,
+            depth_num=64,
+            LID=False,
+            depth_start=1,
+            position_level=0,
+            depth_level=0,
+            position_range=[-65, -65, -8.0, 65, 65, 8.0],
+            group_reg_dims=(2, 2, 2, 2, 2),  # xy, wl, zh, rot, velo
+            scalar=5,
+            noise_scale=0.4,
+            noise_trans=0.0,
+            dn_weight=1.0,
+            split=0.5,
+            init_cfg=None,
+            normedlinear=False,
+            with_fpe=False,
+            with_time=False,
+            with_multi=False,
+            **kwargs):
+        # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
+        # since it brings inconvenience when the initialization of
+        # `AnchorFreeHead` is called.
+        if 'code_size' in kwargs:
+            self.code_size = kwargs['code_size']
+        else:
+            self.code_size = 10
+        if code_weights is not None:
+            self.code_weights = code_weights
+        else:
+            self.code_weights = [
+                1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2
+            ]
+        self.code_weights = self.code_weights[:self.code_size]
+        self.bg_cls_weight = 0
+        self.sync_cls_avg_factor = sync_cls_avg_factor
+        class_weight = loss_cls.get('class_weight', None)
+        if class_weight is not None and (self.__class__ is PETRv2DEDNHead):
+            assert isinstance(class_weight, float), 'Expected ' \
+                'class_weight to have type float. Found ' \
+                f'{type(class_weight)}.'
+            # NOTE following the official DETR rep0, bg_cls_weight means
+            # relative classification weight of the no-object class.
+            bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight)
+            assert isinstance(bg_cls_weight, float), 'Expected ' \
+                'bg_cls_weight to have type float. Found ' \
+                f'{type(bg_cls_weight)}.'
+            class_weight = torch.ones(num_classes + 1) * class_weight
+            # set background class as the last indice
+            class_weight[num_classes] = bg_cls_weight
+            loss_cls.update({'class_weight': class_weight})
+            if 'bg_cls_weight' in loss_cls:
+                loss_cls.pop('bg_cls_weight')
+            self.bg_cls_weight = bg_cls_weight
+
+        if train_cfg:
+            assert 'assigner' in train_cfg, 'assigner should be provided '\
+                'when train_cfg is set.'
+            assigner = train_cfg['assigner']
+            self.assigner = build_assigner(assigner)
+            # DETR sampling=False, so use PseudoSampler
+            sampler_cfg = dict(type='PseudoSampler')
+            self.sampler = build_sampler(sampler_cfg, context=self)
+
+        self.num_query = num_query
+        self.num_classes = num_classes
+        self.in_channels = in_channels
+        self.num_reg_fcs = num_reg_fcs
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.fp16_enabled = False
+        self.embed_dims = 256
+        self.depth_step = depth_step
+        self.depth_num = depth_num
+        self.position_range = position_range
+        self.LID = LID
+        self.with_depthnet = kwargs.get('with_depthnet', False)
+        self.with_fde = kwargs.get('with_fde', False)
+        self.depth_pe = kwargs.get('depth_pe', False)
+        self.sweep_lidar = kwargs.get('sweep_lidar', False)
+        self.depth_LID = kwargs.get('depth_LID', False)
+        self.depth_LID_num = kwargs.get('depth_LID_num', 64)
+        self.depth_thresh = kwargs.get('depth_thresh', 0.0)
+        if not self.with_depthnet:
+            self.with_fde = False
+            self.depth_pe = False
+            self.sweep_lidar = False
+        self.depth_start = depth_start
+        self.de_intv = 0.5
+        if self.with_depthnet:  # get num of depth bin for depthnet
+            if self.depth_LID:
+                self.D = self.depth_LID_num
+            else:
+                coords_d = np.arange(self.depth_start, self.position_range[3],
+                                     self.de_intv)
+                self.D = len(coords_d)
+        self.position_dim = 3 * self.depth_num
+        self.position_level = position_level
+        self.depth_level = depth_level
+        self.with_position = with_position
+        self.with_multiview = with_multiview
+        self.scalar = scalar
+        self.bbox_noise_scale = noise_scale
+        self.bbox_noise_trans = noise_trans
+        self.dn_weight = dn_weight
+        self.split = split
+        assert 'num_feats' in positional_encoding
+        num_feats = positional_encoding['num_feats']
+        assert num_feats * 2 == self.embed_dims, 'embed_dims should' \
+            f' be exactly 2 times of num_feats. Found {self.embed_dims}' \
+            f' and {num_feats}.'
+        self.act_cfg = transformer.get('act_cfg',
+                                       dict(type='ReLU', inplace=True))
+        self.num_pred = 6
+        self.normedlinear = normedlinear
+        self.with_fpe = with_fpe
+        self.with_time = with_time
+        self.with_multi = with_multi
+        self.group_reg_dims = group_reg_dims
+        super(PETRv2DEDNHead, self).__init__(
+            num_classes, in_channels, init_cfg=init_cfg)
+
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_bbox = build_loss(loss_bbox)
+        self.loss_iou = build_loss(loss_iou)
+
+        if self.loss_cls.use_sigmoid:
+            self.cls_out_channels = num_classes
+        else:
+            self.cls_out_channels = num_classes + 1
+        self.positional_encoding = build_positional_encoding(
+            positional_encoding)
+        self.transformer = build_transformer(transformer)
+        self.code_weights = nn.Parameter(
+            torch.tensor(self.code_weights, requires_grad=False),
+            requires_grad=False)
+        self.bbox_coder = build_bbox_coder(bbox_coder)
+        self.pc_range = self.bbox_coder.pc_range
+        if 'depthnet_cfg' in kwargs:
+            depthnet_cfg = kwargs['depthnet_cfg']
+        else:
+            depthnet_cfg = {}
+        self.depthnet_cfg = depthnet_cfg
+        self.loss_depth_weight = kwargs.get('loss_depth_weight', 3.0)
+        if self.with_depthnet:
+            self.depth_net = DepthNet(self.in_channels, self.in_channels,
+                                      self.in_channels, self.D, **depthnet_cfg)
+        self._init_layers()
+
+    def _init_layers(self):
+        """Initialize layers of the transformer head."""
+        if not self.with_depthnet:
+            self.input_proj = Conv2d(
+                self.in_channels, self.embed_dims, kernel_size=1)
+
+        cls_branch = []
+        for _ in range(self.num_reg_fcs):
+            cls_branch.append(Linear(self.embed_dims, self.embed_dims))
+            cls_branch.append(nn.LayerNorm(self.embed_dims))
+            cls_branch.append(nn.ReLU(inplace=True))
+        if self.normedlinear:
+            cls_branch.append(
+                NormedLinear(self.embed_dims, self.cls_out_channels))
+        else:
+            cls_branch.append(Linear(self.embed_dims, self.cls_out_channels))
+        fc_cls = nn.Sequential(*cls_branch)
+
+        if self.with_multi:
+            reg_branch = RegLayer(self.embed_dims, self.num_reg_fcs,
+                                  self.group_reg_dims)
+        else:
+            reg_branch = []
+            for _ in range(self.num_reg_fcs):
+                reg_branch.append(Linear(self.embed_dims, self.embed_dims))
+                reg_branch.append(nn.ReLU())
+            reg_branch.append(Linear(self.embed_dims, self.code_size))
+            reg_branch = nn.Sequential(*reg_branch)
+
+        self.cls_branches = nn.ModuleList(
+            [copy.deepcopy(fc_cls) for _ in range(self.num_pred)])
+        self.reg_branches = nn.ModuleList(
+            [copy.deepcopy(reg_branch) for _ in range(self.num_pred)])
+
+        if self.with_multiview:
+            self.adapt_pos3d = nn.Sequential(
+                nn.Conv2d(
+                    self.embed_dims * 3 // 2,
+                    self.embed_dims * 4,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0),
+                nn.ReLU(),
+                nn.Conv2d(
+                    self.embed_dims * 4,
+                    self.embed_dims,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0),
+            )
+        else:
+            self.adapt_pos3d = nn.Sequential(
+                nn.Conv2d(
+                    self.embed_dims,
+                    self.embed_dims,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0),
+                nn.ReLU(),
+                nn.Conv2d(
+                    self.embed_dims,
+                    self.embed_dims,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0),
+            )
+
+        if self.with_position:
+            chan_in = self.position_dim
+            chan_mid = self.embed_dims * 4
+            self.position_encoder = nn.Sequential(
+                nn.Conv2d(
+                    chan_in, chan_mid, kernel_size=1, stride=1, padding=0),
+                nn.ReLU(),
+                nn.Conv2d(
+                    chan_mid,
+                    self.embed_dims,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0),
+            )
+
+        if self.depth_pe:
+            self.depth_encoder = nn.Conv2d(
+                self.D, self.embed_dims, kernel_size=1, stride=1, padding=0)
+
+        self.reference_points = nn.Embedding(self.num_query, 3)
+        self.query_embedding = nn.Sequential(
+            nn.Linear(self.embed_dims * 3 // 2, self.embed_dims),
+            nn.ReLU(),
+            nn.Linear(self.embed_dims, self.embed_dims),
+        )
+        if self.with_fpe:
+            self.fpe = SELayer(self.embed_dims, self.embed_dims)
+        if self.with_fde:
+            self.fde = SELayer(self.embed_dims, self.D)
+
+    def init_weights(self):
+        """Initialize weights of the transformer head."""
+        # The initialization for transformer is important
+        self.transformer.init_weights()
+        nn.init.uniform_(self.reference_points.weight.data, 0, 1)
+        if self.loss_cls.use_sigmoid:
+            bias_init = bias_init_with_prob(0.01)
+            for m in self.cls_branches:
+                nn.init.constant_(m[-1].bias, bias_init)
+
+    def get_mlp_input(self, img_metas):
+        num_view = 6
+        num_img = len(img_metas[0]['lidar2img'])
+        lidar2img = torch.tensor(
+            [img_meta['lidar2img'][:num_view] for img_meta in img_metas])
+        lidar2img = lidar2img.repeat(1, num_img // num_view, 1, 1)
+        lidar2img = lidar2img[..., :3, :]
+        B, N, _, _, = lidar2img.shape
+        mlp_input = lidar2img.reshape(B, N, -1)
+        return mlp_input
+
+    def position_embeding(self, img_feats, img_metas, masks=None):
+        eps = 1e-5
+        pad_h, pad_w, _ = img_metas[0]['pad_shape'][0]
+        B, N, C, H, W = img_feats[self.position_level].shape
+        coords_h = torch.arange(
+            H, device=img_feats[0].device).float() * pad_h / H
+        coords_w = torch.arange(
+            W, device=img_feats[0].device).float() * pad_w / W
+        # move to center
+        coords_h += pad_h / H / 2.0
+        coords_w += pad_w / W / 2.0
+
+        if self.LID:
+            index = torch.arange(
+                start=0,
+                end=self.depth_num,
+                step=1,
+                device=img_feats[0].device).float()
+            index_1 = index + 1
+            bin_size = (self.position_range[3] - self.depth_start) / (
+                self.depth_num * (1 + self.depth_num))
+            coords_d = self.depth_start + bin_size * index * index_1
+        else:
+            index = torch.arange(
+                start=0,
+                end=self.depth_num,
+                step=1,
+                device=img_feats[0].device).float()
+            bin_size = (self.position_range[3]
+                        - self.depth_start) / self.depth_num
+            coords_d = self.depth_start + bin_size * index
+        self.coords_d = coords_d
+
+        D = coords_d.shape[0]
+        coords = torch.stack(torch.meshgrid([coords_w, coords_h, coords_d
+                                             ])).permute(1, 2, 3,
+                                                         0)  # W, H, D, 3
+        coords = torch.cat((coords, torch.ones_like(coords[..., :1])), -1)
+        coords[..., :2] = coords[..., :2] * torch.maximum(
+            coords[..., 2:3],
+            torch.ones_like(coords[..., 2:3]) * eps)
+
+        img2lidars = []
+        for img_meta in img_metas:
+            img2lidar = []
+            for i in range(len(img_meta['lidar2img'])):
+                img2lidar.append(np.linalg.inv(img_meta['lidar2img'][i]))
+            img2lidars.append(np.asarray(img2lidar))
+        img2lidars = np.asarray(img2lidars)
+        img2lidars = coords.new_tensor(img2lidars)  # (B, N, 4, 4)
+
+        coords = coords.view(1, 1, W, H, D, 4, 1).repeat(B, N, 1, 1, 1, 1, 1)
+        img2lidars = img2lidars.view(B, N, 1, 1, 1, 4,
+                                     4).repeat(1, 1, W, H, D, 1, 1)
+        coords3d = torch.matmul(img2lidars, coords).squeeze(-1)[..., :3]
+        coords3d[..., 0:1] = (coords3d[..., 0:1] - self.position_range[0]) / (
+            self.position_range[3] - self.position_range[0])
+        coords3d[..., 1:2] = (coords3d[..., 1:2] - self.position_range[1]) / (
+            self.position_range[4] - self.position_range[1])
+        coords3d[..., 2:3] = (coords3d[..., 2:3] - self.position_range[2]) / (
+            self.position_range[5] - self.position_range[2])
+        coords_mask = (coords3d > 1.0) | (coords3d < 0.0)
+        coords_mask = coords_mask.flatten(-2).sum(-1) > (D * 0.5)
+        coords_mask = masks | coords_mask.permute(0, 1, 3, 2)
+        coords3d = coords3d.permute(0, 1, 4, 5, 3,
+                                    2).contiguous().view(B * N, -1, H, W)
+        coords3d = inverse_sigmoid(coords3d)
+        coords_position_embeding = self.position_encoder(coords3d)
+        return coords_position_embeding.view(B, N, self.embed_dims, H,
+                                             W), coords_mask
+
+    def prepare_for_dn(self, batch_size, reference_points, img_metas):
+        if self.training:
+            targets = [
+                torch.cat((img_meta['gt_bboxes_3d']._data.gravity_center,
+                           img_meta['gt_bboxes_3d']._data.tensor[:, 3:]),
+                          dim=1) for img_meta in img_metas
+            ]
+            labels = [img_meta['gt_labels_3d']._data for img_meta in img_metas]
+            known = [(torch.ones_like(t)).cuda() for t in labels]
+            know_idx = known
+            unmask_bbox = unmask_label = torch.cat(known)
+            known_num = [t.size(0) for t in targets]
+            labels = torch.cat([t for t in labels])
+            boxes = torch.cat([t for t in targets])
+            batch_idx = torch.cat(
+                [torch.full((t.size(0), ), i) for i, t in enumerate(targets)])
+
+            known_indice = torch.nonzero(unmask_label + unmask_bbox)
+            known_indice = known_indice.view(-1)
+            # add noise
+            known_indice = known_indice.repeat(self.scalar, 1).view(-1)
+            known_labels = labels.repeat(self.scalar, 1).view(-1).long().to(
+                reference_points.device)
+            known_bid = batch_idx.repeat(self.scalar, 1).view(-1)
+            known_bboxs = boxes.repeat(self.scalar,
+                                       1).to(reference_points.device)
+            known_bbox_center = known_bboxs[:, :3].clone()
+            known_bbox_scale = known_bboxs[:, 3:6].clone()
+
+            if self.bbox_noise_scale > 0:
+                diff = known_bbox_scale / 2 + self.bbox_noise_trans
+                rand_prob = torch.rand_like(
+                    known_bbox_center) * 2 - 1.0  # (-1, 1)
+                known_bbox_center += torch.mul(rand_prob,
+                                               diff) * self.bbox_noise_scale
+                known_bbox_center[
+                    ...,
+                    0:1] = (known_bbox_center[..., 0:1] - self.pc_range[0]) / (
+                        self.pc_range[3] - self.pc_range[0])
+                known_bbox_center[
+                    ...,
+                    1:2] = (known_bbox_center[..., 1:2] - self.pc_range[1]) / (
+                        self.pc_range[4] - self.pc_range[1])
+                known_bbox_center[
+                    ...,
+                    2:3] = (known_bbox_center[..., 2:3] - self.pc_range[2]) / (
+                        self.pc_range[5] - self.pc_range[2])
+                known_bbox_center = known_bbox_center.clamp(min=0.0, max=1.0)
+                mask = torch.norm(rand_prob, 2, 1) > self.split
+                known_labels[mask] = self.num_classes
+
+            single_pad = int(max(known_num))
+            pad_size = int(single_pad * self.scalar)
+            padding_bbox = torch.zeros(pad_size, 3).to(reference_points.device)
+            padded_reference_points = torch.cat(
+                [padding_bbox, reference_points],
+                dim=0).unsqueeze(0).repeat(batch_size, 1, 1)
+
+            if len(known_num):
+                map_known_indice = torch.cat([
+                    torch.tensor(range(num)) for num in known_num
+                ])  # [1,2, 1,2,3]
+                map_known_indice = torch.cat([
+                    map_known_indice + single_pad * i
+                    for i in range(self.scalar)
+                ]).long()
+            if len(known_bid):
+                padded_reference_points[(
+                    known_bid.long(),
+                    map_known_indice)] = known_bbox_center.to(
+                        reference_points.device)
+
+            tgt_size = pad_size + self.num_query
+            attn_mask = torch.ones(tgt_size, tgt_size).to(
+                reference_points.device) < 0
+            # match query cannot see the reconstruct
+            attn_mask[pad_size:, :pad_size] = True
+            # reconstruct cannot see each other
+            for i in range(self.scalar):
+                if i == 0:
+                    attn_mask[single_pad * i:single_pad * (i + 1),
+                              single_pad * (i + 1):pad_size] = True
+                if i == self.scalar - 1:
+                    attn_mask[single_pad * i:single_pad * (i + 1), :single_pad
+                              * i] = True
+                else:
+                    attn_mask[single_pad * i:single_pad * (i + 1),
+                              single_pad * (i + 1):pad_size] = True
+                    attn_mask[single_pad * i:single_pad * (i + 1), :single_pad
+                              * i] = True
+
+            mask_dict = {
+                'known_indice': torch.as_tensor(known_indice).long(),
+                'batch_idx': torch.as_tensor(batch_idx).long(),
+                'map_known_indice': torch.as_tensor(map_known_indice).long(),
+                'known_lbs_bboxes': (known_labels, known_bboxs),
+                'know_idx': know_idx,
+                'pad_size': pad_size
+            }
+        else:
+            padded_reference_points = reference_points.unsqueeze(0).repeat(
+                batch_size, 1, 1)
+            attn_mask = None
+            mask_dict = None
+
+        return padded_reference_points, attn_mask, mask_dict
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        """load checkpoints."""
+        # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
+        # since `AnchorFreeHead._load_from_state_dict` should not be
+        # called here. Invoking the default `Module._load_from_state_dict`
+        # is enough.
+
+        # Names of some parameters in has been changed.
+        version = local_metadata.get('version', None)
+        if (version is None
+                or version < 2) and self.__class__ is PETRv2DEDNHead:
+            convert_dict = {
+                '.self_attn.': '.attentions.0.',
+                '.multihead_attn.': '.attentions.1.',
+                '.decoder.norm.': '.decoder.post_norm.'
+            }
+            state_dict_keys = list(state_dict.keys())
+            for k in state_dict_keys:
+                for ori_key, convert_key in convert_dict.items():
+                    if ori_key in k:
+                        convert_key = k.replace(ori_key, convert_key)
+                        state_dict[convert_key] = state_dict[k]
+                        del state_dict[k]
+
+        super(AnchorFreeHead,
+              self)._load_from_state_dict(state_dict, prefix, local_metadata,
+                                          strict, missing_keys,
+                                          unexpected_keys, error_msgs)
+
+    def forward(self, mlvl_feats, img_metas):
+        """Forward function.
+        Args:
+            mlvl_feats (tuple[Tensor]): Features from the upstream
+                network, each is a 5D-tensor with shape
+                (B, N, C, H, W).
+        Returns:
+            all_cls_scores (Tensor): Outputs from the classification head, \
+                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
+                cls_out_channels should includes background.
+            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
+                head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \
+                Shape [nb_dec, bs, num_query, 9].
+        """
+
+        x = mlvl_feats[self.position_level]
+        batch_size, num_cams = x.size(0), x.size(1)
+
+        input_img_h, input_img_w, _ = img_metas[0]['pad_shape'][0]
+        masks = x.new_ones((batch_size, num_cams, input_img_h, input_img_w))
+        for img_id in range(batch_size):
+            for cam_id in range(num_cams):
+                img_h, img_w, _ = img_metas[img_id]['img_shape'][cam_id]
+                masks[img_id, cam_id, :img_h, :img_w] = 0
+
+        if not self.with_depthnet:
+            feat = self.input_proj(x.flatten(0, 1))
+            feat = feat.view(batch_size, num_cams, *feat.shape[-3:])
+            depth = None
+            depth_all = None
+            depth_enc = None
+        else:  # with depth net
+            feat = x
+            use_mlp = self.depthnet_cfg['use_mlp']
+            if use_mlp:
+                mlp_input_all = self.get_mlp_input(img_metas).to(
+                    x.device).float()
+            x_de = mlvl_feats[self.depth_level]
+            self.depth_downsample = int(input_img_w / x_de.shape[-1])
+            num_views = 6
+            depth_list = []
+            feat_list = []
+            keyFrame = True
+            for frame in range(num_cams // num_views):
+                input_x = x_de[:, frame * num_views:(frame + 1) * num_views,
+                               ...]
+                mlp_input = mlp_input_all[:, frame * num_views:(frame + 1)
+                                          * num_views,
+                                          ...] if use_mlp else None
+                if keyFrame:
+                    output_x = self.depth_net(input_x.flatten(0, 1), mlp_input)
+                    output_x = output_x.view(batch_size, num_views,
+                                             *output_x.shape[-3:])
+                    depth_digit = output_x[:, :, :self.D, ...]
+                    depth = depth_digit.softmax(dim=2)
+                    feat = output_x[:, :, self.D:self.D + self.in_channels,
+                                    ...]
+                else:
+                    with torch.no_grad():
+                        output_x = self.depth_net(
+                            input_x.flatten(0, 1), mlp_input)
+                        output_x = output_x.view(batch_size, num_views,
+                                                 *output_x.shape[-3:])
+                        depth_digit = output_x[:, :, :self.D, ...]
+                        depth = depth_digit.softmax(dim=2)
+                        feat = output_x[:, :, self.D:self.D + self.in_channels,
+                                        ...]
+                if not self.sweep_lidar:
+                    keyFrame = False  # if not use sweep lidar, not take sweep as keyframe and no grad.
+                depth_list += [depth]
+                feat_list += [feat]
+            depth = torch.cat(depth_list, dim=1)
+            feat = torch.cat(feat_list, dim=1)
+            depth_all = depth
+            if self.with_fde:
+                depth_all = self.fde(depth_all,
+                                     feat.flatten(0, 1)).view(depth_all.size())
+            # filter depth with low prob
+            if self.depth_thresh > 0.0:
+                d_mask = depth_all.max(dim=2, keepdim=True).values
+                d_mask = torch.where(d_mask > self.depth_thresh,
+                                     torch.ones_like(d_mask),
+                                     torch.zeros_like(d_mask))
+                d_mask = d_mask.repeat(1, 1, depth_all.shape[2], 1, 1)
+                depth_all = torch.mul(depth_all, d_mask)
+            # resize depth map
+            de_d, de_h, de_w = depth_all.shape[-3:]
+            ft_h, ft_w = feat.shape[-2:]
+            if de_h != ft_h or de_w != ft_w:  # make depth and feat same size
+                depth_all = F.interpolate(depth_all, size=(de_d, ft_h, ft_w))
+            if self.depth_pe:
+                depth_enc = self.depth_encoder(depth_all.flatten(0, 1))
+
+        # interpolate masks to have the same spatial shape with x
+        masks = F.interpolate(masks, size=x.shape[-2:]).to(torch.bool)
+
+        if self.with_position:
+            coords_position_embeding, _ = self.position_embeding(
+                mlvl_feats, img_metas, masks)
+            if self.with_fpe:
+                coords_position_embeding = self.fpe(
+                    coords_position_embeding.flatten(0, 1),
+                    feat.flatten(0, 1)).view(feat.size())
+
+            pos_embed = coords_position_embeding
+            if self.depth_pe:
+                pos_embed += depth_enc.view(feat.size())
+
+            if self.with_multiview:
+                sin_embed = self.positional_encoding(masks)
+                sin_embed = self.adapt_pos3d(sin_embed.flatten(0, 1)).view(
+                    feat.size())
+                pos_embed = pos_embed + sin_embed
+            else:
+                pos_embeds = []
+                for i in range(num_cams):
+                    xy_embed = self.positional_encoding(masks[:, i, :, :])
+                    pos_embeds.append(xy_embed.unsqueeze(1))
+                sin_embed = torch.cat(pos_embeds, 1)
+                sin_embed = self.adapt_pos3d(sin_embed.flatten(0, 1)).view(
+                    feat.size())
+                pos_embed = pos_embed + sin_embed
+        else:
+            if self.with_multiview:
+                pos_embed = self.positional_encoding(masks)
+                pos_embed = self.adapt_pos3d(pos_embed.flatten(0, 1)).view(
+                    feat.size())
+            else:
+                pos_embeds = []
+                for i in range(num_cams):
+                    pos_embed = self.positional_encoding(masks[:, i, :, :])
+                    pos_embeds.append(pos_embed.unsqueeze(1))
+                pos_embed = torch.cat(pos_embeds, 1)
+
+        reference_points = self.reference_points.weight
+        reference_points, attn_mask, mask_dict = self.prepare_for_dn(
+            batch_size, reference_points, img_metas)
+        query_embeds = self.query_embedding(pos2posemb3d(reference_points))
+        outs_dec, _ = self.transformer(feat, masks, query_embeds, pos_embed,
+                                       attn_mask, self.reg_branches)
+
+        if self.with_time:
+            time_stamps = []
+            for img_meta in img_metas:
+                time_stamps.append(np.asarray(img_meta['timestamp'][:12]))
+            time_stamp = x.new_tensor(np.array(time_stamps))
+            time_stamp = time_stamp.view(batch_size, -1, 6)
+            mean_time_stamp = (time_stamp[:, 1, :]
+                               - time_stamp[:, 0, :]).mean(-1)
+
+        outputs_classes = []
+        outputs_coords = []
+        for lvl in range(outs_dec.shape[0]):
+            reference = inverse_sigmoid(reference_points.clone())
+            assert reference.shape[-1] == 3
+            outputs_class = self.cls_branches[lvl](outs_dec[lvl])
+            tmp = self.reg_branches[lvl](outs_dec[lvl])
+
+            tmp[..., 0:2] += reference[..., 0:2]
+            tmp[..., 0:2] = tmp[..., 0:2].sigmoid()
+            tmp[..., 4:5] += reference[..., 2:3]
+            tmp[..., 4:5] = tmp[..., 4:5].sigmoid()
+
+            if self.with_time:
+                tmp[..., 8:] = tmp[..., 8:] / mean_time_stamp[:, None, None]
+
+            outputs_coord = tmp
+            outputs_classes.append(outputs_class)
+            outputs_coords.append(outputs_coord)
+
+        all_cls_scores = torch.stack(outputs_classes)
+        all_bbox_preds = torch.stack(outputs_coords)
+
+        tmp = all_bbox_preds[..., 0:1] * (self.pc_range[3] - self.pc_range[0])
+        all_bbox_preds[..., 0:1] = (tmp + self.pc_range[0])
+        tmp = all_bbox_preds[..., 1:2] * (self.pc_range[4] - self.pc_range[1])
+        all_bbox_preds[..., 1:2] = (tmp + self.pc_range[1])
+        tmp = all_bbox_preds[..., 4:5] * (self.pc_range[5] - self.pc_range[2])
+        all_bbox_preds[..., 4:5] = (tmp + self.pc_range[2])
+
+        if mask_dict and mask_dict['pad_size'] > 0:
+            output_known_class = all_cls_scores[:, :, :
+                                                mask_dict['pad_size'], :]
+            output_known_coord = all_bbox_preds[:, :, :
+                                                mask_dict['pad_size'], :]
+            outputs_class = all_cls_scores[:, :, mask_dict['pad_size']:, :]
+            outputs_coord = all_bbox_preds[:, :, mask_dict['pad_size']:, :]
+            mask_dict['output_known_lbs_bboxes'] = (output_known_class,
+                                                    output_known_coord)
+            outs = {
+                'all_cls_scores': outputs_class,
+                'all_bbox_preds': outputs_coord,
+                'enc_cls_scores': None,
+                'enc_bbox_preds': None,
+                'dn_mask_dict': mask_dict,
+                'depth_pred': depth,
+            }
+        else:
+            outs = {
+                'all_cls_scores': all_cls_scores,
+                'all_bbox_preds': all_bbox_preds,
+                'enc_cls_scores': None,
+                'enc_bbox_preds': None,
+                'dn_mask_dict': None,
+                'depth_pred': depth,
+            }
+        return outs
+
+    def get_downsampled_gt_depth(self, gt_depths):
+        """
+        Input:
+            gt_depths: [B, N, H, W]
+        Output:
+            gt_depths: [B*N*h*w, d]
+        """
+        downsample = self.depth_downsample
+        depth_config = [self.depth_start, self.position_range[3], self.de_intv]
+        B, N, H, W = gt_depths.shape
+        gt_depths = gt_depths.view(B * N, H // downsample, downsample,
+                                   W // downsample, downsample, 1)
+        gt_depths = gt_depths.permute(0, 1, 3, 5, 2, 4).contiguous()
+        gt_depths = gt_depths.view(-1, downsample * downsample)
+        gt_depths_tmp = torch.where(gt_depths == 0.0,
+                                    1e5 * torch.ones_like(gt_depths),
+                                    gt_depths)
+        gt_depths = torch.min(gt_depths_tmp, dim=-1).values
+        gt_depths = gt_depths.view(B * N, H // downsample, W // downsample)
+
+        if self.depth_LID:  # use 64 LID
+            bin_size = (self.position_range[3] - self.depth_start) / (
+                self.D * (1 + self.D))
+            t = (gt_depths - self.depth_start) / bin_size
+            gt_depths = (torch.sqrt(1 + 4 * t)
+                         - 1) / 2.0 + 1  # (-b+sqrt(b^2-4ac))/2a
+        else:  # use 121 UD
+            tmp = gt_depths - (depth_config[0] - depth_config[2])
+            gt_depths = tmp / depth_config[2]
+        gt_depths = torch.where((gt_depths < self.D + 1) & (gt_depths >= 0.0),
+                                gt_depths, torch.zeros_like(gt_depths))
+        gt_depths = F.one_hot(
+            gt_depths.long(), num_classes=self.D + 1).view(-1, self.D + 1)[:,
+                                                                           1:]
+        return gt_depths.float()
+
+    @force_fp32()
+    def get_depth_loss(self, gt_depth, pred_depth):
+        depth_labels = self.get_downsampled_gt_depth(gt_depth)
+        depth_preds = pred_depth.permute(0, 2, 3,
+                                         1).contiguous().view(-1, self.D)
+        fg_mask = torch.max(depth_labels, dim=1).values > 0.0
+        depth_labels = depth_labels[fg_mask]
+        depth_preds = depth_preds[fg_mask]
+        with autocast(enabled=False):
+            depth_loss = F.binary_cross_entropy(
+                depth_preds,
+                depth_labels,
+                reduction='none',
+            ).sum() / max(1.0, fg_mask.sum())
+        return self.loss_depth_weight * depth_loss
+
+    def prepare_for_loss(self, mask_dict):
+        """
+        prepare dn components to calculate loss
+        Args:
+            mask_dict: a dict that contains dn information
+        """
+        output_known_class, output_known_coord = mask_dict[
+            'output_known_lbs_bboxes']
+        known_labels, known_bboxs = mask_dict['known_lbs_bboxes']
+        map_known_indice = mask_dict['map_known_indice'].long()
+        known_indice = mask_dict['known_indice'].long()
+        batch_idx = mask_dict['batch_idx'].long()
+        bid = batch_idx[known_indice]
+        if len(output_known_class) > 0:
+            output_known_class = output_known_class.permute(
+                1, 2, 0, 3)[(bid, map_known_indice)].permute(1, 0, 2)
+            output_known_coord = output_known_coord.permute(
+                1, 2, 0, 3)[(bid, map_known_indice)].permute(1, 0, 2)
+        num_tgt = known_indice.numel()
+        return known_labels, known_bboxs, output_known_class, output_known_coord, num_tgt
+
+    def _get_target_single(self,
+                           cls_score,
+                           bbox_pred,
+                           gt_labels,
+                           gt_bboxes,
+                           gt_bboxes_ignore=None):
+        """"Compute regression and classification targets for one image.
+        Outputs from a single decoder layer of a single feature level are used.
+        Args:
+            cls_score (Tensor): Box score logits from a single decoder layer
+                for one image. Shape [num_query, cls_out_channels].
+            bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
+                for one image, with normalized coordinate (cx, cy, w, h) and
+                shape [num_query, 4].
+            gt_bboxes (Tensor): Ground truth bboxes for one image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (Tensor): Ground truth class indices for one image
+                with shape (num_gts, ).
+            gt_bboxes_ignore (Tensor, optional): Bounding boxes
+                which can be ignored. Default None.
+        Returns:
+            tuple[Tensor]: a tuple containing the following for one image.
+                - labels (Tensor): Labels of each image.
+                - label_weights (Tensor]): Label weights of each image.
+                - bbox_targets (Tensor): BBox targets of each image.
+                - bbox_weights (Tensor): BBox weights of each image.
+                - pos_inds (Tensor): Sampled positive indices for each image.
+                - neg_inds (Tensor): Sampled negative indices for each image.
+        """
+
+        num_bboxes = bbox_pred.size(0)
+        # assigner and sampler
+        assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes,
+                                             gt_labels, gt_bboxes_ignore)
+        sampling_result = self.sampler.sample(assign_result, bbox_pred,
+                                              gt_bboxes)
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+
+        # label targets
+        labels = gt_bboxes.new_full((num_bboxes, ),
+                                    self.num_classes,
+                                    dtype=torch.long)
+        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
+        label_weights = gt_bboxes.new_ones(num_bboxes)
+
+        # bbox targets
+        code_size = gt_bboxes.size(1)
+        bbox_targets = torch.zeros_like(bbox_pred)[..., :code_size]
+        bbox_weights = torch.zeros_like(bbox_pred)
+        bbox_weights[pos_inds] = 1.0
+
+        # DETR
+        if sampling_result.pos_gt_bboxes.shape[1] == 4:
+            bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes.reshape(
+                sampling_result.pos_gt_bboxes.shape[0], self.code_size - 1)
+        else:
+            bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes
+
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                neg_inds)
+
+    def get_targets(self,
+                    cls_scores_list,
+                    bbox_preds_list,
+                    gt_bboxes_list,
+                    gt_labels_list,
+                    gt_bboxes_ignore_list=None):
+        """"Compute regression and classification targets for a batch image.
+        Outputs from a single decoder layer of a single feature level are used.
+        Args:
+            cls_scores_list (list[Tensor]): Box score logits from a single
+                decoder layer for each image with shape [num_query,
+                cls_out_channels].
+            bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
+                decoder layer for each image, with normalized coordinate
+                (cx, cy, w, h) and shape [num_query, 4].
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (num_gts, ).
+            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
+                boxes which can be ignored for each image. Default None.
+        Returns:
+            tuple: a tuple containing the following targets.
+                - labels_list (list[Tensor]): Labels for all images.
+                - label_weights_list (list[Tensor]): Label weights for all \
+                    images.
+                - bbox_targets_list (list[Tensor]): BBox targets for all \
+                    images.
+                - bbox_weights_list (list[Tensor]): BBox weights for all \
+                    images.
+                - num_total_pos (int): Number of positive samples in all \
+                    images.
+                - num_total_neg (int): Number of negative samples in all \
+                    images.
+        """
+        assert gt_bboxes_ignore_list is None, \
+            'Only supports for gt_bboxes_ignore setting to None.'
+        num_imgs = len(cls_scores_list)
+        gt_bboxes_ignore_list = [
+            gt_bboxes_ignore_list for _ in range(num_imgs)
+        ]
+
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         pos_inds_list,
+         neg_inds_list) = multi_apply(self._get_target_single, cls_scores_list,
+                                      bbox_preds_list, gt_labels_list,
+                                      gt_bboxes_list, gt_bboxes_ignore_list)
+        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
+        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
+        return (labels_list, label_weights_list, bbox_targets_list,
+                bbox_weights_list, num_total_pos, num_total_neg)
+
+    def loss_single(self,
+                    cls_scores,
+                    bbox_preds,
+                    gt_bboxes_list,
+                    gt_labels_list,
+                    gt_bboxes_ignore_list=None):
+        """"Loss function for outputs from a single decoder layer of a single
+        feature level.
+        Args:
+            cls_scores (Tensor): Box score logits from a single decoder layer
+                for all images. Shape [bs, num_query, cls_out_channels].
+            bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
+                for all images, with normalized coordinate (cx, cy, w, h) and
+                shape [bs, num_query, 4].
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (num_gts, ).
+            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
+                boxes which can be ignored for each image. Default None.
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components for outputs from
+                a single decoder layer.
+        """
+        num_imgs = cls_scores.size(0)
+        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
+        bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
+        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
+                                           gt_bboxes_list, gt_labels_list,
+                                           gt_bboxes_ignore_list)
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg) = cls_reg_targets
+        labels = torch.cat(labels_list, 0)
+        label_weights = torch.cat(label_weights_list, 0)
+        bbox_targets = torch.cat(bbox_targets_list, 0)
+        bbox_weights = torch.cat(bbox_weights_list, 0)
+
+        # classification loss
+        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
+        # construct weighted avg_factor to match with the official DETR repo
+        cls_avg_factor = num_total_pos * 1.0 + \
+            num_total_neg * self.bg_cls_weight
+        if self.sync_cls_avg_factor:
+            cls_avg_factor = reduce_mean(
+                cls_scores.new_tensor([cls_avg_factor]))
+
+        cls_avg_factor = max(cls_avg_factor, 1)
+        loss_cls = self.loss_cls(
+            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
+
+        # Compute the average number of gt boxes accross all gpus, for
+        # normalization purposes
+        num_total_pos = loss_cls.new_tensor([num_total_pos])
+        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
+
+        # regression L1 loss
+        bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1))
+        normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range)
+        isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1)
+        bbox_weights = bbox_weights * self.code_weights
+
+        loss_bbox = self.loss_bbox(
+            bbox_preds[isnotnan, :10],
+            normalized_bbox_targets[isnotnan, :10],
+            bbox_weights[isnotnan, :10],
+            avg_factor=num_total_pos)
+
+        loss_cls = torch.nan_to_num(loss_cls)
+        loss_bbox = torch.nan_to_num(loss_bbox)
+        return loss_cls, loss_bbox
+
+    def dn_loss_single(self,
+                       cls_scores,
+                       bbox_preds,
+                       known_bboxs,
+                       known_labels,
+                       num_total_pos=None):
+        """"Loss function for outputs from a single decoder layer of a single
+        feature level.
+        Args:
+            cls_scores (Tensor): Box score logits from a single decoder layer
+                for all images. Shape [bs, num_query, cls_out_channels].
+            bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
+                for all images, with normalized coordinate (cx, cy, w, h) and
+                shape [bs, num_query, 4].
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (num_gts, ).
+            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
+                boxes which can be ignored for each image. Default None.
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components for outputs from
+                a single decoder layer.
+        """
+        # classification loss
+        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
+        # construct weighted avg_factor to match with the official DETR repo
+        cls_avg_factor = num_total_pos * 3.14159 / 6 * self.split * self.split * self.split  # positive rate
+        if self.sync_cls_avg_factor:
+            cls_avg_factor = reduce_mean(
+                cls_scores.new_tensor([cls_avg_factor]))
+        bbox_weights = torch.ones_like(bbox_preds)
+        label_weights = torch.ones_like(known_labels)
+        cls_avg_factor = max(cls_avg_factor, 1)
+        loss_cls = self.loss_cls(
+            cls_scores,
+            known_labels.long(),
+            label_weights,
+            avg_factor=cls_avg_factor)
+
+        # Compute the average number of gt boxes accross all gpus, for
+        # normalization purposes
+        num_total_pos = loss_cls.new_tensor([num_total_pos])
+        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
+
+        # regression L1 loss
+        bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1))
+        normalized_bbox_targets = normalize_bbox(known_bboxs, self.pc_range)
+        isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1)
+        bbox_weights = bbox_weights * self.code_weights
+        bbox_weights[:, 6:
+                     8] = 0  # dn alaways reduce the mAOE, which is useless when training for a long time.
+        loss_bbox = self.loss_bbox(
+            bbox_preds[isnotnan, :10],
+            normalized_bbox_targets[isnotnan, :10],
+            bbox_weights[isnotnan, :10],
+            avg_factor=num_total_pos)
+
+        loss_cls = torch.nan_to_num(loss_cls)
+        loss_bbox = torch.nan_to_num(loss_bbox)
+
+        return self.dn_weight * loss_cls, self.dn_weight * loss_bbox
+
+    @force_fp32(apply_to=('preds_dicts'))
+    def loss(self,
+             gt_bboxes_list,
+             gt_labels_list,
+             preds_dicts,
+             gt_depth,
+             gt_bboxes_ignore=None):
+        """"Loss function.
+        Args:
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (num_gts, ).
+            preds_dicts:
+                all_cls_scores (Tensor): Classification score of all
+                    decoder layers, has shape
+                    [nb_dec, bs, num_query, cls_out_channels].
+                all_bbox_preds (Tensor): Sigmoid regression
+                    outputs of all decode layers. Each is a 4D-tensor with
+                    normalized coordinate format (cx, cy, w, h) and shape
+                    [nb_dec, bs, num_query, 4].
+                enc_cls_scores (Tensor): Classification scores of
+                    points on encode feature map , has shape
+                    (N, h*w, num_classes). Only be passed when as_two_stage is
+                    True, otherwise is None.
+                enc_bbox_preds (Tensor): Regression results of each points
+                    on the encode feature map, has shape (N, h*w, 4). Only be
+                    passed when as_two_stage is True, otherwise is None.
+            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
+                which can be ignored for each image. Default None.
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert gt_bboxes_ignore is None, \
+            f'{self.__class__.__name__} only supports ' \
+            f'for gt_bboxes_ignore setting to None.'
+
+        loss_dict = dict()
+        # depth loss
+        if self.with_depthnet:
+            depth_pred = preds_dicts['depth_pred']
+            if not self.sweep_lidar:
+                depth_pred = depth_pred[:, :6, ...]
+                gt_depth = gt_depth[:, :6, ...]
+            B, N, C, H, W = depth_pred.shape
+            depth_pred = depth_pred.view(B * N, C, H, W)
+            loss_depth = self.get_depth_loss(
+                gt_depth.to(depth_pred.device), depth_pred)
+            loss_dict['loss_depth'] = loss_depth
+
+        all_cls_scores = preds_dicts['all_cls_scores']
+        all_bbox_preds = preds_dicts['all_bbox_preds']
+        enc_cls_scores = preds_dicts['enc_cls_scores']
+        enc_bbox_preds = preds_dicts['enc_bbox_preds']
+
+        num_dec_layers = len(all_cls_scores)
+        device = gt_labels_list[0].device
+        gt_bboxes_list = [
+            torch.cat((gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]),
+                      dim=1).to(device) for gt_bboxes in gt_bboxes_list
+        ]
+
+        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
+        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
+        all_gt_bboxes_ignore_list = [
+            gt_bboxes_ignore for _ in range(num_dec_layers)
+        ]
+
+        losses_cls, losses_bbox = multi_apply(self.loss_single, all_cls_scores,
+                                              all_bbox_preds,
+                                              all_gt_bboxes_list,
+                                              all_gt_labels_list,
+                                              all_gt_bboxes_ignore_list)
+
+        # loss of proposal generated from encode feature map.
+        if enc_cls_scores is not None:
+            binary_labels_list = [
+                torch.zeros_like(gt_labels_list[i])
+                for i in range(len(all_gt_labels_list))
+            ]
+            enc_loss_cls, enc_losses_bbox = \
+                self.loss_single(enc_cls_scores, enc_bbox_preds,
+                                 gt_bboxes_list, binary_labels_list, gt_bboxes_ignore)
+            loss_dict['enc_loss_cls'] = enc_loss_cls
+            loss_dict['enc_loss_bbox'] = enc_losses_bbox
+
+        if preds_dicts['dn_mask_dict'] is not None:
+            known_labels, known_bboxs, output_known_class, output_known_coord, num_tgt = self.prepare_for_loss(
+                preds_dicts['dn_mask_dict'])
+            all_known_bboxs_list = [known_bboxs for _ in range(num_dec_layers)]
+            all_known_labels_list = [
+                known_labels for _ in range(num_dec_layers)
+            ]
+            all_num_tgts_list = [num_tgt for _ in range(num_dec_layers)]
+            dn_losses_cls, dn_losses_bbox = multi_apply(
+                self.dn_loss_single, output_known_class, output_known_coord,
+                all_known_bboxs_list, all_known_labels_list, all_num_tgts_list)
+            loss_dict['dn_loss_cls'] = dn_losses_cls[-1]
+            loss_dict['dn_loss_bbox'] = dn_losses_bbox[-1]
+            num_dec_layer = 0
+            for loss_cls_i, loss_bbox_i in zip(dn_losses_cls[:-1],
+                                               dn_losses_bbox[:-1]):
+                loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i
+                loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i
+                num_dec_layer += 1
+
+        # loss from the last decoder layer
+        loss_dict['loss_cls'] = losses_cls[-1]
+        loss_dict['loss_bbox'] = losses_bbox[-1]
+
+        # loss from other decoder layers
+        num_dec_layer = 0
+        for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], losses_bbox[:-1]):
+            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
+            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
+            num_dec_layer += 1
+        return loss_dict
+
+    @force_fp32(apply_to=('preds_dicts'))
+    def get_bboxes(self, preds_dicts, img_metas, rescale=False):
+        """Generate bboxes from bbox head predictions.
+        Args:
+            preds_dicts (tuple[list[dict]]): Prediction results.
+            img_metas (list[dict]): Point cloud and image's meta info.
+        Returns:
+            list[dict]: Decoded bbox, scores and labels after nms.
+        """
+        preds_dicts = self.bbox_coder.decode(preds_dicts)
+        num_samples = len(preds_dicts)
+
+        ret_list = []
+        for i in range(num_samples):
+            preds = preds_dicts[i]
+            bboxes = preds['bboxes']
+            bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
+            bboxes = img_metas[i]['box_type_3d'](bboxes, bboxes.size(-1))
+            scores = preds['scores']
+            labels = preds['labels']
+            ret_list.append([bboxes, scores, labels])
+        return ret_list
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/detectors/__init__.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/detectors/__init__.py
new file mode 100644
index 00000000..7c38c1eb
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/detectors/__init__.py
@@ -0,0 +1,7 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/models/detectors
+"""
+from .petr3d import Petr3D
+
+__all__ = ['Petr3D']
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/detectors/petr3d.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/detectors/petr3d.py
new file mode 100644
index 00000000..c8e0a4e1
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/detectors/petr3d.py
@@ -0,0 +1,225 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/models/detectors
+"""
+import numpy as np
+import torch
+from mmcv.parallel.data_container import DataContainer as DC
+from mmcv.runner import auto_fp16, force_fp32
+from mmdet3d.core import bbox3d2result
+from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector
+from mmdet.models import DETECTORS
+
+import modelscope.models.cv.object_detection_3d.depe.mmdet3d_plugin.models.utils
+
+
+@DETECTORS.register_module()
+class Petr3D(MVXTwoStageDetector):
+    """Petr3D."""
+
+    def __init__(self,
+                 use_grid_mask=False,
+                 pts_voxel_layer=None,
+                 pts_voxel_encoder=None,
+                 pts_middle_encoder=None,
+                 pts_fusion_layer=None,
+                 img_backbone=None,
+                 pts_backbone=None,
+                 img_neck=None,
+                 pts_neck=None,
+                 pts_bbox_head=None,
+                 img_roi_head=None,
+                 img_rpn_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None):
+        super(Petr3D,
+              self).__init__(pts_voxel_layer, pts_voxel_encoder,
+                             pts_middle_encoder, pts_fusion_layer,
+                             img_backbone, pts_backbone, img_neck, pts_neck,
+                             pts_bbox_head, img_roi_head, img_rpn_head,
+                             train_cfg, test_cfg, pretrained)
+
+    def extract_img_feat(self, img, img_metas):
+        """Extract features of images."""
+        if isinstance(img, list):
+            img = torch.stack(img, dim=0)
+
+        B = img.size(0)
+        if img is not None:
+            input_shape = img.shape[-2:]
+            # update real input shape of each single img
+            for img_meta in img_metas:
+                img_meta.update(input_shape=input_shape)
+            if img.dim() == 5:
+                if img.size(0) == 1 and img.size(1) != 1:
+                    img.squeeze_()
+                else:
+                    B, N, C, H, W = img.size()
+                    img = img.view(B * N, C, H, W)
+            img_feats = self.img_backbone(img)
+            if isinstance(img_feats, dict):
+                img_feats = list(img_feats.values())
+        else:
+            return None
+        if self.with_img_neck:
+            img_feats = self.img_neck(img_feats)
+        img_feats_reshaped = []
+        for img_feat in img_feats:
+            BN, C, H, W = img_feat.size()
+            img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W))
+        return img_feats_reshaped
+
+    @auto_fp16(apply_to=('img'), out_fp32=True)
+    def extract_feat(self, img, img_metas):
+        """Extract features from images and points."""
+        img_feats = self.extract_img_feat(img, img_metas)
+        return img_feats
+
+    def forward_pts_train(self,
+                          pts_feats,
+                          gt_bboxes_3d,
+                          gt_labels_3d,
+                          img_metas,
+                          gt_bboxes_ignore=None):
+        """Forward function for point cloud branch.
+        Args:
+            pts_feats (list[torch.Tensor]): Features of point cloud branch
+            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
+                boxes for each sample.
+            gt_labels_3d (list[torch.Tensor]): Ground truth labels for
+                boxes of each sampole
+            img_metas (list[dict]): Meta information of samples.
+            gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
+                boxes to be ignored. Defaults to None.
+        Returns:
+            dict: Losses of each branch.
+        """
+        outs = self.pts_bbox_head(pts_feats, img_metas)
+        gt_depth = torch.stack(
+            [img_meta['gt_depth'] for img_meta in img_metas])
+        loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs, gt_depth]
+        losses = self.pts_bbox_head.loss(*loss_inputs)
+
+        return losses
+
+    @force_fp32(apply_to=('img', 'points'))
+    def forward(self, return_loss=True, **kwargs):
+        """Calls either forward_train or forward_test depending on whether
+        return_loss=True.
+        Note this setting will change the expected inputs. When
+        `return_loss=True`, img and img_metas are single-nested (i.e.
+        torch.Tensor and list[dict]), and when `resturn_loss=False`, img and
+        img_metas should be double nested (i.e.  list[torch.Tensor],
+        list[list[dict]]), with the outer list indicating test time
+        augmentations.
+        """
+        if return_loss:
+            return self.forward_train(**kwargs)
+        else:
+            return self.forward_test(**kwargs)
+
+    def forward_train(self,
+                      points=None,
+                      img_metas=None,
+                      gt_bboxes_3d=None,
+                      gt_labels_3d=None,
+                      gt_labels=None,
+                      gt_bboxes=None,
+                      img=None,
+                      proposals=None,
+                      gt_bboxes_ignore=None,
+                      img_depth=None,
+                      img_mask=None):
+        """Forward training function.
+        Args:
+            points (list[torch.Tensor], optional): Points of each sample.
+                Defaults to None.
+            img_metas (list[dict], optional): Meta information of each sample.
+                Defaults to None.
+            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
+                Ground truth 3D boxes. Defaults to None.
+            gt_labels_3d (list[torch.Tensor], optional): Ground truth labels
+                of 3D boxes. Defaults to None.
+            gt_labels (list[torch.Tensor], optional): Ground truth labels
+                of 2D boxes in images. Defaults to None.
+            gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in
+                images. Defaults to None.
+            img (torch.Tensor optional): Images of each sample with shape
+                (N, C, H, W). Defaults to None.
+            proposals ([list[torch.Tensor], optional): Predicted proposals
+                used for training Fast RCNN. Defaults to None.
+            gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
+                2D boxes in images to be ignored. Defaults to None.
+        Returns:
+            dict: Losses of different branches.
+        """
+
+        img_feats = self.extract_feat(img=img, img_metas=img_metas)
+
+        losses = dict()
+        losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d,
+                                            gt_labels_3d, img_metas,
+                                            gt_bboxes_ignore)
+        losses.update(losses_pts)
+        return losses
+
+    def forward_test(self, img_metas, img=None, **kwargs):
+        for var, name in [(img_metas, 'img_metas')]:
+            if not isinstance(var, list):
+                raise TypeError('{} must be a list, but got {}'.format(
+                    name, type(var)))
+        img = [img] if img is None else img
+        return self.simple_test(img_metas[0], img[0], **kwargs)
+
+    def simple_test_pts(self, x, img_metas, rescale=False):
+        """Test function of point cloud branch."""
+        outs = self.pts_bbox_head(x, img_metas)
+        bbox_list = self.pts_bbox_head.get_bboxes(
+            outs, img_metas, rescale=rescale)
+        bbox_results = [
+            bbox3d2result(bboxes, scores, labels)
+            for bboxes, scores, labels in bbox_list
+        ]
+        return bbox_results
+
+    def simple_test(self, img_metas, img=None, rescale=False):
+        """Test function without augmentaiton."""
+        if not torch.cuda.is_available() and img is not None:
+            if isinstance(img, torch.Tensor):
+                img = img[0]
+            elif isinstance(img, DC) and isinstance(img_metas, DC):
+                img = img.data[0]
+                img_metas = img_metas.data[0]
+        img_feats = self.extract_feat(img=img, img_metas=img_metas)
+        bbox_list = [dict() for i in range(len(img_metas))]
+        bbox_pts = self.simple_test_pts(img_feats, img_metas, rescale=rescale)
+        for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
+            result_dict['pts_bbox'] = pts_bbox
+        return bbox_list
+
+    def aug_test_pts(self, feats, img_metas, rescale=False):
+        feats_list = []
+        for j in range(len(feats[0])):
+            feats_list_level = []
+            for i in range(len(feats)):
+                feats_list_level.append(feats[i][j])
+            feats_list.append(torch.stack(feats_list_level, -1).mean(-1))
+        outs = self.pts_bbox_head(feats_list, img_metas)
+        bbox_list = self.pts_bbox_head.get_bboxes(
+            outs, img_metas, rescale=rescale)
+        bbox_results = [
+            bbox3d2result(bboxes, scores, labels)
+            for bboxes, scores, labels in bbox_list
+        ]
+        return bbox_results
+
+    def aug_test(self, img_metas, imgs=None, rescale=False):
+        """Test function with augmentaiton."""
+        img_feats = self.extract_feats(img_metas, imgs)
+        img_metas = img_metas[0]
+        bbox_list = [dict() for i in range(len(img_metas))]
+        bbox_pts = self.aug_test_pts(img_feats, img_metas, rescale)
+        for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
+            result_dict['pts_bbox'] = pts_bbox
+        return bbox_list
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/necks/__init__.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/necks/__init__.py
new file mode 100644
index 00000000..34f7571e
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/necks/__init__.py
@@ -0,0 +1,9 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/models/necks
+"""
+from .cp_fpn import CPFPN
+
+__all__ = [
+    'CPFPN',
+]
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/necks/cp_fpn.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/necks/cp_fpn.py
new file mode 100644
index 00000000..0d866149
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/necks/cp_fpn.py
@@ -0,0 +1,207 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/models/necks
+"""
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule, auto_fp16
+from mmdet.models import NECKS
+
+
+# This FPN remove the unused parameters which can used with checkpoint (with_cp = True in Backbone)
+@NECKS.register_module()
+class CPFPN(BaseModule):
+    r"""Feature Pyramid Network.
+
+    This is an implementation of paper `Feature Pyramid Networks for Object
+    Detection <https://arxiv.org/abs/1612.03144>`_.
+
+    Args:
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale)
+        num_outs (int): Number of output scales.
+        start_level (int): Index of the start input backbone level used to
+            build the feature pyramid. Default: 0.
+        end_level (int): Index of the end input backbone level (exclusive) to
+            build the feature pyramid. Default: -1, which means the last level.
+        add_extra_convs (bool | str): If bool, it decides whether to add conv
+            layers on top of the original feature maps. Default to False.
+            If True, it is equivalent to `add_extra_convs='on_input'`.
+            If str, it specifies the source feature map of the extra convs.
+            Only the following options are allowed
+
+            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
+            - 'on_lateral':  Last feature map after lateral convs.
+            - 'on_output': The last output feature map after fpn convs.
+        relu_before_extra_convs (bool): Whether to apply relu before the extra
+            conv. Default: False.
+        no_norm_on_lateral (bool): Whether to apply norm on lateral.
+            Default: False.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        act_cfg (str): Config dict for activation layer in ConvModule.
+            Default: None.
+        upsample_cfg (dict): Config dict for interpolate layer.
+            Default: `dict(mode='nearest')`
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+
+    Example:
+        >>> import torch
+        >>> in_channels = [2, 3, 5, 7]
+        >>> scales = [340, 170, 84, 43]
+        >>> inputs = [torch.rand(1, c, s, s)
+        ...           for c, s in zip(in_channels, scales)]
+        >>> self = FPN(in_channels, 11, len(in_channels)).eval()
+        >>> outputs = self.forward(inputs)
+        >>> for i in range(len(outputs)):
+        ...     print(f'outputs[{i}].shape = {outputs[i].shape}')
+        outputs[0].shape = torch.Size([1, 11, 340, 340])
+        outputs[1].shape = torch.Size([1, 11, 170, 170])
+        outputs[2].shape = torch.Size([1, 11, 84, 84])
+        outputs[3].shape = torch.Size([1, 11, 43, 43])
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs,
+                 start_level=0,
+                 end_level=-1,
+                 add_extra_convs=False,
+                 relu_before_extra_convs=False,
+                 no_norm_on_lateral=False,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 act_cfg=None,
+                 upsample_cfg=dict(mode='nearest'),
+                 init_cfg=dict(
+                     type='Xavier', layer='Conv2d', distribution='uniform')):
+        super(CPFPN, self).__init__(init_cfg)
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.num_outs = num_outs
+        self.relu_before_extra_convs = relu_before_extra_convs
+        self.no_norm_on_lateral = no_norm_on_lateral
+        self.fp16_enabled = False
+        self.upsample_cfg = upsample_cfg.copy()
+
+        if end_level == -1:
+            self.backbone_end_level = self.num_ins
+            assert num_outs >= self.num_ins - start_level
+        else:
+            # if end_level < inputs, no extra level is allowed
+            self.backbone_end_level = end_level
+            assert end_level <= len(in_channels)
+            assert num_outs == end_level - start_level
+        self.start_level = start_level
+        self.end_level = end_level
+        self.add_extra_convs = add_extra_convs
+        assert isinstance(add_extra_convs, (str, bool))
+        if isinstance(add_extra_convs, str):
+            # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
+            assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
+        elif add_extra_convs:  # True
+            self.add_extra_convs = 'on_input'
+
+        self.lateral_convs = nn.ModuleList()
+        self.fpn_convs = nn.ModuleList()
+
+        for i in range(self.start_level, self.backbone_end_level):
+            l_conv = ConvModule(
+                in_channels[i],
+                out_channels,
+                1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
+                act_cfg=act_cfg,
+                inplace=False)
+            self.lateral_convs.append(l_conv)
+            if i == 0:
+                fpn_conv = ConvModule(
+                    out_channels,
+                    out_channels,
+                    3,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg,
+                    inplace=False)
+                self.fpn_convs.append(fpn_conv)
+
+        # add extra conv layers (e.g., RetinaNet)
+        extra_levels = num_outs - self.backbone_end_level + self.start_level
+        if self.add_extra_convs and extra_levels >= 1:
+            for i in range(extra_levels):
+                if i == 0 and self.add_extra_convs == 'on_input':
+                    in_channels = self.in_channels[self.backbone_end_level - 1]
+                else:
+                    in_channels = out_channels
+                extra_fpn_conv = ConvModule(
+                    in_channels,
+                    out_channels,
+                    3,
+                    stride=2,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg,
+                    inplace=False)
+                self.fpn_convs.append(extra_fpn_conv)
+
+    @auto_fp16()
+    def forward(self, inputs):
+        """Forward function."""
+        assert len(inputs) == len(self.in_channels)
+
+        # build laterals
+        laterals = [
+            lateral_conv(inputs[i + self.start_level])
+            for i, lateral_conv in enumerate(self.lateral_convs)
+        ]
+
+        # build top-down path
+        used_backbone_levels = len(laterals)
+        for i in range(used_backbone_levels - 1, 0, -1):
+            # In some cases, fixing `scale factor` (e.g. 2) is preferred, but
+            #  it cannot co-exist with `size` in `F.interpolate`.
+            if 'scale_factor' in self.upsample_cfg:
+                laterals[i - 1] += F.interpolate(laterals[i],
+                                                 **self.upsample_cfg)
+            else:
+                prev_shape = laterals[i - 1].shape[2:]
+                laterals[i - 1] += F.interpolate(
+                    laterals[i], size=prev_shape, **self.upsample_cfg)
+
+        # build outputs
+        # part 1: from original levels
+        outs = [
+            self.fpn_convs[i](laterals[i]) if i == 0 else laterals[i]
+            for i in range(used_backbone_levels)
+        ]
+        # part 2: add extra levels
+        if self.num_outs > len(outs):
+            # use max pool to get more levels on top of outputs
+            # (e.g., Faster R-CNN, Mask R-CNN)
+            if not self.add_extra_convs:
+                for i in range(self.num_outs - used_backbone_levels):
+                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
+            # add conv layers on top of original feature maps (RetinaNet)
+            else:
+                if self.add_extra_convs == 'on_input':
+                    extra_source = inputs[self.backbone_end_level - 1]
+                elif self.add_extra_convs == 'on_lateral':
+                    extra_source = laterals[-1]
+                elif self.add_extra_convs == 'on_output':
+                    extra_source = outs[-1]
+                else:
+                    raise NotImplementedError
+                outs.append(self.fpn_convs[used_backbone_levels](extra_source))
+                for i in range(used_backbone_levels + 1, self.num_outs):
+                    if self.relu_before_extra_convs:
+                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
+                    else:
+                        outs.append(self.fpn_convs[i](outs[-1]))
+        return tuple(outs)
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/utils/__init__.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/utils/__init__.py
new file mode 100644
index 00000000..182c218e
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/utils/__init__.py
@@ -0,0 +1,12 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/models/utils
+"""
+from .petr_transformer import (PETRDNTransformer, PETRMultiheadAttention,
+                               PETRTransformerDecoder, PETRTransformerEncoder)
+from .positional_encoding import SinePositionalEncoding3D
+
+__all__ = [
+    'SinePositionalEncoding3D', 'PETRDNTransformer', 'PETRMultiheadAttention',
+    'PETRTransformerEncoder', 'PETRTransformerDecoder'
+]
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/utils/petr_transformer.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/utils/petr_transformer.py
new file mode 100644
index 00000000..d16d0d68
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/utils/petr_transformer.py
@@ -0,0 +1,453 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/models/utils
+"""
+import copy
+import math
+import warnings
+from typing import Sequence
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from mmcv.cnn import (build_activation_layer, build_conv_layer,
+                      build_norm_layer, xavier_init)
+from mmcv.cnn.bricks.drop import build_dropout
+from mmcv.cnn.bricks.registry import (ATTENTION, TRANSFORMER_LAYER,
+                                      TRANSFORMER_LAYER_SEQUENCE)
+from mmcv.cnn.bricks.transformer import (BaseTransformerLayer,
+                                         TransformerLayerSequence,
+                                         build_transformer_layer_sequence)
+from mmcv.runner.base_module import BaseModule
+from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning,
+                        to_2tuple)
+from mmdet.models.utils.builder import TRANSFORMER
+
+
+@TRANSFORMER.register_module()
+class PETRDNTransformer(BaseModule):
+    """Implements the DETR transformer.
+    Following the official DETR implementation, this module copy-paste
+    from torch.nn.Transformer with modifications:
+        * positional encodings are passed in MultiheadAttention
+        * extra LN at the end of encoder is removed
+        * decoder returns a stack of activations from all decoding layers
+    See `paper: End-to-End Object Detection with Transformers
+    <https://arxiv.org/pdf/2005.12872>`_ for details.
+    Args:
+        encoder (`mmcv.ConfigDict` | Dict): Config of
+            TransformerEncoder. Defaults to None.
+        decoder ((`mmcv.ConfigDict` | Dict)): Config of
+            TransformerDecoder. Defaults to None
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Defaults to None.
+    """
+
+    def __init__(self, encoder=None, decoder=None, init_cfg=None, cross=False):
+        super(PETRDNTransformer, self).__init__(init_cfg=init_cfg)
+        if encoder is not None:
+            self.encoder = build_transformer_layer_sequence(encoder)
+        else:
+            self.encoder = None
+        self.decoder = build_transformer_layer_sequence(decoder)
+        self.embed_dims = self.decoder.embed_dims
+        self.cross = cross
+
+    def init_weights(self):
+        # follow the official DETR to init parameters
+        for m in self.modules():
+            if hasattr(m, 'weight') and m.weight.dim() > 1:
+                xavier_init(m, distribution='uniform')
+        self._is_init = True
+
+    def forward(self,
+                x,
+                mask,
+                query_embed,
+                pos_embed,
+                attn_masks=None,
+                reg_branch=None):
+        """Forward function for `Transformer`.
+        Args:
+            x (Tensor): Input query with shape [bs, c, h, w] where
+                c = embed_dims.
+            mask (Tensor): The key_padding_mask used for encoder and decoder,
+                with shape [bs, h, w].
+            query_embed (Tensor): The query embedding for decoder, with shape
+                [num_query, c].
+            pos_embed (Tensor): The positional encoding for encoder and
+                decoder, with the same shape as `x`.
+        Returns:
+            tuple[Tensor]: results of decoder containing the following tensor.
+                - out_dec: Output from decoder. If return_intermediate_dec \
+                      is True output has shape [num_dec_layers, bs,
+                      num_query, embed_dims], else has shape [1, bs, \
+                      num_query, embed_dims].
+                - memory: Output results from encoder, with shape \
+                      [bs, embed_dims, h, w].
+        """
+        bs, n, c, h, w = x.shape
+        memory = x.permute(1, 3, 4, 0,
+                           2).reshape(-1, bs,
+                                      c)  # [bs, n, c, h, w] -> [n*h*w, bs, c]
+        pos_embed = pos_embed.permute(1, 3, 4, 0, 2).reshape(
+            -1, bs, c)  # [bs, n, c, h, w] -> [n*h*w, bs, c]
+        query_embed = query_embed.transpose(
+            0, 1)  # [num_query, dim] -> [num_query, bs, dim]
+        mask = mask.view(bs, -1)  # [bs, n, h, w] -> [bs, n*h*w]
+        target = torch.zeros_like(query_embed)
+        # out_dec: [num_layers, num_query, bs, dim]
+        out_dec = self.decoder(
+            query=target,
+            key=memory,
+            value=memory,
+            key_pos=pos_embed,
+            query_pos=query_embed,
+            key_padding_mask=mask,
+            attn_masks=[attn_masks, None],
+            reg_branch=reg_branch,
+        )
+        out_dec = out_dec.transpose(1, 2)
+        memory = memory.reshape(n, h, w, bs, c).permute(3, 0, 4, 1, 2)
+        return out_dec, memory
+
+
+@TRANSFORMER_LAYER.register_module()
+class PETRTransformerDecoderLayer(BaseTransformerLayer):
+    """Implements decoder layer in DETR transformer.
+    Args:
+        attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )):
+            Configs for self_attention or cross_attention, the order
+            should be consistent with it in `operation_order`. If it is
+            a dict, it would be expand to the number of attention in
+            `operation_order`.
+        feedforward_channels (int): The hidden dimension for FFNs.
+        ffn_dropout (float): Probability of an element to be zeroed
+            in ffn. Default 0.0.
+        operation_order (tuple[str]): The execution order of operation
+            in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
+            Default：None
+        act_cfg (dict): The activation config for FFNs. Default: `LN`
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: `LN`.
+        ffn_num_fcs (int): The number of fully-connected layers in FFNs.
+            Default：2.
+    """
+
+    def __init__(self,
+                 attn_cfgs,
+                 feedforward_channels,
+                 embed_dims=256,
+                 ffn_dropout=0.0,
+                 operation_order=None,
+                 act_cfg=dict(type='ReLU', inplace=True),
+                 norm_cfg=dict(type='LN'),
+                 ffn_num_fcs=2,
+                 with_cp=True,
+                 **kwargs):
+        ffn_cfgs = dict(
+            embed_dims=embed_dims,
+            feedforward_channels=feedforward_channels,
+            ffn_dropout=ffn_dropout,
+            ffn_num_fcs=ffn_num_fcs)
+        kwargs['ffn_cfgs'] = ffn_cfgs
+        super(PETRTransformerDecoderLayer, self).__init__(
+            attn_cfgs=attn_cfgs,
+            operation_order=operation_order,
+            act_cfg=act_cfg,
+            norm_cfg=norm_cfg,
+            **kwargs)
+        assert len(operation_order) == 6
+        assert set(operation_order) == set(
+            ['self_attn', 'norm', 'cross_attn', 'ffn'])
+        self.use_checkpoint = with_cp
+
+    def _forward(
+        self,
+        query,
+        key=None,
+        value=None,
+        query_pos=None,
+        key_pos=None,
+        attn_masks=None,
+        query_key_padding_mask=None,
+        key_padding_mask=None,
+    ):
+        """Forward function for `TransformerCoder`.
+        Returns:
+            Tensor: forwarded results with shape [num_query, bs, embed_dims].
+        """
+        x = super(PETRTransformerDecoderLayer, self). \
+            forward(query,
+                    key=key,
+                    value=value,
+                    query_pos=query_pos,
+                    key_pos=key_pos,
+                    attn_masks=attn_masks,
+                    query_key_padding_mask=query_key_padding_mask,
+                    key_padding_mask=key_padding_mask,
+                    )
+        return x
+
+    def forward(self,
+                query,
+                key=None,
+                value=None,
+                query_pos=None,
+                key_pos=None,
+                attn_masks=None,
+                query_key_padding_mask=None,
+                key_padding_mask=None,
+                **kwargs):
+        """Forward function for `TransformerCoder`.
+        Returns:
+            Tensor: forwarded results with shape [num_query, bs, embed_dims].
+        """
+
+        if self.use_checkpoint and self.training:
+            x = cp.checkpoint(
+                self._forward,
+                query,
+                key,
+                value,
+                query_pos,
+                key_pos,
+                attn_masks,
+                query_key_padding_mask,
+                key_padding_mask,
+            )
+        else:
+            x = self._forward(
+                query,
+                key=key,
+                value=value,
+                query_pos=query_pos,
+                key_pos=key_pos,
+                attn_masks=attn_masks,
+                query_key_padding_mask=query_key_padding_mask,
+                key_padding_mask=key_padding_mask)
+        return x
+
+
+@ATTENTION.register_module()
+class PETRMultiheadAttention(BaseModule):
+    """A wrapper for ``torch.nn.MultiheadAttention``.
+    This module implements MultiheadAttention with identity connection,
+    and positional encoding  is also passed as input.
+    Args:
+        embed_dims (int): The embedding dimension.
+        num_heads (int): Parallel attention heads.
+        attn_drop (float): A Dropout layer on attn_output_weights.
+            Default: 0.0.
+        proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
+            Default: 0.0.
+        dropout_layer (obj:`ConfigDict`): The dropout_layer used
+            when adding the shortcut.
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+        batch_first (bool): When it is True,  Key, Query and Value are shape of
+            (batch, n, embed_dim), otherwise (n, batch, embed_dim).
+             Default to False.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 dropout_layer=dict(type='Dropout', drop_prob=0.),
+                 init_cfg=None,
+                 batch_first=False,
+                 **kwargs):
+        super(PETRMultiheadAttention, self).__init__(init_cfg)
+        if 'dropout' in kwargs:
+            warnings.warn(
+                'The arguments `dropout` in MultiheadAttention '
+                'has been deprecated, now you can separately '
+                'set `attn_drop`(float), proj_drop(float), '
+                'and `dropout_layer`(dict) ', DeprecationWarning)
+            attn_drop = kwargs['dropout']
+            dropout_layer['drop_prob'] = kwargs.pop('dropout')
+
+        self.embed_dims = embed_dims
+        self.num_heads = num_heads
+        self.batch_first = batch_first
+
+        self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop,
+                                          **kwargs)
+
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.dropout_layer = build_dropout(
+            dropout_layer) if dropout_layer else nn.Identity()
+
+    @deprecated_api_warning({'residual': 'identity'},
+                            cls_name='MultiheadAttention')
+    def forward(self,
+                query,
+                key=None,
+                value=None,
+                identity=None,
+                query_pos=None,
+                key_pos=None,
+                attn_mask=None,
+                key_padding_mask=None,
+                **kwargs):
+        """Forward function for `MultiheadAttention`.
+        **kwargs allow passing a more general data flow when combining
+        with other operations in `transformerlayer`.
+        Args:
+            query (Tensor): The input query with shape [num_queries, bs,
+                embed_dims] if self.batch_first is False, else
+                [bs, num_queries embed_dims].
+            key (Tensor): The key tensor with shape [num_keys, bs,
+                embed_dims] if self.batch_first is False, else
+                [bs, num_keys, embed_dims] .
+                If None, the ``query`` will be used. Defaults to None.
+            value (Tensor): The value tensor with same shape as `key`.
+                Same in `nn.MultiheadAttention.forward`. Defaults to None.
+                If None, the `key` will be used.
+            identity (Tensor): This tensor, with the same shape as x,
+                will be used for the identity link.
+                If None, `x` will be used. Defaults to None.
+            query_pos (Tensor): The positional encoding for query, with
+                the same shape as `x`. If not None, it will
+                be added to `x` before forward function. Defaults to None.
+            key_pos (Tensor): The positional encoding for `key`, with the
+                same shape as `key`. Defaults to None. If not None, it will
+                be added to `key` before forward function. If None, and
+                `query_pos` has the same shape as `key`, then `query_pos`
+                will be used for `key_pos`. Defaults to None.
+            attn_mask (Tensor): ByteTensor mask with shape [num_queries,
+                num_keys]. Same in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+            key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
+                Defaults to None.
+        Returns:
+            Tensor: forwarded results with shape
+            [num_queries, bs, embed_dims]
+            if self.batch_first is False, else
+            [bs, num_queries embed_dims].
+        """
+
+        if key is None:
+            key = query
+        if value is None:
+            value = key
+        if identity is None:
+            identity = query
+        if key_pos is None:
+            if query_pos is not None:
+                # use query_pos if key_pos is not available
+                if query_pos.shape == key.shape:
+                    key_pos = query_pos
+                else:
+                    warnings.warn(f'position encoding of key is'
+                                  f'missing in {self.__class__.__name__}.')
+        if query_pos is not None:
+            query = query + query_pos
+        if key_pos is not None:
+            key = key + key_pos
+
+        # Because the dataflow('key', 'query', 'value') of
+        # ``torch.nn.MultiheadAttention`` is (num_query, batch,
+        # embed_dims), We should adjust the shape of dataflow from
+        # batch_first (batch, num_query, embed_dims) to num_query_first
+        # (num_query ,batch, embed_dims), and recover ``attn_output``
+        # from num_query_first to batch_first.
+        if self.batch_first:
+            query = query.transpose(0, 1)
+            key = key.transpose(0, 1)
+            value = value.transpose(0, 1)
+
+        out = self.attn(
+            query=query,
+            key=key,
+            value=value,
+            attn_mask=attn_mask,
+            key_padding_mask=key_padding_mask)[0]
+
+        if self.batch_first:
+            out = out.transpose(0, 1)
+
+        return identity + self.dropout_layer(self.proj_drop(out))
+
+
+@TRANSFORMER_LAYER_SEQUENCE.register_module()
+class PETRTransformerEncoder(TransformerLayerSequence):
+    """TransformerEncoder of DETR.
+    Args:
+        post_norm_cfg (dict): Config of last normalization layer. Default：
+            `LN`. Only used when `self.pre_norm` is `True`
+    """
+
+    def __init__(self, *args, post_norm_cfg=dict(type='LN'), **kwargs):
+        super(PETRTransformerEncoder, self).__init__(*args, **kwargs)
+        if post_norm_cfg is not None:
+            self.post_norm = build_norm_layer(
+                post_norm_cfg, self.embed_dims)[1] if self.pre_norm else None
+        else:
+            assert not self.pre_norm, f'Use prenorm in ' \
+                                      f'{self.__class__.__name__},' \
+                                      f'Please specify post_norm_cfg'
+            self.post_norm = None
+
+    def forward(self, *args, **kwargs):
+        """Forward function for `TransformerCoder`.
+        Returns:
+            Tensor: forwarded results with shape [num_query, bs, embed_dims].
+        """
+        x = super(PETRTransformerEncoder, self).forward(*args, **kwargs)
+        if self.post_norm is not None:
+            x = self.post_norm(x)
+        return x
+
+
+@TRANSFORMER_LAYER_SEQUENCE.register_module()
+class PETRTransformerDecoder(TransformerLayerSequence):
+    """Implements the decoder in DETR transformer.
+    Args:
+        return_intermediate (bool): Whether to return intermediate outputs.
+        post_norm_cfg (dict): Config of last normalization layer. Default：
+            `LN`.
+    """
+
+    def __init__(self,
+                 *args,
+                 post_norm_cfg=dict(type='LN'),
+                 return_intermediate=False,
+                 **kwargs):
+
+        super(PETRTransformerDecoder, self).__init__(*args, **kwargs)
+        self.return_intermediate = return_intermediate
+        if post_norm_cfg is not None:
+            self.post_norm = build_norm_layer(post_norm_cfg,
+                                              self.embed_dims)[1]
+        else:
+            self.post_norm = None
+
+    def forward(self, query, *args, **kwargs):
+        """Forward function for `TransformerDecoder`.
+        Args:
+            query (Tensor): Input query with shape
+                `(num_query, bs, embed_dims)`.
+        Returns:
+            Tensor: Results with shape [1, num_query, bs, embed_dims] when
+                return_intermediate is `False`, otherwise it has shape
+                [num_layers, num_query, bs, embed_dims].
+        """
+        if not self.return_intermediate:
+            x = super().forward(query, *args, **kwargs)
+            if self.post_norm:
+                x = self.post_norm(x)[None]
+            return x
+
+        intermediate = []
+        for layer in self.layers:
+            query = layer(query, *args, **kwargs)
+            if self.return_intermediate:
+                if self.post_norm is not None:
+                    intermediate.append(self.post_norm(query))
+                else:
+                    intermediate.append(query)
+        return torch.stack(intermediate)
diff --git a/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/utils/positional_encoding.py b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/utils/positional_encoding.py
new file mode 100644
index 00000000..0d117b65
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/mmdet3d_plugin/models/utils/positional_encoding.py
@@ -0,0 +1,120 @@
+"""
+The implementation here is modified based on PETR, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/models/utils
+"""
+import math
+
+import torch
+import torch.nn as nn
+from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING
+from mmcv.runner import BaseModule
+
+
+@POSITIONAL_ENCODING.register_module()
+class SinePositionalEncoding3D(BaseModule):
+    """Position encoding with sine and cosine functions.
+    See `End-to-End Object Detection with Transformers
+    <https://arxiv.org/pdf/2005.12872>`_ for details.
+    Args:
+        num_feats (int): The feature dimension for each position
+            along x-axis or y-axis. Note the final returned dimension
+            for each position is 2 times of this value.
+        temperature (int, optional): The temperature used for scaling
+            the position embedding. Defaults to 10000.
+        normalize (bool, optional): Whether to normalize the position
+            embedding. Defaults to False.
+        scale (float, optional): A scale factor that scales the position
+            embedding. The scale will be used only when `normalize` is True.
+            Defaults to 2*pi.
+        eps (float, optional): A value added to the denominator for
+            numerical stability. Defaults to 1e-6.
+        offset (float): offset add to embed when do the normalization.
+            Defaults to 0.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 num_feats,
+                 temperature=10000,
+                 normalize=False,
+                 scale=2 * math.pi,
+                 eps=1e-6,
+                 offset=0.,
+                 init_cfg=None,
+                 skip_n=False):
+        super(SinePositionalEncoding3D, self).__init__(init_cfg)
+        if normalize:
+            assert isinstance(scale, (float, int)), 'when normalize is set,' \
+                'scale should be provided and in float or int type, ' \
+                f'found {type(scale)}'
+        self.num_feats = num_feats
+        self.temperature = temperature
+        self.normalize = normalize
+        self.scale = scale
+        self.eps = eps
+        self.offset = offset
+        self.skip_n = skip_n
+
+    def forward(self, mask):
+        """Forward function for `SinePositionalEncoding`.
+        Args:
+            mask (Tensor): ByteTensor mask. Non-zero values representing
+                ignored positions, while zero values means valid positions
+                for this image. Shape [bs, h, w].
+        Returns:
+            pos (Tensor): Returned position embedding with shape
+                [bs, num_feats*2, h, w].
+        """
+        # For convenience of exporting to ONNX, it's required to convert
+        # `masks` from bool to int.
+        mask = mask.to(torch.int)
+        not_mask = 1 - mask  # logical_not
+        if not self.skip_n:
+            n_embed = not_mask.cumsum(1, dtype=torch.float32)
+        y_embed = not_mask.cumsum(2, dtype=torch.float32)
+        x_embed = not_mask.cumsum(3, dtype=torch.float32)
+        if self.normalize:
+            if not self.skip_n:
+                n_embed = (n_embed + self.offset) / \
+                          (n_embed[:, -1:, :, :] + self.eps) * self.scale
+            y_embed = (y_embed + self.offset) / \
+                      (y_embed[:, :, -1:, :] + self.eps) * self.scale
+            x_embed = (x_embed + self.offset) / \
+                      (x_embed[:, :, :, -1:] + self.eps) * self.scale
+        dim_t = torch.arange(
+            self.num_feats, dtype=torch.float32, device=mask.device)
+        dim_t = self.temperature**(
+            2 * torch.div(dim_t, 2, rounding_mode='floor') / self.num_feats)
+        if not self.skip_n:
+            pos_n = n_embed[:, :, :, :, None] / dim_t
+        pos_x = x_embed[:, :, :, :, None] / dim_t
+        pos_y = y_embed[:, :, :, :, None] / dim_t
+        # use `view` instead of `flatten` for dynamically exporting to ONNX
+        B, N, H, W = mask.size()
+        if not self.skip_n:
+            pos_n = torch.stack(
+                (pos_n[:, :, :, :, 0::2].sin(), pos_n[:, :, :, :, 1::2].cos()),
+                dim=4).view(B, N, H, W, -1)
+        pos_x = torch.stack(
+            (pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()),
+            dim=4).view(B, N, H, W, -1)
+        pos_y = torch.stack(
+            (pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()),
+            dim=4).view(B, N, H, W, -1)
+        if self.skip_n:
+            pos = torch.cat((pos_y, pos_x), dim=4).permute(0, 1, 4, 2, 3)
+        else:
+            pos = torch.cat((pos_n, pos_y, pos_x),
+                            dim=4).permute(0, 1, 4, 2, 3)
+        return pos
+
+    def __repr__(self):
+        """str: a string that describes the module"""
+        repr_str = self.__class__.__name__
+        repr_str += f'(num_feats={self.num_feats}, '
+        repr_str += f'temperature={self.temperature}, '
+        repr_str += f'normalize={self.normalize}, '
+        repr_str += f'scale={self.scale}, '
+        repr_str += f'eps={self.eps})'
+        return repr_str
diff --git a/modelscope/models/cv/object_detection_3d/depe/result_vis.py b/modelscope/models/cv/object_detection_3d/depe/result_vis.py
new file mode 100644
index 00000000..4dfd1ed2
--- /dev/null
+++ b/modelscope/models/cv/object_detection_3d/depe/result_vis.py
@@ -0,0 +1,261 @@
+"""
+The implementation here is modified based on BEVDet, originally Apache-2.0 license and publicly avaialbe at
+https://github.com/HuangJunJie2017/BEVDet/blob/dev2.0/tools/analysis_tools/vis.py
+"""
+import argparse
+import os
+import pickle
+
+import cv2
+import json
+import numpy as np
+from mmdet3d.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes as LB
+from pyquaternion.quaternion import Quaternion
+
+
+def check_point_in_img(points, height, width):
+    valid = np.logical_and(points[:, 0] >= 0, points[:, 1] >= 0)
+    valid = np.logical_and(
+        valid, np.logical_and(points[:, 0] < width, points[:, 1] < height))
+    return valid
+
+
+def depth2color(depth):
+    gray = max(0, min((depth + 2.5) / 3.0, 1.0))
+    max_lumi = 200
+    colors = np.array(
+        [[max_lumi, 0, max_lumi], [max_lumi, 0, 0], [max_lumi, max_lumi, 0],
+         [0, max_lumi, 0], [0, max_lumi, max_lumi], [0, 0, max_lumi]],
+        dtype=np.float32)
+    if gray == 1:
+        return tuple(colors[-1].tolist())
+    num_rank = len(colors) - 1
+    rank = np.floor(gray * num_rank).astype(np.int)
+    diff = (gray - rank / num_rank) * num_rank
+    tmp = colors[rank + 1] - colors[rank]
+    return tuple((colors[rank] + tmp * diff).tolist())
+
+
+def lidar2img(points_lidar, camrera_info):
+    points_lidar_homogeneous = \
+        np.concatenate([points_lidar,
+                        np.ones((points_lidar.shape[0], 1),
+                                dtype=points_lidar.dtype)], axis=1)
+    camera2lidar = np.eye(4, dtype=np.float32)
+    camera2lidar[:3, :3] = camrera_info['sensor2lidar_rotation']
+    camera2lidar[:3, 3] = camrera_info['sensor2lidar_translation']
+    lidar2camera = np.linalg.inv(camera2lidar)
+    points_camera_homogeneous = points_lidar_homogeneous @ lidar2camera.T
+    points_camera = points_camera_homogeneous[:, :3]
+    valid = np.ones((points_camera.shape[0]), dtype=bool)
+    valid = np.logical_and(points_camera[:, -1] > 0.5, valid)
+    points_camera = points_camera / points_camera[:, 2:3]
+    camera2img = camrera_info['cam_intrinsic']
+    points_img = points_camera @ camera2img.T
+    points_img = points_img[:, :2]
+    return points_img, valid
+
+
+def get_lidar2global(infos):
+    lidar2ego = np.eye(4, dtype=np.float32)
+    lidar2ego[:3, :3] = Quaternion(infos['lidar2ego_rotation']).rotation_matrix
+    lidar2ego[:3, 3] = infos['lidar2ego_translation']
+    ego2global = np.eye(4, dtype=np.float32)
+    ego2global[:3, :3] = Quaternion(
+        infos['ego2global_rotation']).rotation_matrix
+    ego2global[:3, 3] = infos['ego2global_translation']
+    return ego2global @ lidar2ego
+
+
+def plot_result(res_path,
+                vis_thred=0.3,
+                version='val',
+                draw_gt=True,
+                save_format='image'):
+    img_list = []
+    # fixed parameters
+    root_path = '/data/Dataset/nuScenes'
+    show_range = 50  # Range of visualization in BEV
+    canva_size = 1000  # Size of canva in pixel
+    vis_frames = 500  # Max number of frames for visualization
+    scale_factor = 2  # Trade-off between image-view and bev in size of the visualized canvas
+    fps = 5  # Frame rate of video
+    vis_dir = './video_result'  # Video output path
+    color_map = {0: (255, 255, 0), 1: (0, 255, 255)}
+
+    # load predicted results
+    res = json.load(open(res_path, 'r'))
+    # load dataset information
+    info_path = os.path.join(root_path,
+                             f'mmdet3d_nuscenes_30f_infos_{version}.pkl')
+    with open(info_path, 'rb') as f:
+        dataset = pickle.load(f)
+    # prepare save path and medium
+    if save_format == 'video' and not os.path.exists(vis_dir):
+        os.makedirs(vis_dir)
+        fourcc = cv2.VideoWriter_fourcc(*'MP4V')
+        vout = cv2.VideoWriter(
+            os.path.join(vis_dir, 'vis.mp4'), fourcc, fps,
+            (int(1600 / scale_factor * 3),
+             int(900 / scale_factor * 2 + canva_size)))
+
+    draw_boxes_indexes_bev = [(0, 1), (1, 2), (2, 3), (3, 0)]
+    draw_boxes_indexes_img_view = [(0, 1), (1, 2), (2, 3), (3, 0), (4, 5),
+                                   (5, 6), (6, 7), (7, 4), (0, 4), (1, 5),
+                                   (2, 6), (3, 7)]
+    views = [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ]
+    dataset_dict = {}
+    for sample in dataset['infos']:
+        if sample['token'] not in dataset_dict:
+            dataset_dict[sample['token']] = sample
+    for cnt, rst_token in enumerate(res['results']):
+        if cnt >= vis_frames:
+            break
+        # collect instances
+        pred_res = res['results'][rst_token]
+        infos = dataset_dict[rst_token]
+        pred_boxes = [
+            pred_res[rid]['translation'] + pred_res[rid]['size'] + [
+                Quaternion(pred_res[rid]['rotation']).yaw_pitch_roll[0]
+                + np.pi / 2
+            ] for rid in range(len(pred_res))
+        ]
+        if len(pred_boxes) == 0:
+            corners_lidar = np.zeros((0, 3), dtype=np.float32)
+        else:
+            pred_boxes = np.array(pred_boxes, dtype=np.float32)
+            boxes = LB(pred_boxes, origin=(0.5, 0.5, 0.5))
+            corners_global = boxes.corners.numpy().reshape(-1, 3)
+            corners_global = np.concatenate(
+                [corners_global,
+                 np.ones([corners_global.shape[0], 1])],
+                axis=1)
+            l2g = get_lidar2global(infos)
+            corners_lidar = corners_global @ np.linalg.inv(l2g).T
+            corners_lidar = corners_lidar[:, :3]
+        pred_flag = np.ones((corners_lidar.shape[0] // 8, ), dtype=np.bool)
+        scores = [
+            pred_res[rid]['detection_score'] for rid in range(len(pred_res))
+        ]
+        if draw_gt:
+            gt_boxes = infos['gt_boxes']
+            gt_boxes[:, -1] = gt_boxes[:, -1] + np.pi / 2
+            width = gt_boxes[:, 4].copy()
+            gt_boxes[:, 4] = gt_boxes[:, 3]
+            gt_boxes[:, 3] = width
+            corners_lidar_gt = \
+                LB(infos['gt_boxes'],
+                   origin=(0.5, 0.5, 0.5)).corners.numpy().reshape(-1, 3)
+            corners_lidar = np.concatenate([corners_lidar, corners_lidar_gt],
+                                           axis=0)
+            gt_flag = np.ones((corners_lidar_gt.shape[0] // 8), dtype=np.bool)
+            pred_flag = np.concatenate(
+                [pred_flag, np.logical_not(gt_flag)], axis=0)
+            scores = scores + [0 for _ in range(infos['gt_boxes'].shape[0])]
+        scores = np.array(scores, dtype=np.float32)
+        sort_ids = np.argsort(scores)
+
+        # image view
+        imgs = []
+        for view in views:
+            img = cv2.imread(infos['cams'][view]['data_path'])
+            # draw instances
+            corners_img, valid = lidar2img(corners_lidar, infos['cams'][view])
+            valid = np.logical_and(
+                valid,
+                check_point_in_img(corners_img, img.shape[0], img.shape[1]))
+            valid = valid.reshape(
+                -1, 8)  # valid means: d>0 and visible in current view
+            corners_img = corners_img.reshape(-1, 8, 2).astype(np.int)
+            for aid in range(valid.shape[0]):
+                if scores[aid] < vis_thred and pred_flag[aid]:
+                    continue
+                for index in draw_boxes_indexes_img_view:
+                    if valid[aid, index[0]] and valid[aid, index[1]]:
+                        cv2.line(
+                            img,
+                            corners_img[aid, index[0]],
+                            corners_img[aid, index[1]],
+                            color=color_map[int(pred_flag[aid])],
+                            thickness=scale_factor)
+            imgs.append(img)
+
+        # bird-eye-view
+        canvas = np.zeros((int(canva_size), int(canva_size), 3),
+                          dtype=np.uint8)
+        # draw lidar points
+        lidar_points = np.fromfile(infos['lidar_path'], dtype=np.float32)
+        lidar_points = lidar_points.reshape(-1, 5)[:, :3]
+        lidar_points[:, 1] = -lidar_points[:, 1]
+        lidar_points[:, :2] = \
+            (lidar_points[:, :2] + show_range) / show_range / 2.0 * canva_size
+        for p in lidar_points:
+            if check_point_in_img(
+                    p.reshape(1, 3), canvas.shape[1], canvas.shape[0])[0]:
+                color = depth2color(p[2])
+                cv2.circle(
+                    canvas, (int(p[0]), int(p[1])),
+                    radius=0,
+                    color=color,
+                    thickness=1)
+
+        # draw instances
+        corners_lidar = corners_lidar.reshape(-1, 8, 3)
+        corners_lidar[:, :, 1] = -corners_lidar[:, :, 1]
+        bottom_corners_bev = corners_lidar[:, [0, 3, 7, 4], :2]
+        bottom_corners_bev = \
+            (bottom_corners_bev + show_range) / show_range / 2.0 * canva_size
+        bottom_corners_bev = np.round(bottom_corners_bev).astype(np.int32)
+        center_bev = corners_lidar[:, [0, 3, 7, 4], :2].mean(axis=1)
+        head_bev = corners_lidar[:, [0, 4], :2].mean(axis=1)
+        canter_canvas = \
+            (center_bev + show_range) / show_range / 2.0 * canva_size
+        center_canvas = canter_canvas.astype(np.int32)
+        head_canvas = (head_bev + show_range) / show_range / 2.0 * canva_size
+        head_canvas = head_canvas.astype(np.int32)
+
+        for rid in sort_ids:
+            score = scores[rid]
+            if score < vis_thred and pred_flag[rid]:
+                continue
+            score = min(score * 2.0, 1.0) if pred_flag[rid] else 1.0
+            color = color_map[int(pred_flag[rid])]
+            for index in draw_boxes_indexes_bev:
+                cv2.line(
+                    canvas,
+                    bottom_corners_bev[rid, index[0]],
+                    bottom_corners_bev[rid, index[1]],
+                    [color[0] * score, color[1] * score, color[2] * score],
+                    thickness=1)
+            cv2.line(
+                canvas,
+                center_canvas[rid],
+                head_canvas[rid],
+                [color[0] * score, color[1] * score, color[2] * score],
+                1,
+                lineType=8)
+
+        # fuse image-view and bev
+        img = np.zeros((900 * 2 + canva_size * scale_factor, 1600 * 3, 3),
+                       dtype=np.uint8)
+        img[:900, :, :] = np.concatenate(imgs[:3], axis=1)
+        img_back = np.concatenate(
+            [imgs[3][:, ::-1, :], imgs[4][:, ::-1, :], imgs[5][:, ::-1, :]],
+            axis=1)
+        img[900 + canva_size * scale_factor:, :, :] = img_back
+        img = cv2.resize(img, (int(1600 / scale_factor * 3),
+                               int(900 / scale_factor * 2 + canva_size)))
+        w_begin = int((1600 * 3 / scale_factor - canva_size) // 2)
+        img[int(900 / scale_factor):int(900 / scale_factor) + canva_size,
+            w_begin:w_begin + canva_size, :] = canvas
+
+        if save_format == 'image':
+            img_list += [img]
+        elif save_format == 'video':
+            vout.write(img)
+    if save_format == 'video':
+        vout.release()
+    return img_list
diff --git a/modelscope/models/cv/ocr_recognition/__init__.py b/modelscope/models/cv/ocr_recognition/__init__.py
new file mode 100644
index 00000000..39c09384
--- /dev/null
+++ b/modelscope/models/cv/ocr_recognition/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .model import OCRRecognition
+
+else:
+    _import_structure = {
+        'model': ['OCRRecognition'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/ocr_recognition/model.py b/modelscope/models/cv/ocr_recognition/model.py
new file mode 100644
index 00000000..7d76f8e8
--- /dev/null
+++ b/modelscope/models/cv/ocr_recognition/model.py
@@ -0,0 +1,109 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+
+import torch
+import torch.nn.functional as F
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .modules.convnextvit import ConvNextViT
+from .modules.crnn import CRNN
+
+LOGGER = get_logger()
+
+
+@MODELS.register_module(
+    Tasks.ocr_recognition, module_name=Models.ocr_recognition)
+class OCRRecognition(TorchModel):
+
+    def __init__(self, model_dir: str, **kwargs):
+        """initialize the ocr recognition model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+        """
+        super().__init__(model_dir, **kwargs)
+
+        model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+        cfgs = Config.from_file(
+            os.path.join(model_dir, ModelFile.CONFIGURATION))
+        self.do_chunking = cfgs.model.inference_kwargs.do_chunking
+        self.recognizer = None
+        if cfgs.model.recognizer == 'ConvNextViT':
+            self.recognizer = ConvNextViT()
+        elif cfgs.model.recognizer == 'CRNN':
+            self.recognizer = CRNN()
+        else:
+            raise TypeError(
+                f'recognizer should be either ConvNextViT, CRNN, but got {cfgs.model.recognizer}'
+            )
+        if model_path != '':
+            self.recognizer.load_state_dict(
+                torch.load(model_path, map_location='cpu'))
+
+        dict_path = os.path.join(model_dir, ModelFile.VOCAB_FILE)
+        self.labelMapping = dict()
+        with open(dict_path, 'r', encoding='utf-8') as f:
+            lines = f.readlines()
+            cnt = 1
+            for line in lines:
+                line = line.strip('\n')
+                self.labelMapping[cnt] = line
+                cnt += 1
+
+    def forward(self, inputs):
+        """
+        Args:
+            img (`torch.Tensor`): batched image tensor,
+                shape of each tensor is [N, 1, H, W].
+
+        Return:
+            `probs [T, N, Classes] of the sequence feature`
+        """
+        return self.recognizer(inputs)
+
+    def postprocess(self, inputs):
+        # naive decoder
+        if self.do_chunking:
+            preds = inputs
+            batchSize, length = preds.shape
+            PRED_LENTH = 75
+            PRED_PAD = 6
+            pred_idx = []
+            if batchSize == 1:
+                pred_idx = preds[0].cpu().data.tolist()
+            else:
+                for idx in range(batchSize):
+                    if idx == 0:
+                        pred_idx.extend(
+                            preds[idx].cpu().data[:PRED_LENTH
+                                                  - PRED_PAD].tolist())
+                    elif idx == batchSize - 1:
+                        pred_idx.extend(
+                            preds[idx].cpu().data[PRED_PAD:].tolist())
+                    else:
+                        pred_idx.extend(
+                            preds[idx].cpu().data[PRED_PAD:PRED_LENTH
+                                                  - PRED_PAD].tolist())
+            pred_idx = [its - 1 for its in pred_idx if its > 0]
+        else:
+            outprobs = inputs
+            outprobs = F.softmax(outprobs, dim=-1)
+            preds = torch.argmax(outprobs, -1)
+            length, batchSize = preds.shape
+            assert batchSize == 1, 'only support onesample inference'
+            pred_idx = preds[:, 0].cpu().data.tolist()
+
+        pred_idx = pred_idx
+        last_p = 0
+        str_pred = []
+        for p in pred_idx:
+            if p != last_p and p != 0:
+                str_pred.append(self.labelMapping[p])
+            last_p = p
+        final_str = ''.join(str_pred)
+        return final_str
diff --git a/modelscope/models/cv/ocr_recognition/modules/__init__.py b/modelscope/models/cv/ocr_recognition/modules/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/ocr_recognition/modules/convnext.py b/modelscope/models/cv/ocr_recognition/modules/convnext.py
new file mode 100644
index 00000000..c0e30616
--- /dev/null
+++ b/modelscope/models/cv/ocr_recognition/modules/convnext.py
@@ -0,0 +1,163 @@
+# Part of the implementation is borrowed and modified from ConvNext,
+# publicly available at https://github.com/facebookresearch/ConvNeXt
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .timm_tinyc import DropPath
+
+
+class Block(nn.Module):
+    r""" ConvNeXt Block. There are two equivalent implementations:
+    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
+    (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
+    We use (2) as we find it slightly faster in PyTorch
+
+    Args:
+        dim (int): Number of input channels.
+        drop_path (float): Stochastic depth rate. Default: 0.0
+        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
+    """
+
+    def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
+        super().__init__()
+        self.dwconv = nn.Conv2d(
+            dim, dim, kernel_size=7, padding=3, groups=dim)  # depthwise conv
+        self.norm = LayerNorm(dim, eps=1e-6)
+        self.pwconv1 = nn.Linear(
+            dim,
+            4 * dim)  # pointwise/1x1 convs, implemented with linear layers
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(4 * dim, dim)
+        self.gamma = nn.Parameter(
+            layer_scale_init_value * torch.ones((dim)),
+            requires_grad=True) if layer_scale_init_value > 0 else None
+        self.drop_path = DropPath(
+            drop_path) if drop_path > 0. else nn.Identity()
+
+    def forward(self, x):
+        input = x
+        x = self.dwconv(x)
+        x = x.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
+        x = self.norm(x)
+        x = self.pwconv1(x)
+        x = self.act(x)
+        x = self.pwconv2(x)
+        if self.gamma is not None:
+            x = self.gamma * x
+        x = x.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
+
+        x = input + self.drop_path(x)
+        return x
+
+
+class ConvNeXt(nn.Module):
+    r""" ConvNeXt
+        A PyTorch impl of : `A ConvNet for the 2020s`  -
+          https://arxiv.org/pdf/2201.03545.pdf
+
+    Args:
+        in_chans (int): Number of input image channels. Default: 3
+        num_classes (int): Number of classes for classification head. Default: 1000
+        depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
+        dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
+        drop_path_rate (float): Stochastic depth rate. Default: 0.
+        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
+        head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
+    """
+
+    def __init__(
+        self,
+        in_chans=1,
+        num_classes=1000,
+        depths=[3, 3, 9, 3],
+        dims=[96, 192, 384, 768],
+        drop_path_rate=0.,
+        layer_scale_init_value=1e-6,
+        head_init_scale=1.,
+    ):
+        super().__init__()
+
+        self.downsample_layers = nn.ModuleList(
+        )  # stem and 3 intermediate downsampling conv layers
+        stem = nn.Sequential(
+            nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
+            LayerNorm(dims[0], eps=1e-6, data_format='channels_first'))
+        self.downsample_layers.append(stem)
+        for i in range(3):
+            downsample_layer = nn.Sequential(
+                LayerNorm(dims[i], eps=1e-6, data_format='channels_first'),
+                nn.Conv2d(
+                    dims[i], dims[i + 1], kernel_size=(2, 1), stride=(2, 1)),
+            )
+            self.downsample_layers.append(downsample_layer)
+
+        self.stages = nn.ModuleList(
+        )  # 4 feature resolution stages, each consisting of multiple residual blocks
+        dp_rates = [
+            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
+        ]
+        cur = 0
+        for i in range(4):
+            stage = nn.Sequential(*[
+                Block(
+                    dim=dims[i],
+                    drop_path=dp_rates[cur + j],
+                    layer_scale_init_value=layer_scale_init_value)
+                for j in range(depths[i])
+            ])
+            self.stages.append(stage)
+            cur += depths[i]
+
+    def _init_weights(self, m):
+        if isinstance(m, (nn.Conv2d, nn.Linear)):
+            trunc_normal_(m.weight, std=.02)
+            nn.init.constant_(m.bias, 0)
+
+    def forward_features(self, x):
+        for i in range(4):
+            x = self.downsample_layers[i](x.contiguous())
+            x = self.stages[i](x.contiguous())
+        return x  # global average pooling, (N, C, H, W) -> (N, C)
+
+    def forward(self, x):
+        x = self.forward_features(x.contiguous())
+
+        return x.contiguous()
+
+
+class LayerNorm(nn.Module):
+    r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
+    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
+    shape (batch_size, height, width, channels) while channels_first corresponds to inputs
+    with shape (batch_size, channels, height, width).
+    """
+
+    def __init__(self,
+                 normalized_shape,
+                 eps=1e-6,
+                 data_format='channels_last'):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(normalized_shape))
+        self.bias = nn.Parameter(torch.zeros(normalized_shape))
+        self.eps = eps
+        self.data_format = data_format
+        if self.data_format not in ['channels_last', 'channels_first']:
+            raise NotImplementedError
+        self.normalized_shape = (normalized_shape, )
+
+    def forward(self, x):
+        if self.data_format == 'channels_last':
+            return F.layer_norm(x, self.normalized_shape, self.weight,
+                                self.bias, self.eps)
+        elif self.data_format == 'channels_first':
+            u = x.mean(1, keepdim=True)
+            s = (x - u).pow(2).mean(1, keepdim=True)
+            x = (x - u) / torch.sqrt(s + self.eps)
+            x = self.weight[:, None, None] * x + self.bias[:, None, None]
+            return x
+
+
+def convnext_tiny():
+    model = ConvNeXt(depths=[3, 3, 8, 3], dims=[96, 192, 256, 512])
+    return model
diff --git a/modelscope/models/cv/ocr_recognition/modules/convnextvit.py b/modelscope/models/cv/ocr_recognition/modules/convnextvit.py
new file mode 100644
index 00000000..aaedb697
--- /dev/null
+++ b/modelscope/models/cv/ocr_recognition/modules/convnextvit.py
@@ -0,0 +1,23 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import torch
+import torch.nn as nn
+
+from .convnext import convnext_tiny
+from .vitstr import vitstr_tiny
+
+
+class ConvNextViT(nn.Module):
+
+    def __init__(self):
+        super(ConvNextViT, self).__init__()
+        self.cnn_model = convnext_tiny()
+        self.vitstr = vitstr_tiny(num_tokens=7644)
+
+    def forward(self, input):
+        """ Transformation stage """
+        features = self.cnn_model(input)
+        prediction = self.vitstr(features)
+        prediction = torch.nn.functional.softmax(prediction, dim=-1)
+
+        output = torch.argmax(prediction, -1)
+        return output
diff --git a/modelscope/models/cv/ocr_recognition/modules/crnn.py b/modelscope/models/cv/ocr_recognition/modules/crnn.py
new file mode 100644
index 00000000..e0e489e9
--- /dev/null
+++ b/modelscope/models/cv/ocr_recognition/modules/crnn.py
@@ -0,0 +1,99 @@
+# Part of the implementation is borrowed and modified from CRNN,
+# publicly available at https://github.com/meijieru/crnn.pytorch
+# paper linking at https://arxiv.org/pdf/1507.05717.pdf
+import torch
+import torch.nn as nn
+
+
+class BidirectionalLSTM(nn.Module):
+
+    def __init__(self, nIn, nHidden, nOut):
+        super(BidirectionalLSTM, self).__init__()
+        self.rnn = nn.LSTM(nIn, nHidden, bidirectional=True)
+        self.embedding = nn.Linear(nHidden * 2, nOut)
+
+    def forward(self, input):
+        recurrent, _ = self.rnn(input)
+        T, b, h = recurrent.size()
+        t_rec = recurrent.view(T * b, h)
+
+        output = self.embedding(t_rec)  # [T * b, nOut]
+        output = output.view(T, b, -1)
+        return output
+
+
+class CRNN(nn.Module):
+
+    def __init__(self):
+        super(CRNN, self).__init__()
+        self.conv0 = nn.Sequential(
+            nn.Conv2d(
+                1, 64, kernel_size=(3, 3), padding=(1, 1), stride=(1, 1)),
+            nn.BatchNorm2d(64),
+            nn.ReLU(inplace=True),
+        )
+        self.p0 = nn.MaxPool2d(
+            kernel_size=(2, 2), stride=(2, 2), padding=(0, 0))
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(
+                64, 128, kernel_size=(3, 3), padding=(1, 1), stride=(1, 1)),
+            nn.BatchNorm2d(128),
+            nn.ReLU(inplace=True),
+        )
+        self.p1 = nn.MaxPool2d(
+            kernel_size=(2, 2), stride=(2, 2), padding=(0, 0))
+        self.conv2 = nn.Sequential(
+            nn.Conv2d(
+                128, 256, kernel_size=(3, 3), padding=(1, 1), stride=(1, 1)),
+            nn.BatchNorm2d(256),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(
+                256, 256, kernel_size=(3, 3), padding=(1, 1), stride=(1, 1)),
+            nn.BatchNorm2d(256),
+            nn.ReLU(inplace=True),
+        )
+        self.p2 = nn.MaxPool2d(
+            kernel_size=(2, 1), stride=(2, 1), padding=(0, 0))
+        self.conv3 = nn.Sequential(
+            nn.Conv2d(
+                256, 512, kernel_size=(3, 3), padding=(1, 1), stride=(1, 1)),
+            nn.BatchNorm2d(512),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(
+                512, 512, kernel_size=(3, 3), padding=(1, 1), stride=(1, 1)),
+            nn.BatchNorm2d(512),
+            nn.ReLU(inplace=True),
+        )
+        self.p3 = nn.MaxPool2d(
+            kernel_size=(2, 1), stride=(2, 1), padding=(0, 0))
+        self.conv4 = nn.Sequential(
+            nn.Conv2d(
+                512, 512, kernel_size=(2, 1), padding=(0, 0), stride=(2, 1)),
+            nn.BatchNorm2d(512),
+            nn.ReLU(inplace=True),
+        )
+
+        self.rnn = nn.Sequential(
+            BidirectionalLSTM(512, 256, 256), BidirectionalLSTM(256, 256, 512))
+
+        self.cls = nn.Linear(512, 7644, bias=False)
+
+    def forward(self, input):
+        feats = self.conv0(input)
+        feats = self.p0(feats)
+        feats = self.conv1(feats)
+        feats = self.p1(feats)
+        feats = self.conv2(feats)
+        feats = self.p2(feats)
+        feats = self.conv3(feats)
+        feats = self.p3(feats)
+        convfeats = self.conv4(feats)
+
+        b, c, h, w = convfeats.size()
+        assert h == 1, 'the height of conv must be 1'
+        convfeats = convfeats.squeeze(2)
+        convfeats = convfeats.permute(2, 0, 1)  # [w, b, c]
+
+        rnnfeats = self.rnn(convfeats)
+        output = self.cls(rnnfeats)
+        return output
diff --git a/modelscope/models/cv/ocr_recognition/modules/timm_tinyc.py b/modelscope/models/cv/ocr_recognition/modules/timm_tinyc.py
new file mode 100644
index 00000000..555b1e42
--- /dev/null
+++ b/modelscope/models/cv/ocr_recognition/modules/timm_tinyc.py
@@ -0,0 +1,332 @@
+# Part of the implementation is borrowed and modified from timm,
+# publicly available at https://github.com/rwightman/pytorch-image-models
+import collections.abc
+import logging
+import math
+from collections import OrderedDict
+from copy import deepcopy
+from functools import partial
+from itertools import repeat
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def _ntuple(n):
+
+    def parse(x):
+        if isinstance(x, collections.abc.Iterable):
+            return x
+        return tuple(repeat(x, n))
+
+    return parse
+
+
+class PatchEmbed(nn.Module):
+    """ 2D Image to Patch Embedding
+    """
+
+    def __init__(self,
+                 img_size=224,
+                 patch_size=16,
+                 in_chans=3,
+                 embed_dim=768,
+                 norm_layer=None,
+                 flatten=True):
+        super().__init__()
+        img_size = (1, 75)
+        to_2tuple = _ntuple(2)
+        patch_size = to_2tuple(patch_size)
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.grid_size = (img_size[0] // patch_size[0],
+                          img_size[1] // patch_size[1])
+        self.num_patches = self.grid_size[0] * self.grid_size[1]
+        self.flatten = flatten
+
+        self.proj = nn.Conv2d(
+            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+
+    def forward(self, x):
+        B, C, H, W = x.shape
+        assert H == self.img_size[0] and W == self.img_size[1], \
+            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+        x = self.proj(x)
+        x = x.permute(0, 1, 3, 2)
+        if self.flatten:
+            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
+        x = self.norm(x)
+        return x
+
+
+class Mlp(nn.Module):
+    """ MLP as used in Vision Transformer, MLP-Mixer and related networks
+    """
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+def drop_path(x, drop_prob: float = 0., training: bool = False):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
+    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
+    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
+    'survival rate' as the argument.
+
+    """
+    if drop_prob == 0. or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0], ) + (1, ) * (
+        x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(
+        shape, dtype=x.dtype, device=x.device)
+    random_tensor.floor_()  # binarize
+    output = x.div(keep_prob) * random_tensor
+    return output
+
+
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
+    """
+
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)
+
+
+class Attention(nn.Module):
+
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 attn_drop=0.1,
+                 proj_drop=0.1):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x):
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
+                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[
+            2]  # make torchscript happy (cannot use tensor as tuple)
+
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class Block(nn.Module):
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop)
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path = DropPath(
+            drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop)
+
+    def forward(self, x):
+        x = x + self.drop_path(self.attn(self.norm1(x)))
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+
+
+class VisionTransformer(nn.Module):
+
+    def __init__(self,
+                 img_size=224,
+                 patch_size=16,
+                 in_chans=3,
+                 num_classes=1000,
+                 embed_dim=768,
+                 depth=12,
+                 num_heads=12,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 representation_size=None,
+                 distilled=False,
+                 drop_rate=0.1,
+                 attn_drop_rate=0.1,
+                 drop_path_rate=0.,
+                 embed_layer=PatchEmbed,
+                 norm_layer=None,
+                 act_layer=None,
+                 weight_init=''):
+        """
+        Args:
+            img_size (int, tuple): input image size
+            patch_size (int, tuple): patch size
+            in_chans (int): number of input channels
+            num_classes (int): number of classes for classification head
+            embed_dim (int): embedding dimension
+            depth (int): depth of transformer
+            num_heads (int): number of attention heads
+            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
+            qkv_bias (bool): enable bias for qkv if True
+            representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
+            distilled (bool): model includes a distillation token and head as in DeiT models
+            drop_rate (float): dropout rate
+            attn_drop_rate (float): attention dropout rate
+            drop_path_rate (float): stochastic depth rate
+            embed_layer (nn.Module): patch embedding layer
+            norm_layer: (nn.Module): normalization layer
+            weight_init: (str): weight init scheme
+        """
+        super().__init__()
+        self.num_classes = num_classes
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.num_tokens = 2 if distilled else 1
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+        act_layer = act_layer or nn.GELU
+
+        self.patch_embed = embed_layer(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim)
+        num_patches = self.patch_embed.num_patches
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        self.dist_token = nn.Parameter(torch.zeros(
+            1, 1, embed_dim)) if distilled else None
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
+               ]  # stochastic depth decay rule
+        self.blocks = nn.Sequential(*[
+            Block(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                act_layer=act_layer) for i in range(depth)
+        ])
+        self.norm = norm_layer(embed_dim)
+
+        # Representation layer
+        if representation_size and not distilled:
+            self.num_features = representation_size
+            self.pre_logits = nn.Sequential(
+                OrderedDict([('fc', nn.Linear(embed_dim, representation_size)),
+                             ('act', nn.Tanh())]))
+        else:
+            self.pre_logits = nn.Identity()
+
+        # Classifier head(s)
+        self.head = nn.Linear(
+            self.num_features,
+            num_classes) if num_classes > 0 else nn.Identity()
+        self.head_dist = None
+        if distilled:
+            self.head_dist = nn.Linear(
+                self.embed_dim,
+                self.num_classes) if num_classes > 0 else nn.Identity()
+
+    def reset_classifier(self, num_classes, global_pool=''):
+        self.num_classes = num_classes
+        self.head = nn.Linear(
+            self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+        if self.num_tokens == 2:
+            self.head_dist = nn.Linear(
+                self.embed_dim,
+                self.num_classes) if num_classes > 0 else nn.Identity()
+
+    def forward_features(self, x):
+        x = self.patch_embed(x)
+        cls_token = self.cls_token.expand(
+            x.shape[0], -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
+        if self.dist_token is None:
+            x = torch.cat((cls_token, x), dim=1)
+        else:
+            x = torch.cat(
+                (cls_token, self.dist_token.expand(x.shape[0], -1, -1), x),
+                dim=1)
+        x = self.pos_drop(x + self.pos_embed)
+        x = self.blocks(x)
+        x = self.norm(x)
+        if self.dist_token is None:
+            return self.pre_logits(x[:, 0])
+        else:
+            return x[:, 0], x[:, 1]
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        if self.head_dist is not None:
+            x, x_dist = self.head(x[0]), self.head_dist(
+                x[1])  # x must be a tuple
+            if self.training and not torch.jit.is_scripting():
+                # during inference, return the average of both classifier predictions
+                return x, x_dist
+            else:
+                return (x + x_dist) / 2
+        else:
+            x = self.head(x)
+        return x
diff --git a/modelscope/models/cv/ocr_recognition/modules/vitstr.py b/modelscope/models/cv/ocr_recognition/modules/vitstr.py
new file mode 100644
index 00000000..5ce3aeca
--- /dev/null
+++ b/modelscope/models/cv/ocr_recognition/modules/vitstr.py
@@ -0,0 +1,58 @@
+# Part of the implementation is borrowed and modified from ViTSTR,
+# publicly available at https://github.com/roatienza/deep-text-recognition-benchmark
+from __future__ import absolute_import, division, print_function
+import logging
+from copy import deepcopy
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.utils.model_zoo as model_zoo
+
+from .timm_tinyc import VisionTransformer
+
+
+class ViTSTR(VisionTransformer):
+    '''
+    ViTSTR is basically a ViT that uses DeiT weights.
+    Modified head to support a sequence of characters prediction for STR.
+    '''
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def reset_classifier(self, num_classes):
+        self.num_classes = num_classes
+        self.head = nn.Linear(
+            self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+    def forward_features(self, x):
+        x = self.patch_embed(x)
+
+        x = x + self.pos_embed
+        x = self.pos_drop(x)
+        for blk in self.blocks:
+            x = blk(x)
+
+        x = self.norm(x)
+        return x
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        b, s, e = x.size()
+        x = x.reshape(b * s, e)
+        x = self.head(x).view(b, s, self.num_classes)
+        return x
+
+
+def vitstr_tiny(num_tokens):
+    vitstr = ViTSTR(
+        patch_size=1,
+        in_chans=512,
+        embed_dim=192,
+        depth=12,
+        num_heads=3,
+        mlp_ratio=4,
+        qkv_bias=True)
+    vitstr.reset_classifier(num_classes=num_tokens)
+    return vitstr
diff --git a/modelscope/models/cv/ocr_recognition/preprocessor.py b/modelscope/models/cv/ocr_recognition/preprocessor.py
new file mode 100644
index 00000000..f47dcaef
--- /dev/null
+++ b/modelscope/models/cv/ocr_recognition/preprocessor.py
@@ -0,0 +1,104 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+
+import cv2
+import numpy as np
+import PIL
+import torch
+
+from modelscope.metainfo import Preprocessors
+from modelscope.preprocessors import Preprocessor, load_image
+from modelscope.preprocessors.builder import PREPROCESSORS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import Fields, ModeKeys, ModelFile
+
+
+@PREPROCESSORS.register_module(
+    Fields.cv, module_name=Preprocessors.ocr_recognition)
+class OCRRecognitionPreprocessor(Preprocessor):
+
+    def __init__(self, model_dir: str, mode: str = ModeKeys.INFERENCE):
+        """The base constructor for all ocr recognition preprocessors.
+
+        Args:
+            model_dir (str): model directory to initialize some resource
+            mode: The mode for the preprocessor.
+        """
+        super().__init__(mode)
+        cfgs = Config.from_file(
+            os.path.join(model_dir, ModelFile.CONFIGURATION))
+        self.do_chunking = cfgs.model.inference_kwargs.do_chunking
+        self.target_height = cfgs.model.inference_kwargs.img_height
+        self.target_width = cfgs.model.inference_kwargs.img_width
+
+    def keepratio_resize(self, img):
+        cur_ratio = img.shape[1] / float(img.shape[0])
+        mask_height = self.target_height
+        mask_width = self.target_width
+        if cur_ratio > float(self.target_width) / self.target_height:
+            cur_target_height = self.target_height
+            cur_target_width = self.target_width
+        else:
+            cur_target_height = self.target_height
+            cur_target_width = int(self.target_height * cur_ratio)
+        img = cv2.resize(img, (cur_target_width, cur_target_height))
+        mask = np.zeros([mask_height, mask_width]).astype(np.uint8)
+        mask[:img.shape[0], :img.shape[1]] = img
+        img = mask
+        return img
+
+    def __call__(self, inputs):
+        """process the raw input data
+        Args:
+            inputs:
+                - A string containing an HTTP link pointing to an image
+                - A string containing a local path to an image
+                - An image loaded in PIL or opencv directly
+        Returns:
+            outputs: the preprocessed image
+        """
+        if isinstance(inputs, str):
+            img = np.array(load_image(inputs).convert('L'))
+        elif isinstance(inputs, PIL.Image.Image):
+            img = np.array(inputs.convert('L'))
+        elif isinstance(inputs, np.ndarray):
+            if len(inputs.shape) == 3:
+                img = cv2.cvtColor(inputs, cv2.COLOR_RGB2GRAY)
+        else:
+            raise TypeError(
+                f'inputs should be either str, PIL.Image, np.array, but got {type(inputs)}'
+            )
+
+        if self.do_chunking:
+            data = []
+            img_h, img_w = img.shape
+            wh_ratio = img_w / img_h
+            true_w = int(self.target_height * wh_ratio)
+            split_batch_cnt = 1
+            if true_w < self.target_width * 1.2:
+                img = cv2.resize(
+                    img, (min(true_w, self.target_width), self.target_height))
+            else:
+                split_batch_cnt = math.ceil((true_w - 48) * 1.0 / 252)
+                img = cv2.resize(img, (true_w, self.target_height))
+
+            if split_batch_cnt == 1:
+                mask = np.zeros((self.target_height, self.target_width))
+                mask[:, :img.shape[1]] = img
+                data.append(mask)
+            else:
+                for idx in range(split_batch_cnt):
+                    mask = np.zeros((self.target_height, self.target_width))
+                    left = (PRED_LENTH * 4 - PRED_PAD * 4) * idx
+                    trunk_img = img[:, left:min(left + PRED_LENTH * 4, true_w)]
+                    mask[:, :trunk_img.shape[1]] = trunk_img
+                    data.append(mask)
+
+            data = torch.FloatTensor(data).view(
+                len(data), 1, self.target_height, self.target_width) / 255.
+        else:
+            data = self.keepratio_resize(img)
+            data = torch.FloatTensor(data).view(1, 1, self.target_height,
+                                                self.target_width) / 255.
+        return data
diff --git a/modelscope/models/cv/open_vocabulary_detection_vild/__init__.py b/modelscope/models/cv/open_vocabulary_detection_vild/__init__.py
new file mode 100644
index 00000000..ba7711cc
--- /dev/null
+++ b/modelscope/models/cv/open_vocabulary_detection_vild/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .vild import OpenVocabularyDetectionViLD
+
+else:
+    _import_structure = {
+        'vild': ['OpenVocabularyDetectionViLD'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/open_vocabulary_detection_vild/vild.py b/modelscope/models/cv/open_vocabulary_detection_vild/vild.py
new file mode 100644
index 00000000..999ec27a
--- /dev/null
+++ b/modelscope/models/cv/open_vocabulary_detection_vild/vild.py
@@ -0,0 +1,390 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import os
+from typing import Any, Dict, Union
+
+import clip
+import numpy as np
+import tensorflow.compat.v1 as tf
+import torch.cuda
+from scipy.special import softmax
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor
+from modelscope.models.base.base_model import Model
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@MODELS.register_module(
+    Tasks.open_vocabulary_detection,
+    module_name=Models.open_vocabulary_detection_vild)
+class OpenVocabularyDetectionViLD(Model):
+    """
+    Vild: Open-Vocabulary Detection via Vision and Language Knowledge Distillation
+    https://arxiv.org/abs/2104.13921
+    """
+
+    def __init__(self, model_dir, *args, **kwargs):
+        self.model_dir = model_dir
+        device_name = kwargs.get('device', 'gpu')
+        self._device_name = device_name
+
+        model_path = os.path.join(model_dir, ModelFile.TF_GRAPH_FILE)
+        # model_path = os.path.join(model_dir, 'test_out.pb')
+        graph = tf.Graph()
+        with graph.as_default():
+            config = tf.ConfigProto()
+            config.gpu_options.per_process_gpu_memory_fraction = 0.2
+            compute_graph = tf.Graph()
+            compute_graph.as_default()
+            sess = tf.Session(config=config)
+
+            with tf.gfile.GFile(model_path, 'rb') as fid:
+                graph_def = tf.GraphDef()
+                graph_def.ParseFromString(fid.read())
+                tf.import_graph_def(graph_def, name='')
+        self.sess = sess
+
+        #
+        # clip.available_models()
+        self.clip, self.clip_preprocess = clip.load(
+            'ViT-B/32', device='cuda:0')
+
+        self.prompt_engineering = True
+        self.this_is = True
+        self.temperature = 100.0
+        self.use_softmax = False
+        self.out_name = [
+            'RoiBoxes:0', 'RoiScores:0', '2ndStageBoxes:0',
+            '2ndStageScoresUnused:0', 'BoxOutputs:0', 'MaskOutputs:0',
+            'VisualFeatOutputs:0', 'ImageInfo:0'
+        ]
+
+    def __call__(self, *args, **kwargs) -> Dict[str, Any]:
+        return self.postprocess(self.forward(*args, **kwargs))
+
+    def forward(self, img: np.array, category_names: str,
+                **kwargs) -> Dict[str, Any]:
+        """
+        Run the forward pass for a model.
+
+        Returns:
+            Dict[str, Any]: output from the model forward pass
+        """
+        (roi_boxes, roi_scores, detection_boxes, scores_unused, box_outputs,
+         detection_masks, visual_features, image_info) = self.sess.run(
+             self.out_name, feed_dict={'Placeholder:0': img})
+        return_dict = {
+            'roi_boxes': roi_boxes,
+            'roi_scores': roi_scores,
+            'detection_boxes': detection_boxes,
+            'scores_unused': scores_unused,
+            'box_outputs': box_outputs,
+            'detection_masks': detection_masks,
+            'visual_features': visual_features,
+            'image_info': image_info,
+            'category_names': category_names
+        }
+        return return_dict
+
+    def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        """ Model specific postprocess and convert model output to
+        standard model outputs.
+
+        Args:
+            inputs:  input data
+
+        Return:
+            dict of results:  a dict containing outputs of model, each
+                output should have the standard output name.
+        """
+        max_boxes_to_return = 25
+        nms_threshold = 0.6
+        min_rpn_score_thresh = 0.9
+        min_box_area = 220
+
+        roi_boxes = inputs['roi_boxes']
+        roi_scores = inputs['roi_scores']
+        detection_boxes = inputs['detection_boxes']
+        scores_unused = inputs['scores_unused']
+        box_outputs = inputs['box_outputs']
+        detection_masks = inputs['detection_masks']
+        visual_features = inputs['visual_features']
+        image_info = inputs['image_info']
+        category_names = inputs['category_names']
+
+        #################################################################
+        # Preprocessing categories and get params
+        category_names = [x.strip() for x in category_names.split(';')]
+        category_names = ['background'] + category_names
+        categories = [{
+            'name': item,
+            'id': idx + 1,
+        } for idx, item in enumerate(category_names)]
+        # category_indices = {cat['id']: cat for cat in categories}
+
+        #################################################################
+        # Obtain results and read image
+
+        roi_boxes = np.squeeze(roi_boxes, axis=0)  # squeeze
+        # no need to clip the boxes, already done
+        roi_scores = np.squeeze(roi_scores, axis=0)
+
+        detection_boxes = np.squeeze(detection_boxes, axis=(0, 2))
+        scores_unused = np.squeeze(scores_unused, axis=0)
+        box_outputs = np.squeeze(box_outputs, axis=0)
+        detection_masks = np.squeeze(detection_masks, axis=0)
+        visual_features = np.squeeze(visual_features, axis=0)
+
+        # obtain image info
+        image_info = np.squeeze(image_info, axis=0)
+        image_scale = np.tile(image_info[2:3, :], (1, 2))
+
+        # rescale
+        rescaled_detection_boxes = detection_boxes / image_scale
+
+        #################################################################
+        # Filter boxes
+
+        # Apply non-maximum suppression to detected boxes with nms threshold.
+        nmsed_indices = nms(detection_boxes, roi_scores, thresh=nms_threshold)
+
+        # Compute RPN box size.
+        box_sizes = (rescaled_detection_boxes[:, 2]
+                     - rescaled_detection_boxes[:, 0]) * (
+                         rescaled_detection_boxes[:, 3]
+                         - rescaled_detection_boxes[:, 1])
+
+        # Filter out invalid rois (nmsed rois)
+        valid_indices = np.where(
+            np.logical_and(
+                np.isin(
+                    np.arange(len(roi_scores), dtype=np.int), nmsed_indices),
+                np.logical_and(
+                    np.logical_not(np.all(roi_boxes == 0., axis=-1)),
+                    np.logical_and(roi_scores >= min_rpn_score_thresh,
+                                   box_sizes > min_box_area))))[0]
+        # print('number of valid indices', len(valid_indices))
+
+        # detection_roi_scores = roi_scores[valid_indices][:max_boxes_to_return,
+        #                                                  ...]
+        detection_boxes = detection_boxes[valid_indices][:max_boxes_to_return,
+                                                         ...]
+        detection_masks = detection_masks[valid_indices][:max_boxes_to_return,
+                                                         ...]
+        detection_visual_feat = visual_features[
+            valid_indices][:max_boxes_to_return, ...]
+        rescaled_detection_boxes = rescaled_detection_boxes[
+            valid_indices][:max_boxes_to_return, ...]
+
+        #################################################################
+        # Compute text embeddings and detection scores, and rank results
+        text_features = self._build_text_embedings(categories)
+
+        raw_scores = detection_visual_feat.dot(text_features.T)
+        if self.use_softmax:
+            scores_all = softmax(self.temperature * raw_scores, axis=-1)
+        else:
+            scores_all = raw_scores
+
+        # Results are ranked by scores
+        indices = np.argsort(-np.max(scores_all, axis=1))
+        # indices_fg = np.array(
+        #     [i for i in indices if np.argmax(scores_all[i]) != 0])
+
+        #################################################################
+        # Plot detected boxes on the input image.
+        ymin, xmin, ymax, xmax = np.split(rescaled_detection_boxes, 4, axis=-1)
+        processed_boxes = np.concatenate([xmin, ymin, xmax, ymax], axis=-1)
+
+        n_boxes = processed_boxes.shape[0]
+        # print(rescaled_detection_boxes)
+
+        categories = []
+        bboxes = []
+        scores = []
+        labels = []
+
+        for anno_idx in indices[0:int(n_boxes)]:
+            anno_bbox = processed_boxes[anno_idx]
+            anno_scores = scores_all[anno_idx]
+
+            if np.argmax(anno_scores) == 0:
+                continue
+            bboxes.append(anno_bbox)
+            scores.append(anno_scores[1:])
+            categories.append(category_names[1:])
+            labels.append(np.argmax(anno_scores) - 1)
+        bboxes = np.vstack(bboxes)
+        scores = np.vstack(scores)
+
+        return scores, categories, bboxes
+
+    def _build_text_embedings(self, categories):
+
+        def processed_name(name, rm_dot=False):
+            # _ for lvis
+            # / for obj365
+            res = name.replace('_', ' ').replace('/', ' or ').lower()
+            if rm_dot:
+                res = res.rstrip('.')
+            return res
+
+        def article(name):
+            return 'an' if name[0] in 'aeiou' else 'a'
+
+        templates = multiple_templates
+
+        run_on_gpu = torch.cuda.is_available()
+
+        with torch.no_grad():
+            all_text_embeddings = []
+            # print('Building text embeddings...')
+            for category in categories:
+                texts = [
+                    template.format(
+                        processed_name(category['name'], rm_dot=True),
+                        article=article(category['name']))
+                    for template in templates
+                ]
+                if self.this_is:
+                    texts = [
+                        'This is ' + text if text.startswith('a')
+                        or text.startswith('the') else text for text in texts
+                    ]
+                # tokenize
+                texts = clip.tokenize(texts)
+                if run_on_gpu:
+                    texts = texts.cuda()
+                # embed with text encoder
+                text_embeddings = self.clip.encode_text(texts)
+                text_embeddings /= text_embeddings.norm(dim=-1, keepdim=True)
+                text_embedding = text_embeddings.mean(dim=0)
+                text_embedding /= text_embedding.norm()
+                all_text_embeddings.append(text_embedding)
+            all_text_embeddings = torch.stack(all_text_embeddings, dim=1)
+            if run_on_gpu:
+                all_text_embeddings = all_text_embeddings.cuda()
+        return all_text_embeddings.cpu().numpy().T
+
+
+multiple_templates = [
+    'There is {article} {} in the scene.',
+    'There is the {} in the scene.',
+    'a photo of {article} {} in the scene.',
+    'a photo of the {} in the scene.',
+    'a photo of one {} in the scene.',
+    'itap of {article} {}.',
+    'itap of my {}.',  # itap: I took a picture of
+    'itap of the {}.',
+    'a photo of {article} {}.',
+    'a photo of my {}.',
+    'a photo of the {}.',
+    'a photo of one {}.',
+    'a photo of many {}.',
+    'a good photo of {article} {}.',
+    'a good photo of the {}.',
+    'a bad photo of {article} {}.',
+    'a bad photo of the {}.',
+    'a photo of a nice {}.',
+    'a photo of the nice {}.',
+    'a photo of a cool {}.',
+    'a photo of the cool {}.',
+    'a photo of a weird {}.',
+    'a photo of the weird {}.',
+    'a photo of a small {}.',
+    'a photo of the small {}.',
+    'a photo of a large {}.',
+    'a photo of the large {}.',
+    'a photo of a clean {}.',
+    'a photo of the clean {}.',
+    'a photo of a dirty {}.',
+    'a photo of the dirty {}.',
+    'a bright photo of {article} {}.',
+    'a bright photo of the {}.',
+    'a dark photo of {article} {}.',
+    'a dark photo of the {}.',
+    'a photo of a hard to see {}.',
+    'a photo of the hard to see {}.',
+    'a low resolution photo of {article} {}.',
+    'a low resolution photo of the {}.',
+    'a cropped photo of {article} {}.',
+    'a cropped photo of the {}.',
+    'a close-up photo of {article} {}.',
+    'a close-up photo of the {}.',
+    'a jpeg corrupted photo of {article} {}.',
+    'a jpeg corrupted photo of the {}.',
+    'a blurry photo of {article} {}.',
+    'a blurry photo of the {}.',
+    'a pixelated photo of {article} {}.',
+    'a pixelated photo of the {}.',
+    'a black and white photo of the {}.',
+    'a black and white photo of {article} {}.',
+    'a plastic {}.',
+    'the plastic {}.',
+    'a toy {}.',
+    'the toy {}.',
+    'a plushie {}.',
+    'the plushie {}.',
+    'a cartoon {}.',
+    'the cartoon {}.',
+    'an embroidered {}.',
+    'the embroidered {}.',
+    'a painting of the {}.',
+    'a painting of a {}.',
+]
+
+
+def nms(dets, scores, thresh, max_dets=1000):
+    """Non-maximum suppression.
+    Args:
+        dets: [N, 4]
+        scores: [N,]
+        thresh: iou threshold. Float
+        max_dets: int.
+    """
+    y1 = dets[:, 0]
+    x1 = dets[:, 1]
+    y2 = dets[:, 2]
+    x2 = dets[:, 3]
+
+    areas = (x2 - x1) * (y2 - y1)
+    order = scores.argsort()[::-1]
+
+    keep = []
+    while order.size > 0 and len(keep) < max_dets:
+        i = order[0]
+        keep.append(i)
+
+        xx1 = np.maximum(x1[i], x1[order[1:]])
+        yy1 = np.maximum(y1[i], y1[order[1:]])
+        xx2 = np.minimum(x2[i], x2[order[1:]])
+        yy2 = np.minimum(y2[i], y2[order[1:]])
+
+        w = np.maximum(0.0, xx2 - xx1)
+        h = np.maximum(0.0, yy2 - yy1)
+        intersection = w * h
+        overlap = intersection / (
+            areas[i] + areas[order[1:]] - intersection + 1e-12)
+
+        inds = np.where(overlap <= thresh)[0]
+        order = order[inds + 1]
+    return keep
diff --git a/modelscope/models/cv/realtime_object_detection/realtime_detector.py b/modelscope/models/cv/realtime_object_detection/realtime_detector.py
deleted file mode 100644
index 2b4b3f8c..00000000
--- a/modelscope/models/cv/realtime_object_detection/realtime_detector.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import argparse
-import logging as logger
-import os
-import os.path as osp
-import time
-
-import cv2
-import json
-import torch
-
-from modelscope.metainfo import Models
-from modelscope.models.base.base_torch_model import TorchModel
-from modelscope.models.builder import MODELS
-from modelscope.preprocessors import LoadImage
-from modelscope.utils.config import Config
-from modelscope.utils.constant import ModelFile, Tasks
-from .yolox.data.data_augment import ValTransform
-from .yolox.exp import get_exp_by_name
-from .yolox.utils import postprocess
-
-
-@MODELS.register_module(
-    group_key=Tasks.image_object_detection,
-    module_name=Models.realtime_object_detection)
-class RealtimeDetector(TorchModel):
-
-    def __init__(self, model_dir: str, *args, **kwargs):
-        super().__init__(model_dir, *args, **kwargs)
-        self.config = Config.from_file(
-            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
-
-        # model type
-        self.exp = get_exp_by_name(self.config.model_type)
-
-        # build model
-        self.model = self.exp.get_model()
-        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
-        ckpt = torch.load(model_path, map_location='cpu')
-
-        # load the model state dict
-        self.model.load_state_dict(ckpt['model'])
-        self.model.eval()
-
-        # params setting
-        self.exp.num_classes = self.config.num_classes
-        self.confthre = self.config.conf_thr
-        self.num_classes = self.exp.num_classes
-        self.nmsthre = self.exp.nmsthre
-        self.test_size = self.exp.test_size
-        self.preproc = ValTransform(legacy=False)
-        self.label_mapping = self.config['labels']
-
-    def inference(self, img):
-        with torch.no_grad():
-            outputs = self.model(img)
-        return outputs
-
-    def forward(self, inputs):
-        return self.inference(inputs)
-
-    def preprocess(self, img):
-        img = LoadImage.convert_to_ndarray(img)
-        height, width = img.shape[:2]
-        self.ratio = min(self.test_size[0] / img.shape[0],
-                         self.test_size[1] / img.shape[1])
-
-        img, _ = self.preproc(img, None, self.test_size)
-        img = torch.from_numpy(img).unsqueeze(0)
-        img = img.float()
-
-        return img
-
-    def postprocess(self, input):
-        outputs = postprocess(
-            input,
-            self.num_classes,
-            self.confthre,
-            self.nmsthre,
-            class_agnostic=True)
-
-        if len(outputs) == 1:
-            bboxes = outputs[0][:, 0:4].cpu().numpy() / self.ratio
-            scores = outputs[0][:, 5].cpu().numpy()
-            labels = outputs[0][:, 6].cpu().int().numpy()
-            pred_label_names = []
-            for lab in labels:
-                pred_label_names.append(self.label_mapping[lab])
-
-        return bboxes, scores, pred_label_names
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py
deleted file mode 100644
index 7bada485..00000000
--- a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
-
-import os
-
-import torch.nn as nn
-
-from ..yolox_base import Exp as YoloXExp
-
-
-class YoloXNanoExp(YoloXExp):
-
-    def __init__(self):
-        super(YoloXNanoExp, self).__init__()
-        self.depth = 0.33
-        self.width = 0.25
-        self.input_size = (416, 416)
-        self.test_size = (416, 416)
-
-    def get_model(self, sublinear=False):
-
-        def init_yolo(M):
-            for m in M.modules():
-                if isinstance(m, nn.BatchNorm2d):
-                    m.eps = 1e-3
-                    m.momentum = 0.03
-
-        if 'model' not in self.__dict__:
-            from ...models import YOLOX, YOLOPAFPN, YOLOXHead
-            in_channels = [256, 512, 1024]
-            # NANO model use depthwise = True, which is main difference.
-            backbone = YOLOPAFPN(
-                self.depth,
-                self.width,
-                in_channels=in_channels,
-                act=self.act,
-                depthwise=True,
-            )
-            head = YOLOXHead(
-                self.num_classes,
-                self.width,
-                in_channels=in_channels,
-                act=self.act,
-                depthwise=True)
-            self.model = YOLOX(backbone, head)
-        self.model.apply(init_yolo)
-        self.model.head.initialize_biases(1e-2)
-        return self.model
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_s.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_s.py
deleted file mode 100644
index 5a123b37..00000000
--- a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_s.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
-
-import os
-
-from ..yolox_base import Exp as YoloXExp
-
-
-class YoloXSExp(YoloXExp):
-
-    def __init__(self):
-        super(YoloXSExp, self).__init__()
-        self.depth = 0.33
-        self.width = 0.50
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_tiny.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_tiny.py
deleted file mode 100644
index a80d0f2d..00000000
--- a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_tiny.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
-
-import os
-
-from ..yolox_base import Exp as YoloXExp
-
-
-class YoloXTinyExp(YoloXExp):
-
-    def __init__(self):
-        super(YoloXTinyExp, self).__init__()
-        self.depth = 0.33
-        self.width = 0.375
-        self.input_size = (416, 416)
-        self.mosaic_scale = (0.5, 1.5)
-        self.random_size = (10, 20)
-        self.test_size = (416, 416)
-        self.exp_name = os.path.split(
-            os.path.realpath(__file__))[1].split('.')[0]
-        self.enable_mixup = False
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_fpn.py b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_fpn.py
deleted file mode 100644
index 0cbebb09..00000000
--- a/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_fpn.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
-
-import torch
-import torch.nn as nn
-
-from .darknet import Darknet
-from .network_blocks import BaseConv
-
-
-class YOLOFPN(nn.Module):
-    """
-    YOLOFPN module. Darknet 53 is the default backbone of this model.
-    """
-
-    def __init__(
-        self,
-        depth=53,
-        in_features=['dark3', 'dark4', 'dark5'],
-    ):
-        super(YOLOFPN, self).__init__()
-
-        self.backbone = Darknet(depth)
-        self.in_features = in_features
-
-        # out 1
-        self.out1_cbl = self._make_cbl(512, 256, 1)
-        self.out1 = self._make_embedding([256, 512], 512 + 256)
-
-        # out 2
-        self.out2_cbl = self._make_cbl(256, 128, 1)
-        self.out2 = self._make_embedding([128, 256], 256 + 128)
-
-        # upsample
-        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
-
-    def _make_cbl(self, _in, _out, ks):
-        return BaseConv(_in, _out, ks, stride=1, act='lrelu')
-
-    def _make_embedding(self, filters_list, in_filters):
-        m = nn.Sequential(*[
-            self._make_cbl(in_filters, filters_list[0], 1),
-            self._make_cbl(filters_list[0], filters_list[1], 3),
-            self._make_cbl(filters_list[1], filters_list[0], 1),
-            self._make_cbl(filters_list[0], filters_list[1], 3),
-            self._make_cbl(filters_list[1], filters_list[0], 1),
-        ])
-        return m
-
-    def load_pretrained_model(self, filename='./weights/darknet53.mix.pth'):
-        with open(filename, 'rb') as f:
-            state_dict = torch.load(f, map_location='cpu')
-        print('loading pretrained weights...')
-        self.backbone.load_state_dict(state_dict)
-
-    def forward(self, inputs):
-        """
-        Args:
-            inputs (Tensor): input image.
-
-        Returns:
-            Tuple[Tensor]: FPN output features..
-        """
-        #  backbone
-        out_features = self.backbone(inputs)
-        x2, x1, x0 = [out_features[f] for f in self.in_features]
-
-        #  yolo branch 1
-        x1_in = self.out1_cbl(x0)
-        x1_in = self.upsample(x1_in)
-        x1_in = torch.cat([x1_in, x1], 1)
-        out_dark4 = self.out1(x1_in)
-
-        #  yolo branch 2
-        x2_in = self.out2_cbl(out_dark4)
-        x2_in = self.upsample(x2_in)
-        x2_in = torch.cat([x2_in, x2], 1)
-        out_dark3 = self.out2(x2_in)
-
-        outputs = (out_dark3, out_dark4, x0)
-        return outputs
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_head.py b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_head.py
deleted file mode 100644
index 1eef93a4..00000000
--- a/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_head.py
+++ /dev/null
@@ -1,182 +0,0 @@
-# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
-
-import math
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from ..utils import bboxes_iou, meshgrid
-from .network_blocks import BaseConv, DWConv
-
-
-class YOLOXHead(nn.Module):
-
-    def __init__(
-        self,
-        num_classes,
-        width=1.0,
-        strides=[8, 16, 32],
-        in_channels=[256, 512, 1024],
-        act='silu',
-        depthwise=False,
-    ):
-        """
-        Args:
-            act (str): activation type of conv. Defalut value: "silu".
-            depthwise (bool): whether apply depthwise conv in conv branch. Defalut value: False.
-        """
-        super(YOLOXHead, self).__init__()
-
-        self.n_anchors = 1
-        self.num_classes = num_classes
-        self.decode_in_inference = True  # for deploy, set to False
-
-        self.cls_convs = nn.ModuleList()
-        self.reg_convs = nn.ModuleList()
-        self.cls_preds = nn.ModuleList()
-        self.reg_preds = nn.ModuleList()
-        self.obj_preds = nn.ModuleList()
-        self.stems = nn.ModuleList()
-        Conv = DWConv if depthwise else BaseConv
-
-        for i in range(len(in_channels)):
-            self.stems.append(
-                BaseConv(
-                    in_channels=int(in_channels[i] * width),
-                    out_channels=int(256 * width),
-                    ksize=1,
-                    stride=1,
-                    act=act,
-                ))
-            self.cls_convs.append(
-                nn.Sequential(*[
-                    Conv(
-                        in_channels=int(256 * width),
-                        out_channels=int(256 * width),
-                        ksize=3,
-                        stride=1,
-                        act=act,
-                    ),
-                    Conv(
-                        in_channels=int(256 * width),
-                        out_channels=int(256 * width),
-                        ksize=3,
-                        stride=1,
-                        act=act,
-                    ),
-                ]))
-            self.reg_convs.append(
-                nn.Sequential(*[
-                    Conv(
-                        in_channels=int(256 * width),
-                        out_channels=int(256 * width),
-                        ksize=3,
-                        stride=1,
-                        act=act,
-                    ),
-                    Conv(
-                        in_channels=int(256 * width),
-                        out_channels=int(256 * width),
-                        ksize=3,
-                        stride=1,
-                        act=act,
-                    ),
-                ]))
-            self.cls_preds.append(
-                nn.Conv2d(
-                    in_channels=int(256 * width),
-                    out_channels=self.n_anchors * self.num_classes,
-                    kernel_size=1,
-                    stride=1,
-                    padding=0,
-                ))
-            self.reg_preds.append(
-                nn.Conv2d(
-                    in_channels=int(256 * width),
-                    out_channels=4,
-                    kernel_size=1,
-                    stride=1,
-                    padding=0,
-                ))
-            self.obj_preds.append(
-                nn.Conv2d(
-                    in_channels=int(256 * width),
-                    out_channels=self.n_anchors * 1,
-                    kernel_size=1,
-                    stride=1,
-                    padding=0,
-                ))
-
-        self.use_l1 = False
-        self.l1_loss = nn.L1Loss(reduction='none')
-        self.bcewithlog_loss = nn.BCEWithLogitsLoss(reduction='none')
-        # self.iou_loss = IOUloss(reduction="none")
-        self.strides = strides
-        self.grids = [torch.zeros(1)] * len(in_channels)
-
-    def initialize_biases(self, prior_prob):
-        for conv in self.cls_preds:
-            b = conv.bias.view(self.n_anchors, -1)
-            b.data.fill_(-math.log((1 - prior_prob) / prior_prob))
-            conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
-
-        for conv in self.obj_preds:
-            b = conv.bias.view(self.n_anchors, -1)
-            b.data.fill_(-math.log((1 - prior_prob) / prior_prob))
-            conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
-
-    def forward(self, xin, labels=None, imgs=None):
-        outputs = []
-
-        for k, (cls_conv, reg_conv, stride_this_level, x) in enumerate(
-                zip(self.cls_convs, self.reg_convs, self.strides, xin)):
-            x = self.stems[k](x)
-            cls_x = x
-            reg_x = x
-
-            cls_feat = cls_conv(cls_x)
-            cls_output = self.cls_preds[k](cls_feat)
-
-            reg_feat = reg_conv(reg_x)
-            reg_output = self.reg_preds[k](reg_feat)
-            obj_output = self.obj_preds[k](reg_feat)
-
-            if self.training:
-                pass
-            else:
-                output = torch.cat(
-                    [reg_output,
-                     obj_output.sigmoid(),
-                     cls_output.sigmoid()], 1)
-
-            outputs.append(output)
-
-        if self.training:
-            pass
-        else:
-            self.hw = [x.shape[-2:] for x in outputs]
-            # [batch, n_anchors_all, 85]
-            outputs = torch.cat([x.flatten(start_dim=2) for x in outputs],
-                                dim=2).permute(0, 2, 1)
-            if self.decode_in_inference:
-                return self.decode_outputs(outputs, dtype=xin[0].type())
-            else:
-                return outputs
-
-    def decode_outputs(self, outputs, dtype):
-        grids = []
-        strides = []
-        for (hsize, wsize), stride in zip(self.hw, self.strides):
-            yv, xv = meshgrid([torch.arange(hsize), torch.arange(wsize)])
-            grid = torch.stack((xv, yv), 2).view(1, -1, 2)
-            grids.append(grid)
-            shape = grid.shape[:2]
-            strides.append(torch.full((*shape, 1), stride))
-
-        grids = torch.cat(grids, dim=1).type(dtype)
-        strides = torch.cat(strides, dim=1).type(dtype)
-
-        outputs[..., :2] = (outputs[..., :2] + grids) * strides
-        outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
-        return outputs
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_pafpn.py b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_pafpn.py
deleted file mode 100644
index cd4258bf..00000000
--- a/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_pafpn.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
-
-import torch
-import torch.nn as nn
-
-from .darknet import CSPDarknet
-from .network_blocks import BaseConv, CSPLayer, DWConv
-
-
-class YOLOPAFPN(nn.Module):
-    """
-    YOLOv3 model. Darknet 53 is the default backbone of this model.
-    """
-
-    def __init__(
-        self,
-        depth=1.0,
-        width=1.0,
-        in_features=('dark3', 'dark4', 'dark5'),
-        in_channels=[256, 512, 1024],
-        depthwise=False,
-        act='silu',
-    ):
-        super(YOLOPAFPN, self).__init__()
-        self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act)
-        self.in_features = in_features
-        self.in_channels = in_channels
-        Conv = DWConv if depthwise else BaseConv
-
-        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
-        self.lateral_conv0 = BaseConv(
-            int(in_channels[2] * width),
-            int(in_channels[1] * width),
-            1,
-            1,
-            act=act)
-        self.C3_p4 = CSPLayer(
-            int(2 * in_channels[1] * width),
-            int(in_channels[1] * width),
-            round(3 * depth),
-            False,
-            depthwise=depthwise,
-            act=act,
-        )  # cat
-
-        self.reduce_conv1 = BaseConv(
-            int(in_channels[1] * width),
-            int(in_channels[0] * width),
-            1,
-            1,
-            act=act)
-        self.C3_p3 = CSPLayer(
-            int(2 * in_channels[0] * width),
-            int(in_channels[0] * width),
-            round(3 * depth),
-            False,
-            depthwise=depthwise,
-            act=act,
-        )
-
-        # bottom-up conv
-        self.bu_conv2 = Conv(
-            int(in_channels[0] * width),
-            int(in_channels[0] * width),
-            3,
-            2,
-            act=act)
-        self.C3_n3 = CSPLayer(
-            int(2 * in_channels[0] * width),
-            int(in_channels[1] * width),
-            round(3 * depth),
-            False,
-            depthwise=depthwise,
-            act=act,
-        )
-
-        # bottom-up conv
-        self.bu_conv1 = Conv(
-            int(in_channels[1] * width),
-            int(in_channels[1] * width),
-            3,
-            2,
-            act=act)
-        self.C3_n4 = CSPLayer(
-            int(2 * in_channels[1] * width),
-            int(in_channels[2] * width),
-            round(3 * depth),
-            False,
-            depthwise=depthwise,
-            act=act,
-        )
-
-    def forward(self, input):
-        """
-        Args:
-            inputs: input images.
-
-        Returns:
-            Tuple[Tensor]: FPN feature.
-        """
-
-        #  backbone
-        out_features = self.backbone(input)
-        features = [out_features[f] for f in self.in_features]
-        [x2, x1, x0] = features
-
-        fpn_out0 = self.lateral_conv0(x0)  # 1024->512/32
-        f_out0 = self.upsample(fpn_out0)  # 512/16
-        f_out0 = torch.cat([f_out0, x1], 1)  # 512->1024/16
-        f_out0 = self.C3_p4(f_out0)  # 1024->512/16
-
-        fpn_out1 = self.reduce_conv1(f_out0)  # 512->256/16
-        f_out1 = self.upsample(fpn_out1)  # 256/8
-        f_out1 = torch.cat([f_out1, x2], 1)  # 256->512/8
-        pan_out2 = self.C3_p3(f_out1)  # 512->256/8
-
-        p_out1 = self.bu_conv2(pan_out2)  # 256->256/16
-        p_out1 = torch.cat([p_out1, fpn_out1], 1)  # 256->512/16
-        pan_out1 = self.C3_n3(p_out1)  # 512->512/16
-
-        p_out0 = self.bu_conv1(pan_out1)  # 512->512/32
-        p_out0 = torch.cat([p_out0, fpn_out0], 1)  # 512->1024/32
-        pan_out0 = self.C3_n4(p_out0)  # 1024->1024/32
-
-        outputs = (pan_out2, pan_out1, pan_out0)
-        return outputs
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/yolox.py b/modelscope/models/cv/realtime_object_detection/yolox/models/yolox.py
deleted file mode 100644
index 181c368b..00000000
--- a/modelscope/models/cv/realtime_object_detection/yolox/models/yolox.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
-
-import torch.nn as nn
-
-from .yolo_head import YOLOXHead
-from .yolo_pafpn import YOLOPAFPN
-
-
-class YOLOX(nn.Module):
-    """
-    YOLOX model module. The module list is defined by create_yolov3_modules function.
-    The network returns loss values from three YOLO layers during training
-    and detection results during test.
-    """
-
-    def __init__(self, backbone=None, head=None):
-        super(YOLOX, self).__init__()
-        if backbone is None:
-            backbone = YOLOPAFPN()
-        if head is None:
-            head = YOLOXHead(80)
-
-        self.backbone = backbone
-        self.head = head
-
-    def forward(self, x, targets=None):
-        fpn_outs = self.backbone(x)
-        if self.training:
-            raise NotImplementedError('Training is not supported yet!')
-        else:
-            outputs = self.head(fpn_outs)
-
-        return outputs
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/utils/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/utils/__init__.py
deleted file mode 100644
index 2c1ea489..00000000
--- a/modelscope/models/cv/realtime_object_detection/yolox/utils/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
-
-from .boxes import *  # noqa
-
-__all__ = ['bboxes_iou', 'meshgrid', 'postprocess', 'xyxy2cxcywh', 'xyxy2xywh']
diff --git a/modelscope/models/cv/realtime_object_detection/__init__.py b/modelscope/models/cv/stream_yolo/__init__.py
similarity index 83%
rename from modelscope/models/cv/realtime_object_detection/__init__.py
rename to modelscope/models/cv/stream_yolo/__init__.py
index 66156977..33d4828b 100644
--- a/modelscope/models/cv/realtime_object_detection/__init__.py
+++ b/modelscope/models/cv/stream_yolo/__init__.py
@@ -4,11 +4,9 @@ from typing import TYPE_CHECKING
 from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
-    from .realtime_detector import RealtimeDetector
     from .realtime_video_detector import RealtimeVideoDetector
 else:
     _import_structure = {
-        'realtime_detector': ['RealtimeDetector'],
         'realtime_video_detector': ['RealtimeVideoDetector'],
     }
 
diff --git a/modelscope/models/cv/stream_yolo/data/__init__.py b/modelscope/models/cv/stream_yolo/data/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/data/data_augment.py b/modelscope/models/cv/stream_yolo/data/data_augment.py
similarity index 100%
rename from modelscope/models/cv/realtime_object_detection/yolox/data/data_augment.py
rename to modelscope/models/cv/stream_yolo/data/data_augment.py
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/__init__.py b/modelscope/models/cv/stream_yolo/exp/__init__.py
similarity index 85%
rename from modelscope/models/cv/realtime_object_detection/yolox/exp/__init__.py
rename to modelscope/models/cv/stream_yolo/exp/__init__.py
index e8e3be15..8ddd8258 100644
--- a/modelscope/models/cv/realtime_object_detection/yolox/exp/__init__.py
+++ b/modelscope/models/cv/stream_yolo/exp/__init__.py
@@ -2,4 +2,3 @@
 
 from .base_exp import BaseExp
 from .build import get_exp_by_name
-from .yolox_base import Exp
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/base_exp.py b/modelscope/models/cv/stream_yolo/exp/base_exp.py
similarity index 100%
rename from modelscope/models/cv/realtime_object_detection/yolox/exp/base_exp.py
rename to modelscope/models/cv/stream_yolo/exp/base_exp.py
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py b/modelscope/models/cv/stream_yolo/exp/build.py
similarity index 57%
rename from modelscope/models/cv/realtime_object_detection/yolox/exp/build.py
rename to modelscope/models/cv/stream_yolo/exp/build.py
index 5865c53b..ac179a37 100644
--- a/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py
+++ b/modelscope/models/cv/stream_yolo/exp/build.py
@@ -7,13 +7,7 @@ import sys
 def get_exp_by_name(exp_name):
     exp = exp_name.replace('-',
                            '_')  # convert string like "yolox-s" to "yolox_s"
-    if exp == 'yolox_s':
-        from .default import YoloXSExp as YoloXExp
-    elif exp == 'yolox_nano':
-        from .default import YoloXNanoExp as YoloXExp
-    elif exp == 'yolox_tiny':
-        from .default import YoloXTinyExp as YoloXExp
-    elif exp == 'streamyolo':
+    if exp == 'streamyolo':
         from .default import StreamYoloExp as YoloXExp
     else:
         pass
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py b/modelscope/models/cv/stream_yolo/exp/default/__init__.py
similarity index 56%
rename from modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py
rename to modelscope/models/cv/stream_yolo/exp/default/__init__.py
index cfec836c..4c8928d6 100644
--- a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py
+++ b/modelscope/models/cv/stream_yolo/exp/default/__init__.py
@@ -1,5 +1,2 @@
 # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
 from .streamyolo import StreamYoloExp
-from .yolox_nano import YoloXNanoExp
-from .yolox_s import YoloXSExp
-from .yolox_tiny import YoloXTinyExp
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/streamyolo.py b/modelscope/models/cv/stream_yolo/exp/default/streamyolo.py
similarity index 100%
rename from modelscope/models/cv/realtime_object_detection/yolox/exp/default/streamyolo.py
rename to modelscope/models/cv/stream_yolo/exp/default/streamyolo.py
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py b/modelscope/models/cv/stream_yolo/exp/yolox_base.py
similarity index 100%
rename from modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py
rename to modelscope/models/cv/stream_yolo/exp/yolox_base.py
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py b/modelscope/models/cv/stream_yolo/models/__init__.py
similarity index 66%
rename from modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py
rename to modelscope/models/cv/stream_yolo/models/__init__.py
index d2e889f1..0988bd5f 100644
--- a/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py
+++ b/modelscope/models/cv/stream_yolo/models/__init__.py
@@ -4,7 +4,3 @@ from .darknet import CSPDarknet, Darknet
 from .dfp_pafpn import DFPPAFPN
 from .streamyolo import StreamYOLO
 from .tal_head import TALHead
-from .yolo_fpn import YOLOFPN
-from .yolo_head import YOLOXHead
-from .yolo_pafpn import YOLOPAFPN
-from .yolox import YOLOX
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/darknet.py b/modelscope/models/cv/stream_yolo/models/darknet.py
similarity index 100%
rename from modelscope/models/cv/realtime_object_detection/yolox/models/darknet.py
rename to modelscope/models/cv/stream_yolo/models/darknet.py
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/dfp_pafpn.py b/modelscope/models/cv/stream_yolo/models/dfp_pafpn.py
similarity index 100%
rename from modelscope/models/cv/realtime_object_detection/yolox/models/dfp_pafpn.py
rename to modelscope/models/cv/stream_yolo/models/dfp_pafpn.py
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py b/modelscope/models/cv/stream_yolo/models/network_blocks.py
similarity index 100%
rename from modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py
rename to modelscope/models/cv/stream_yolo/models/network_blocks.py
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/streamyolo.py b/modelscope/models/cv/stream_yolo/models/streamyolo.py
similarity index 100%
rename from modelscope/models/cv/realtime_object_detection/yolox/models/streamyolo.py
rename to modelscope/models/cv/stream_yolo/models/streamyolo.py
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/tal_head.py b/modelscope/models/cv/stream_yolo/models/tal_head.py
similarity index 100%
rename from modelscope/models/cv/realtime_object_detection/yolox/models/tal_head.py
rename to modelscope/models/cv/stream_yolo/models/tal_head.py
diff --git a/modelscope/models/cv/realtime_object_detection/realtime_video_detector.py b/modelscope/models/cv/stream_yolo/realtime_video_detector.py
similarity index 96%
rename from modelscope/models/cv/realtime_object_detection/realtime_video_detector.py
rename to modelscope/models/cv/stream_yolo/realtime_video_detector.py
index ebe0221b..a5d62d72 100644
--- a/modelscope/models/cv/realtime_object_detection/realtime_video_detector.py
+++ b/modelscope/models/cv/stream_yolo/realtime_video_detector.py
@@ -17,10 +17,9 @@ from modelscope.models.builder import MODELS
 from modelscope.preprocessors import LoadImage
 from modelscope.utils.config import Config
 from modelscope.utils.constant import ModelFile, Tasks
-from .utils import timestamp_format
-from .yolox.data.data_augment import ValTransform
-from .yolox.exp import get_exp_by_name
-from .yolox.utils import postprocess
+from .data.data_augment import ValTransform
+from .exp import get_exp_by_name
+from .utils import postprocess, timestamp_format
 
 
 @MODELS.register_module(
diff --git a/modelscope/models/cv/stream_yolo/utils/__init__.py b/modelscope/models/cv/stream_yolo/utils/__init__.py
new file mode 100644
index 00000000..46ad7d85
--- /dev/null
+++ b/modelscope/models/cv/stream_yolo/utils/__init__.py
@@ -0,0 +1,9 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from .boxes import *  # noqa
+from .format import *  # noqa
+
+__all__ = [
+    'bboxes_iou', 'meshgrid', 'postprocess', 'xyxy2cxcywh', 'xyxy2xywh',
+    'timestamp_format'
+]
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/utils/boxes.py b/modelscope/models/cv/stream_yolo/utils/boxes.py
similarity index 100%
rename from modelscope/models/cv/realtime_object_detection/yolox/utils/boxes.py
rename to modelscope/models/cv/stream_yolo/utils/boxes.py
diff --git a/modelscope/models/cv/realtime_object_detection/utils.py b/modelscope/models/cv/stream_yolo/utils/format.py
similarity index 100%
rename from modelscope/models/cv/realtime_object_detection/utils.py
rename to modelscope/models/cv/stream_yolo/utils/format.py
diff --git a/modelscope/models/cv/super_resolution/__init__.py b/modelscope/models/cv/super_resolution/__init__.py
index 5065e280..7187c57a 100644
--- a/modelscope/models/cv/super_resolution/__init__.py
+++ b/modelscope/models/cv/super_resolution/__init__.py
@@ -5,9 +5,13 @@ from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
     from .rrdbnet_arch import RRDBNet
+    from .ecbsr_model import ECBSRModel
 
 else:
-    _import_structure = {'rrdbnet_arch': ['RRDBNet']}
+    _import_structure = {
+        'rrdbnet_arch': ['RRDBNet'],
+        'ecbsr_model': ['ECBSRModel']
+    }
 
     import sys
 
diff --git a/modelscope/models/cv/super_resolution/ecb.py b/modelscope/models/cv/super_resolution/ecb.py
new file mode 100644
index 00000000..4ddf734c
--- /dev/null
+++ b/modelscope/models/cv/super_resolution/ecb.py
@@ -0,0 +1,272 @@
+# The implementation is adopted from ECBSR,
+# made publicly available under the Apache 2.0 License at
+# https://github.com/xindongzhang/ECBSR/blob/main/models/ecb.py
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class SeqConv3x3(nn.Module):
+
+    def __init__(self, seq_type, inp_planes, out_planes, depth_multiplier):
+        super(SeqConv3x3, self).__init__()
+
+        self.type = seq_type
+        self.inp_planes = inp_planes
+        self.out_planes = out_planes
+
+        if self.type == 'conv1x1-conv3x3':
+            self.mid_planes = int(out_planes * depth_multiplier)
+            conv0 = torch.nn.Conv2d(
+                self.inp_planes, self.mid_planes, kernel_size=1, padding=0)
+            self.k0 = conv0.weight
+            self.b0 = conv0.bias
+
+            conv1 = torch.nn.Conv2d(
+                self.mid_planes, self.out_planes, kernel_size=3)
+            self.k1 = conv1.weight
+            self.b1 = conv1.bias
+
+        elif self.type == 'conv1x1-sobelx':
+            conv0 = torch.nn.Conv2d(
+                self.inp_planes, self.out_planes, kernel_size=1, padding=0)
+            self.k0 = conv0.weight
+            self.b0 = conv0.bias
+
+            # init scale & bias
+            scale = torch.randn(size=(self.out_planes, 1, 1, 1)) * 1e-3
+            self.scale = nn.Parameter(scale)
+            # bias = 0.0
+            # bias = [bias for c in range(self.out_planes)]
+            # bias = torch.FloatTensor(bias)
+            bias = torch.randn(self.out_planes) * 1e-3
+            bias = torch.reshape(bias, (self.out_planes, ))
+            self.bias = nn.Parameter(bias)
+            # init mask
+            self.mask = torch.zeros((self.out_planes, 1, 3, 3),
+                                    dtype=torch.float32)
+            for i in range(self.out_planes):
+                self.mask[i, 0, 0, 0] = 1.0
+                self.mask[i, 0, 1, 0] = 2.0
+                self.mask[i, 0, 2, 0] = 1.0
+                self.mask[i, 0, 0, 2] = -1.0
+                self.mask[i, 0, 1, 2] = -2.0
+                self.mask[i, 0, 2, 2] = -1.0
+            self.mask = nn.Parameter(data=self.mask, requires_grad=False)
+
+        elif self.type == 'conv1x1-sobely':
+            conv0 = torch.nn.Conv2d(
+                self.inp_planes, self.out_planes, kernel_size=1, padding=0)
+            self.k0 = conv0.weight
+            self.b0 = conv0.bias
+
+            # init scale & bias
+            scale = torch.randn(size=(self.out_planes, 1, 1, 1)) * 1e-3
+            self.scale = nn.Parameter(torch.FloatTensor(scale))
+            # bias = 0.0
+            # bias = [bias for c in range(self.out_planes)]
+            # bias = torch.FloatTensor(bias)
+            bias = torch.randn(self.out_planes) * 1e-3
+            bias = torch.reshape(bias, (self.out_planes, ))
+            self.bias = nn.Parameter(torch.FloatTensor(bias))
+            # init mask
+            self.mask = torch.zeros((self.out_planes, 1, 3, 3),
+                                    dtype=torch.float32)
+            for i in range(self.out_planes):
+                self.mask[i, 0, 0, 0] = 1.0
+                self.mask[i, 0, 0, 1] = 2.0
+                self.mask[i, 0, 0, 2] = 1.0
+                self.mask[i, 0, 2, 0] = -1.0
+                self.mask[i, 0, 2, 1] = -2.0
+                self.mask[i, 0, 2, 2] = -1.0
+            self.mask = nn.Parameter(data=self.mask, requires_grad=False)
+
+        elif self.type == 'conv1x1-laplacian':
+            conv0 = torch.nn.Conv2d(
+                self.inp_planes, self.out_planes, kernel_size=1, padding=0)
+            self.k0 = conv0.weight
+            self.b0 = conv0.bias
+
+            # init scale & bias
+            scale = torch.randn(size=(self.out_planes, 1, 1, 1)) * 1e-3
+            self.scale = nn.Parameter(torch.FloatTensor(scale))
+            # bias = 0.0
+            # bias = [bias for c in range(self.out_planes)]
+            # bias = torch.FloatTensor(bias)
+            bias = torch.randn(self.out_planes) * 1e-3
+            bias = torch.reshape(bias, (self.out_planes, ))
+            self.bias = nn.Parameter(torch.FloatTensor(bias))
+            # init mask
+            self.mask = torch.zeros((self.out_planes, 1, 3, 3),
+                                    dtype=torch.float32)
+            for i in range(self.out_planes):
+                self.mask[i, 0, 0, 1] = 1.0
+                self.mask[i, 0, 1, 0] = 1.0
+                self.mask[i, 0, 1, 2] = 1.0
+                self.mask[i, 0, 2, 1] = 1.0
+                self.mask[i, 0, 1, 1] = -4.0
+            self.mask = nn.Parameter(data=self.mask, requires_grad=False)
+        else:
+            raise ValueError('the type of seqconv is not supported!')
+
+    def forward(self, x):
+        if self.type == 'conv1x1-conv3x3':
+            # conv-1x1
+            y0 = F.conv2d(input=x, weight=self.k0, bias=self.b0, stride=1)
+            # explicitly padding with bias
+            y0 = F.pad(y0, (1, 1, 1, 1), 'constant', 0)
+            b0_pad = self.b0.view(1, -1, 1, 1)
+            y0[:, :, 0:1, :] = b0_pad
+            y0[:, :, -1:, :] = b0_pad
+            y0[:, :, :, 0:1] = b0_pad
+            y0[:, :, :, -1:] = b0_pad
+            # conv-3x3
+            y1 = F.conv2d(input=y0, weight=self.k1, bias=self.b1, stride=1)
+        else:
+            y0 = F.conv2d(input=x, weight=self.k0, bias=self.b0, stride=1)
+            # explicitly padding with bias
+            y0 = F.pad(y0, (1, 1, 1, 1), 'constant', 0)
+            b0_pad = self.b0.view(1, -1, 1, 1)
+            y0[:, :, 0:1, :] = b0_pad
+            y0[:, :, -1:, :] = b0_pad
+            y0[:, :, :, 0:1] = b0_pad
+            y0[:, :, :, -1:] = b0_pad
+            # conv-3x3
+            y1 = F.conv2d(
+                input=y0,
+                weight=self.scale * self.mask,
+                bias=self.bias,
+                stride=1,
+                groups=self.out_planes)
+        return y1
+
+    def rep_params(self):
+        device = self.k0.get_device()
+        if device < 0:
+            device = None
+
+        if self.type == 'conv1x1-conv3x3':
+            # re-param conv kernel
+            RK = F.conv2d(input=self.k1, weight=self.k0.permute(1, 0, 2, 3))
+            # re-param conv bias
+            RB = torch.ones(
+                1, self.mid_planes, 3, 3, device=device) * self.b0.view(
+                    1, -1, 1, 1)
+            RB = F.conv2d(input=RB, weight=self.k1).view(-1, ) + self.b1
+        else:
+            tmp = self.scale * self.mask
+            k1 = torch.zeros((self.out_planes, self.out_planes, 3, 3),
+                             device=device)
+            for i in range(self.out_planes):
+                k1[i, i, :, :] = tmp[i, 0, :, :]
+            b1 = self.bias
+            # re-param conv kernel
+            RK = F.conv2d(input=k1, weight=self.k0.permute(1, 0, 2, 3))
+            # re-param conv bias
+            RB = torch.ones(
+                1, self.out_planes, 3, 3, device=device) * self.b0.view(
+                    1, -1, 1, 1)
+            RB = F.conv2d(input=RB, weight=k1).view(-1, ) + b1
+        return RK, RB
+
+
+class ECB(nn.Module):
+
+    def __init__(self,
+                 inp_planes,
+                 out_planes,
+                 depth_multiplier,
+                 act_type='prelu',
+                 with_idt=False):
+        super(ECB, self).__init__()
+
+        self.depth_multiplier = depth_multiplier
+        self.inp_planes = inp_planes
+        self.out_planes = out_planes
+        self.act_type = act_type
+
+        if with_idt and (self.inp_planes == self.out_planes):
+            self.with_idt = True
+        else:
+            self.with_idt = False
+
+        self.conv3x3 = torch.nn.Conv2d(
+            self.inp_planes, self.out_planes, kernel_size=3, padding=1)
+        self.conv1x1_3x3 = SeqConv3x3('conv1x1-conv3x3', self.inp_planes,
+                                      self.out_planes, self.depth_multiplier)
+        self.conv1x1_sbx = SeqConv3x3('conv1x1-sobelx', self.inp_planes,
+                                      self.out_planes, -1)
+        self.conv1x1_sby = SeqConv3x3('conv1x1-sobely', self.inp_planes,
+                                      self.out_planes, -1)
+        self.conv1x1_lpl = SeqConv3x3('conv1x1-laplacian', self.inp_planes,
+                                      self.out_planes, -1)
+
+        if self.act_type == 'prelu':
+            self.act = nn.PReLU(num_parameters=self.out_planes)
+        elif self.act_type == 'relu':
+            self.act = nn.ReLU(inplace=True)
+        elif self.act_type == 'rrelu':
+            self.act = nn.RReLU(lower=-0.05, upper=0.05)
+        elif self.act_type == 'softplus':
+            self.act = nn.Softplus()
+        elif self.act_type == 'linear':
+            pass
+        else:
+            raise ValueError('The type of activation if not support!')
+
+    def forward(self, x):
+        if self.training:
+            y = self.conv3x3(x) + \
+                self.conv1x1_3x3(x) + \
+                self.conv1x1_sbx(x) + \
+                self.conv1x1_sby(x) + \
+                self.conv1x1_lpl(x)
+            if self.with_idt:
+                y += x
+        else:
+            RK, RB = self.rep_params()
+            y = F.conv2d(input=x, weight=RK, bias=RB, stride=1, padding=1)
+        if self.act_type != 'linear':
+            y = self.act(y)
+        return y
+
+    def rep_params(self):
+        K0, B0 = self.conv3x3.weight, self.conv3x3.bias
+        K1, B1 = self.conv1x1_3x3.rep_params()
+        K2, B2 = self.conv1x1_sbx.rep_params()
+        K3, B3 = self.conv1x1_sby.rep_params()
+        K4, B4 = self.conv1x1_lpl.rep_params()
+        RK, RB = (K0 + K1 + K2 + K3 + K4), (B0 + B1 + B2 + B3 + B4)
+
+        if self.with_idt:
+            device = RK.get_device()
+            if device < 0:
+                device = None
+            K_idt = torch.zeros(
+                self.out_planes, self.out_planes, 3, 3, device=device)
+            for i in range(self.out_planes):
+                K_idt[i, i, 1, 1] = 1.0
+            B_idt = 0.0
+            RK, RB = RK + K_idt, RB + B_idt
+        return RK, RB
+
+
+if __name__ == '__main__':
+
+    # # test seq-conv
+    x = torch.randn(1, 3, 5, 5).cuda()
+    conv = SeqConv3x3('conv1x1-conv3x3', 3, 3, 2).cuda()
+    y0 = conv(x)
+    RK, RB = conv.rep_params()
+    y1 = F.conv2d(input=x, weight=RK, bias=RB, stride=1, padding=1)
+    print(y0 - y1)
+
+    # test ecb
+    x = torch.randn(1, 3, 5, 5).cuda() * 200
+    ecb = ECB(3, 3, 2, act_type='linear', with_idt=True).cuda()
+    y0 = ecb(x)
+
+    RK, RB = ecb.rep_params()
+    y1 = F.conv2d(input=x, weight=RK, bias=RB, stride=1, padding=1)
+    print(y0 - y1)
diff --git a/modelscope/models/cv/super_resolution/ecbsr_model.py b/modelscope/models/cv/super_resolution/ecbsr_model.py
new file mode 100644
index 00000000..6f54d2e4
--- /dev/null
+++ b/modelscope/models/cv/super_resolution/ecbsr_model.py
@@ -0,0 +1,102 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import Any, Dict, Union
+
+import torch
+import torch.cuda
+import torch.nn as nn
+import torch.nn.functional as F
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .ecb import ECB
+
+logger = get_logger()
+__all__ = ['ECBSRModel']
+
+
+@MODELS.register_module(Tasks.image_super_resolution, module_name=Models.ecbsr)
+class ECBSRModel(TorchModel):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the image denoise model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        self.model_dir = model_dir
+        self.config = Config.from_file(
+            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
+
+        # network architecture
+        self.module_nums = self.config.model.model_args.module_nums
+        self.channel_nums = self.config.model.model_args.channel_nums
+        self.scale = self.config.model.model_args.scale
+        self.colors = self.config.model.model_args.colors
+        self.with_idt = self.config.model.model_args.with_idt
+        self.act_type = self.config.model.model_args.act_type
+
+        backbone = []
+        backbone += [
+            ECB(self.colors,
+                self.channel_nums,
+                depth_multiplier=2.0,
+                act_type=self.act_type,
+                with_idt=self.with_idt)
+        ]
+        for i in range(self.module_nums):
+            backbone += [
+                ECB(self.channel_nums,
+                    self.channel_nums,
+                    depth_multiplier=2.0,
+                    act_type=self.act_type,
+                    with_idt=self.with_idt)
+            ]
+        backbone += [
+            ECB(self.channel_nums,
+                self.colors * self.scale * self.scale,
+                depth_multiplier=2.0,
+                act_type='linear',
+                with_idt=self.with_idt)
+        ]
+
+        self.backbone = nn.Sequential(*backbone)
+        self.upsampler = nn.PixelShuffle(self.scale)
+
+        self.interp = nn.Upsample(scale_factor=self.scale, mode='nearest')
+
+    def _inference_forward(self, input: Tensor) -> Dict[str, Tensor]:
+        output = self.backbone(input)
+        output = self.upsampler(output) + self.interp(input)
+        return {'outputs': output}
+
+    def forward(self, inputs: Dict[str,
+                                   Tensor]) -> Dict[str, Union[list, Tensor]]:
+        """return the result by the model
+
+        Args:
+            inputs (Tensor): the preprocessed data
+
+        Returns:
+            Dict[str, Tensor]: results
+        """
+        return self._inference_forward(**inputs)
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        model_file = kwargs.get('am_model_name', ModelFile.TORCH_MODEL_FILE)
+        model_dir = kwargs['model_dir']
+        ckpt_path = os.path.join(model_dir, model_file)
+        logger.info(f'loading model from {ckpt_path}')
+        model_dir = kwargs.pop('model_dir')
+        model = cls(model_dir=model_dir, **kwargs)
+        ckpt_path = os.path.join(model_dir, model_file)
+        model.load_state_dict(torch.load(ckpt_path, map_location='cpu'))
+        return model
diff --git a/modelscope/models/cv/tinynas_detection/damo/apis/detector_evaluater.py b/modelscope/models/cv/tinynas_detection/damo/apis/detector_evaluater.py
index 1db526f2..e8972427 100644
--- a/modelscope/models/cv/tinynas_detection/damo/apis/detector_evaluater.py
+++ b/modelscope/models/cv/tinynas_detection/damo/apis/detector_evaluater.py
@@ -28,7 +28,11 @@ class Evaluater:
         self.ckpt = torch.load(
             self.cfg.test.checkpoint_path, map_location=self.device)
         self.model = build_local_model(self.cfg, self.device)
-        self.model.load_state_dict(self.ckpt['model'])
+        if 'state_dict' in self.ckpt:
+            state_dict = self.ckpt['state_dict']
+        elif 'model' in self.ckpt:
+            state_dict = self.ckpt['model']
+        self.model.load_state_dict(state_dict)
         self.val_loader = self.get_data_loader(self.cfg, False)
 
     def get_data_loader(self, cfg, distributed=False):
diff --git a/modelscope/models/cv/tinynas_detection/damo/base_models/losses/gfocal_loss.py b/modelscope/models/cv/tinynas_detection/damo/base_models/losses/gfocal_loss.py
index 5314e2e7..490d41ce 100644
--- a/modelscope/models/cv/tinynas_detection/damo/base_models/losses/gfocal_loss.py
+++ b/modelscope/models/cv/tinynas_detection/damo/base_models/losses/gfocal_loss.py
@@ -39,7 +39,7 @@ def weighted_loss(loss_func):
     the signature like `loss_func(pred, target, weight=None, reduction='mean',
     avg_factor=None, **kwargs)`.
 
-    :Example:
+    Example:
 
     >>> import torch
     >>> @weighted_loss
diff --git a/modelscope/models/cv/tinynas_detection/damo/utils/boxes.py b/modelscope/models/cv/tinynas_detection/damo/utils/boxes.py
index b112b514..369767bb 100644
--- a/modelscope/models/cv/tinynas_detection/damo/utils/boxes.py
+++ b/modelscope/models/cv/tinynas_detection/damo/utils/boxes.py
@@ -39,6 +39,7 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
             stability. Default 1e-6.
     Returns:
         Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,)
+
     Example:
         >>> bboxes1 = torch.FloatTensor([
         >>>     [0, 0, 10, 10],
@@ -54,6 +55,7 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
         >>> assert overlaps.shape == (3, 3)
         >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True)
         >>> assert overlaps.shape == (3, )
+
     Example:
         >>> empty = torch.empty(0, 4)
         >>> nonempty = torch.FloatTensor([[0, 0, 10, 9]])
diff --git a/modelscope/models/cv/video_deinterlace/UNet_for_video_deinterlace.py b/modelscope/models/cv/video_deinterlace/UNet_for_video_deinterlace.py
new file mode 100644
index 00000000..c08c1e8e
--- /dev/null
+++ b/modelscope/models/cv/video_deinterlace/UNet_for_video_deinterlace.py
@@ -0,0 +1,89 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from copy import deepcopy
+from typing import Any, Dict, Union
+
+import torch.cuda
+import torch.nn.functional as F
+from torch.nn.parallel import DataParallel, DistributedDataParallel
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.video_deinterlace.deinterlace_arch import \
+    DeinterlaceNet
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+__all__ = ['UNetForVideoDeinterlace']
+
+
+def convert(param):
+    return {
+        k.replace('module.', ''): v
+        for k, v in param.items() if 'module.' in k
+    }
+
+
+@MODELS.register_module(
+    Tasks.video_deinterlace, module_name=Models.video_deinterlace)
+class UNetForVideoDeinterlace(TorchModel):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the video deinterlace model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+
+        """
+        super().__init__(model_dir, *args, **kwargs)
+
+        if torch.cuda.is_available():
+            self._device = torch.device('cuda')
+        else:
+            self._device = torch.device('cpu')
+
+        self.model_dir = model_dir
+
+        frenet_path = os.path.join(model_dir, 'deinterlace_fre.pth')
+        enhnet_path = os.path.join(model_dir, 'deinterlace_mf.pth')
+
+        self.model = DeinterlaceNet()
+        self._load_pretrained(frenet_path, enhnet_path)
+
+    def _load_pretrained(self, frenet_path, enhnet_path):
+        state_dict_frenet = torch.load(frenet_path, map_location=self._device)
+        state_dict_enhnet = torch.load(enhnet_path, map_location=self._device)
+
+        self.model.frenet.load_state_dict(state_dict_frenet, strict=True)
+        self.model.enhnet.load_state_dict(state_dict_enhnet, strict=True)
+        logger.info('load model done.')
+
+    def _inference_forward(self, input: Tensor) -> Dict[str, Tensor]:
+        return {'output': self.model(input)}
+
+    def _evaluate_postprocess(self, input: Tensor,
+                              target: Tensor) -> Dict[str, list]:
+        preds = self.model(input)
+        del input
+        torch.cuda.empty_cache()
+        return {'pred': preds, 'target': target}
+
+    def forward(self, inputs: Dict[str,
+                                   Tensor]) -> Dict[str, Union[list, Tensor]]:
+        """return the result by the model
+
+        Args:
+            inputs (Tensor): the preprocessed data
+
+        Returns:
+            Dict[str, Tensor]: results
+        """
+
+        if 'target' in inputs:
+            return self._evaluate_postprocess(**inputs)
+        else:
+            return self._inference_forward(**inputs)
diff --git a/modelscope/models/cv/video_deinterlace/__init__.py b/modelscope/models/cv/video_deinterlace/__init__.py
new file mode 100644
index 00000000..bef991f2
--- /dev/null
+++ b/modelscope/models/cv/video_deinterlace/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .deinterlace_arch import \
+        DeinterlaceNet
+
+else:
+    _import_structure = {'deinterlace_arch': ['DeinterlaceNet']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/video_deinterlace/deinterlace_arch.py b/modelscope/models/cv/video_deinterlace/deinterlace_arch.py
new file mode 100644
index 00000000..7f6a18c5
--- /dev/null
+++ b/modelscope/models/cv/video_deinterlace/deinterlace_arch.py
@@ -0,0 +1,27 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import torch.nn as nn
+
+from modelscope.models.cv.video_deinterlace.models.enh import DeinterlaceEnh
+from modelscope.models.cv.video_deinterlace.models.fre import DeinterlaceFre
+
+
+class DeinterlaceNet(nn.Module):
+
+    def __init__(self):
+        super(DeinterlaceNet, self).__init__()
+        self.frenet = DeinterlaceFre()
+        self.enhnet = DeinterlaceEnh()
+
+    def forward(self, frames):
+        self.frenet.eval()
+        self.enhnet.eval()
+        with torch.no_grad():
+            frame1, frame2, frame3 = frames
+
+            F1_out = self.frenet(frame1)
+            F2_out = self.frenet(frame2)
+            F3_out = self.frenet(frame3)
+
+            out = self.enhnet([F1_out, F2_out, F3_out])
+
+        return out
diff --git a/modelscope/models/cv/video_deinterlace/models/__init__.py b/modelscope/models/cv/video_deinterlace/models/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/models/cv/video_deinterlace/models/archs.py b/modelscope/models/cv/video_deinterlace/models/archs.py
new file mode 100644
index 00000000..ba0b5aa9
--- /dev/null
+++ b/modelscope/models/cv/video_deinterlace/models/archs.py
@@ -0,0 +1,97 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import numpy as np
+import torch
+import torch.fft
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class DoubleConv(nn.Module):
+    """(convolution => [BN] => ReLU) * 2"""
+
+    def __init__(self, in_channels, out_channels, mid_channels=None):
+        super().__init__()
+        if not mid_channels:
+            mid_channels = out_channels
+        self.double_conv = nn.Sequential(
+            nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1),
+            nn.LeakyReLU(negative_slope=0.2, inplace=True),
+            nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1),
+            nn.LeakyReLU(negative_slope=0.2, inplace=True))
+
+    def forward(self, x):
+        return self.double_conv(x)
+
+
+class TripleConv(nn.Module):
+    """(convolution => [BN] => ReLU) * 3"""
+
+    def __init__(self, in_channels, out_channels, mid_channels=None):
+        super().__init__()
+        if not mid_channels:
+            mid_channels = out_channels
+        self.triple_conv = nn.Sequential(
+            nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1),
+            nn.LeakyReLU(negative_slope=0.2, inplace=True),
+            nn.Conv2d(mid_channels, mid_channels, kernel_size=3, padding=1),
+            nn.LeakyReLU(negative_slope=0.2, inplace=True),
+            nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1),
+            nn.LeakyReLU(negative_slope=0.2, inplace=True))
+
+    def forward(self, x):
+        return self.triple_conv(x)
+
+
+class DownConv(nn.Module):
+    """Downscaling with avgpool then double/triple conv"""
+
+    def __init__(self, in_channels, out_channels, num_conv=2):
+        super().__init__()
+        if num_conv == 2:
+            self.pool_conv = nn.Sequential(
+                nn.AvgPool2d(2), DoubleConv(in_channels, out_channels))
+        else:
+            self.pool_conv = nn.Sequential(
+                nn.AvgPool2d(2), TripleConv(in_channels, out_channels))
+
+    def forward(self, x):
+        return self.pool_conv(x)
+
+
+class UpCatConv(nn.Module):
+    """Upscaling then double conv"""
+
+    def __init__(self, in_channels, out_channels, bilinear=False):
+        super().__init__()
+
+        if bilinear:
+            self.up = nn.Upsample(
+                scale_factor=2, mode='bilinear', align_corners=True)
+            self.conv = DoubleConv(in_channels, out_channels)
+        else:
+            self.up = nn.Upsample(
+                scale_factor=2, mode='nearest', align_corners=None)
+            self.conv = DoubleConv(in_channels, out_channels)
+
+        self.subpixel = nn.PixelShuffle(2)
+
+    def interpolate(self, x):
+        tensor_temp = x
+        for i in range(3):
+            tensor_temp = torch.cat((tensor_temp, x), 1)
+        x = tensor_temp
+        x = self.subpixel(x)
+        return x
+
+    def forward(self, x1, x2):
+
+        x1 = self.interpolate(x1)
+
+        diffY = x2.size()[2] - x1.size()[2]
+        diffX = x2.size()[3] - x1.size()[3]
+        x1 = F.pad(
+            x1,
+            [diffX // 2, diffX - diffX // 2, diffY // 2, diffY - diffY // 2])
+
+        x = torch.cat([x2, x1], dim=1)
+        return self.conv(x)
diff --git a/modelscope/models/cv/video_deinterlace/models/deep_fourier_upsampling.py b/modelscope/models/cv/video_deinterlace/models/deep_fourier_upsampling.py
new file mode 100644
index 00000000..c6e0799d
--- /dev/null
+++ b/modelscope/models/cv/video_deinterlace/models/deep_fourier_upsampling.py
@@ -0,0 +1,47 @@
+# The implementation is adopted from Deep Fourier Upsampling,
+# made publicly available at https://github.com/manman1995/Deep-Fourier-Upsampling
+import numpy as np
+import torch
+import torch.fft
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class freup_Periodicpadding(nn.Module):
+
+    def __init__(self, channels):
+        super(freup_Periodicpadding, self).__init__()
+
+        self.amp_fuse = nn.Sequential(
+            nn.Conv2d(channels, channels, 1, 1, 0),
+            nn.LeakyReLU(0.1, inplace=False),
+            nn.Conv2d(channels, channels, 1, 1, 0))
+        self.pha_fuse = nn.Sequential(
+            nn.Conv2d(channels, channels, 1, 1, 0),
+            nn.LeakyReLU(0.1, inplace=False),
+            nn.Conv2d(channels, channels, 1, 1, 0))
+
+        self.post = nn.Conv2d(channels, channels, 1, 1, 0)
+
+    def forward(self, x):
+
+        N, C, H, W = x.shape
+
+        fft_x = torch.fft.fft(torch.fft.fft(x, dim=0), dim=1)
+        mag_x = torch.abs(fft_x)
+        pha_x = torch.angle(fft_x).detach()
+
+        Mag = self.amp_fuse(mag_x)
+        Pha = self.pha_fuse(pha_x)
+
+        amp_fuse = Mag.repeat(1, 1, 2, 2)
+        pha_fuse = Pha.repeat(1, 1, 2, 2)
+
+        real = amp_fuse * torch.cos(pha_fuse)
+        imag = amp_fuse * torch.sin(pha_fuse)
+        out = torch.complex(real, imag)
+
+        output = torch.fft.ifft(torch.fft.ifft(out, dim=0), dim=1)
+        output = torch.abs(output)
+
+        return self.post(output)
diff --git a/modelscope/models/cv/video_deinterlace/models/enh.py b/modelscope/models/cv/video_deinterlace/models/enh.py
new file mode 100644
index 00000000..e41b99ec
--- /dev/null
+++ b/modelscope/models/cv/video_deinterlace/models/enh.py
@@ -0,0 +1,71 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from modelscope.models.cv.video_deinterlace.models.archs import (DoubleConv,
+                                                                 DownConv,
+                                                                 TripleConv,
+                                                                 UpCatConv)
+from modelscope.models.cv.video_deinterlace.models.utils import warp
+
+
+class DeinterlaceEnh(nn.Module):
+    """Defines a U-Net video enhancement module
+
+    Arg:
+        num_in_ch (int): Channel number of inputs. Default: 3.
+        num_feat (int): Channel number of base intermediate features. Default: 64.
+    """
+
+    def __init__(self, num_in_ch=3, num_feat=64):
+        super(DeinterlaceEnh, self).__init__()
+        self.channel = num_in_ch
+
+        # extra convolutions
+        self.inconv2_1 = DoubleConv(num_in_ch * 3, 48)
+        # downsample
+        self.down2_0 = DownConv(48, 80)
+        self.down2_1 = DownConv(80, 144)
+        self.down2_2 = DownConv(144, 256)
+        self.down2_3 = DownConv(256, 448, num_conv=3)
+        # upsample
+        self.up2_3 = UpCatConv(704, 256)
+        self.up2_2 = UpCatConv(400, 144)
+        self.up2_1 = UpCatConv(224, 80)
+        self.up2_0 = UpCatConv(128, 48)
+        # extra convolutions
+        self.outconv2_1 = nn.Conv2d(48, num_in_ch, 3, 1, 1, bias=False)
+
+        self.offset_conv1 = nn.Sequential(
+            nn.Conv2d(num_in_ch * 2, num_feat, kernel_size=3, padding=1),
+            nn.LeakyReLU(negative_slope=0.2, inplace=True),
+            nn.Conv2d(num_feat, num_feat, kernel_size=3, padding=1),
+            nn.LeakyReLU(negative_slope=0.2, inplace=True),
+            nn.Conv2d(num_feat, num_in_ch * 2, kernel_size=3, padding=1))
+        self.offset_conv2 = nn.Sequential(
+            nn.Conv2d(num_in_ch * 2, num_feat, kernel_size=3, padding=1),
+            nn.LeakyReLU(negative_slope=0.2, inplace=True),
+            nn.Conv2d(num_feat, num_feat, kernel_size=3, padding=1),
+            nn.LeakyReLU(negative_slope=0.2, inplace=True),
+            nn.Conv2d(num_feat, num_in_ch * 2, kernel_size=3, padding=1))
+
+    def forward(self, frames):
+        frame1, frame2, frame3 = frames
+        flow1 = self.offset_conv1(torch.cat([frame1, frame2], 1))
+        warp1 = warp(frame1, flow1)
+        flow3 = self.offset_conv2(torch.cat([frame3, frame2], 1))
+        warp3 = warp(frame3, flow3)
+        x2_0 = self.inconv2_1(torch.cat((warp1, frame2, warp3), 1))
+        # downsample
+        x2_1 = self.down2_0(x2_0)  # 1/2
+        x2_2 = self.down2_1(x2_1)  # 1/4
+        x2_3 = self.down2_2(x2_2)  # 1/8
+        x2_4 = self.down2_3(x2_3)  # 1/16
+
+        x2_5 = self.up2_3(x2_4, x2_3)  # 1/8
+        x2_5 = self.up2_2(x2_5, x2_2)  # 1/4
+        x2_5 = self.up2_1(x2_5, x2_1)  # 1/2
+        x2_5 = self.up2_0(x2_5, x2_0)  # 1
+        out_final = self.outconv2_1(x2_5)
+        return out_final
diff --git a/modelscope/models/cv/video_deinterlace/models/fre.py b/modelscope/models/cv/video_deinterlace/models/fre.py
new file mode 100644
index 00000000..a2144baf
--- /dev/null
+++ b/modelscope/models/cv/video_deinterlace/models/fre.py
@@ -0,0 +1,93 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from modelscope.models.cv.video_deinterlace.models.archs import (DoubleConv,
+                                                                 DownConv,
+                                                                 TripleConv,
+                                                                 UpCatConv)
+from modelscope.models.cv.video_deinterlace.models.deep_fourier_upsampling import \
+    freup_Periodicpadding
+
+
+class DeinterlaceFre(nn.Module):
+
+    def __init__(self, num_in_ch=3, num_out_ch=3, ngf=64):
+        """Defines a video deinterlace module.
+           input a [b,c,h,w] tensor with range [0,1] as frame,
+           it will output a [b,c,h,w] tensor with range [0,1] whitout interlace.
+
+        Args:
+            num_in_ch (int): Channel number of inputs. Default: 3.
+            num_out_ch (int): Channel number of outputs. Default: 3.
+            ngf(int): Channel number of features. Default: 64.
+        """
+        super(DeinterlaceFre, self).__init__()
+
+        self.inconv = DoubleConv(num_in_ch, 48)
+        self.down_0 = DownConv(48, 80)
+        self.down_1 = DownConv(80, 144)
+
+        self.opfre_0 = freup_Periodicpadding(80)
+        self.opfre_1 = freup_Periodicpadding(144)
+
+        self.conv_up1 = nn.Conv2d(80, ngf, 3, 1, 1)
+        self.conv_up2 = nn.Conv2d(144, 80, 3, 1, 1)
+
+        self.conv_hr = nn.Conv2d(ngf, ngf, 3, 1, 1)
+        self.conv_last = nn.Conv2d(ngf, num_out_ch, 3, 1, 1)
+        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
+
+        self.enh_inconv = DoubleConv(num_in_ch + num_out_ch, 48)
+        # downsample
+        self.enh_down_0 = DownConv(48, 80)
+        self.enh_down_1 = DownConv(80, 144)
+        self.enh_down_2 = DownConv(144, 256)
+        self.enh_down_3 = DownConv(256, 448, num_conv=3)
+        # upsample
+        self.enh_up_3 = UpCatConv(704, 256)
+        self.enh_up_2 = UpCatConv(400, 144)
+        self.enh_up_1 = UpCatConv(224, 80)
+        self.enh_up_0 = UpCatConv(128, 48)
+        # extra convolutions
+        self.enh_outconv = nn.Conv2d(48, num_out_ch, 3, 1, 1, bias=False)
+
+    def interpolate(self, feat, x2, fn):
+        x1f = fn(feat)
+        x1 = F.interpolate(feat, scale_factor=2, mode='nearest')
+        diffY = x2.size()[2] - x1.size()[2]
+        diffX = x2.size()[3] - x1.size()[3]
+        x1f = F.pad(
+            x1f,
+            [diffX // 2, diffX - diffX // 2, diffY // 2, diffY - diffY // 2])
+        x1 = F.pad(
+            x1,
+            [diffX // 2, diffX - diffX // 2, diffY // 2, diffY - diffY // 2])
+        return x1 + x1f
+
+    def forward(self, x):
+        x1_0 = self.inconv(x)
+        # downsample
+        x1_1 = self.down_0(x1_0)  # 1/2
+        x1_2 = self.down_1(x1_1)  # 1/4
+
+        feat = self.lrelu(
+            self.conv_up2(self.interpolate(x1_2, x1_1, self.opfre_1)))
+        feat = self.lrelu(
+            self.conv_up1(self.interpolate(feat, x1_0, self.opfre_0)))
+        x_new = self.conv_last(self.lrelu(self.conv_hr(feat)))
+
+        x2_0 = self.enh_inconv(torch.cat([x_new, x], 1))
+        # downsample
+        x2_1 = self.enh_down_0(x2_0)  # 1/2
+        x2_2 = self.enh_down_1(x2_1)  # 1/4
+        x2_3 = self.enh_down_2(x2_2)  # 1/8
+        x2_4 = self.enh_down_3(x2_3)  # 1/16
+
+        x2_5 = self.enh_up_3(x2_4, x2_3)  # 1/8
+        x2_5 = self.enh_up_2(x2_5, x2_2)  # 1/4
+        x2_5 = self.enh_up_1(x2_5, x2_1)  # 1/2
+        x2_5 = self.enh_up_0(x2_5, x2_0)  # 1
+        out = self.enh_outconv(x2_5)
+        return out
diff --git a/modelscope/models/cv/video_deinterlace/models/utils.py b/modelscope/models/cv/video_deinterlace/models/utils.py
new file mode 100644
index 00000000..f5f1f5b2
--- /dev/null
+++ b/modelscope/models/cv/video_deinterlace/models/utils.py
@@ -0,0 +1,107 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import torch
+
+
+def warp(im, flow):
+
+    def _repeat(x, n_repeats):
+        rep = torch.ones((1, n_repeats), dtype=torch.int32)
+        x = torch.matmul(x.view(-1, 1).int(), rep)
+        return x.view(-1)
+
+    def _repeat2(x, n_repeats):
+        rep = torch.ones((n_repeats, 1), dtype=torch.int32)
+        x = torch.matmul(rep, x.view(1, -1).int())
+        return x.view(-1)
+
+    def _interpolate(im, x, y):
+        num_batch, channels, height, width = im.shape
+
+        x = x.float()
+        y = y.float()
+        max_y = height - 1
+        max_x = width - 1
+
+        x = _repeat2(torch.arange(0, width),
+                     height * num_batch).float().cuda() + x * 64
+        y = _repeat2(_repeat(torch.arange(0, height), width),
+                     num_batch).float().cuda() + y * 64
+
+        # do sampling
+        x0 = (torch.floor(x.cpu())).int()
+        x1 = x0 + 1
+        y0 = (torch.floor(y.cpu())).int()
+        y1 = y0 + 1
+
+        x0 = torch.clamp(x0, 0, max_x)
+        x1 = torch.clamp(x1, 0, max_x)
+        y0 = torch.clamp(y0, 0, max_y)
+        y1 = torch.clamp(y1, 0, max_y)
+        dim2 = width
+        dim1 = width * height
+        base = _repeat(torch.arange(num_batch) * dim1, height * width)
+        base_y0 = base + y0 * dim2
+        base_y1 = base + y1 * dim2
+        idx_a = base_y0 + x0
+        idx_b = base_y1 + x0
+        idx_c = base_y0 + x1
+        idx_d = base_y1 + x1
+
+        # use indices to lookup pixels in the flat image and restore
+        im_flat = im.permute(0, 2, 3, 1)
+        im_flat = im_flat.reshape((-1, channels)).float()
+        Ia = torch.gather(
+            im_flat, dim=0, index=torch.unsqueeze(idx_a, 1).long().cuda())
+        Ib = torch.gather(im_flat, 0, torch.unsqueeze(idx_b, 1).long().cuda())
+        Ic = torch.gather(im_flat, 0, torch.unsqueeze(idx_c, 1).long().cuda())
+        Id = torch.gather(im_flat, 0, torch.unsqueeze(idx_d, 1).long().cuda())
+        # and finally calculate interpolated values
+        x0_f = x0.float().cuda()
+        x1_f = x1.float().cuda()
+        y0_f = y0.float().cuda()
+        y1_f = y1.float().cuda()
+        wa = torch.unsqueeze(((x1_f - x) * (y1_f - y)), 1)
+        wb = torch.unsqueeze(((x1_f - x) * (y - y0_f)), 1)
+        wc = torch.unsqueeze(((x - x0_f) * (y1_f - y)), 1)
+        wd = torch.unsqueeze(((x - x0_f) * (y - y0_f)), 1)
+        output = wa * Ia + wb * Ib + wc * Ic + wd * Id
+        return output
+
+    def _meshgrid(height, width):
+        x_t = torch.matmul(
+            torch.ones((height, 1)),
+            torch.unsqueeze(torch.linspace(-0.1, 0.1, width),
+                            1).permute(1, 0)).cuda()
+        y_t = torch.matmul(
+            torch.unsqueeze(torch.linspace(-0.1, 0.1, height), 1),
+            torch.ones((1, width))).cuda()
+
+        x_t_flat = x_t.reshape((1, -1))
+        y_t_flat = y_t.reshape((1, -1))
+
+        ones = torch.ones_like(x_t_flat).cuda()
+        grid = torch.cat((x_t_flat, y_t_flat, ones), 0)
+
+        return grid
+
+    def _warp(x_s, y_s, input_dim):
+        num_batch, num_channels, height, width = input_dim.shape
+        # out_height, out_width = out_size
+
+        x_s_flat = x_s.reshape(-1)
+        y_s_flat = y_s.reshape(-1)
+
+        input_transformed = _interpolate(input_dim, x_s_flat, y_s_flat)
+        output = input_transformed.reshape(
+            (num_batch, num_channels, height, width))
+        return output
+
+    n_dims = int(flow.shape[1]) // 2
+    dx = flow[:, :n_dims, :, :]
+    dy = flow[:, n_dims:, :, :]
+
+    output = torch.cat([
+        _warp(dx[:, idx:idx + 1, :, :], dy[:, idx:idx + 1, :, :],
+              im[:, idx:idx + 1, :, :]) for idx in range(im.shape[1])
+    ], 1)
+    return output
diff --git a/modelscope/models/cv/video_multi_object_tracking/utils/visualization.py b/modelscope/models/cv/video_multi_object_tracking/utils/visualization.py
index 8ed7b601..225733ac 100644
--- a/modelscope/models/cv/video_multi_object_tracking/utils/visualization.py
+++ b/modelscope/models/cv/video_multi_object_tracking/utils/visualization.py
@@ -54,7 +54,8 @@ def plot_tracking(image,
     return im
 
 
-def show_multi_object_tracking_result(video_in_path, bboxes, video_save_path):
+def show_multi_object_tracking_result(video_in_path, bboxes, labels,
+                                      video_save_path):
     cap = cv2.VideoCapture(video_in_path)
     frame_idx = 0
     while (cap.isOpened()):
@@ -66,13 +67,13 @@ def show_multi_object_tracking_result(video_in_path, bboxes, video_save_path):
                                 ' can not be correctly decoded by OpenCV.')
             else:
                 break
-        cur_frame_boxes = []
-        cur_obj_ids = []
-        for box in bboxes:
-            if box[0] == frame_idx:
-                cur_frame_boxes.append(
-                    [box[2], box[3], box[4] - box[2], box[5] - box[3]])
-                cur_obj_ids.append(box[1])
+        cur_frame_boxes = bboxes[frame_idx - 1]
+        cur_obj_ids = labels[frame_idx - 1]
+        for i in range(len(cur_frame_boxes)):
+            cur_frame_boxes[i][
+                2] = cur_frame_boxes[i][2] - cur_frame_boxes[i][0]
+            cur_frame_boxes[i][
+                3] = cur_frame_boxes[i][3] - cur_frame_boxes[i][1]
         if frame_idx == 1:
             size = (frame.shape[1], frame.shape[0])
             fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
diff --git a/modelscope/models/cv/video_panoptic_segmentation/__init__.py b/modelscope/models/cv/video_panoptic_segmentation/__init__.py
new file mode 100644
index 00000000..72058a0b
--- /dev/null
+++ b/modelscope/models/cv/video_panoptic_segmentation/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .video_k_net import (
+        VideoKNet, )
+
+else:
+    _import_structure = {'video_k_net': ['VideoKNet']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/video_panoptic_segmentation/backbone/__init__.py b/modelscope/models/cv/video_panoptic_segmentation/backbone/__init__.py
new file mode 100644
index 00000000..b937315b
--- /dev/null
+++ b/modelscope/models/cv/video_panoptic_segmentation/backbone/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
diff --git a/modelscope/models/cv/video_panoptic_segmentation/backbone/swin_checkpoint.py b/modelscope/models/cv/video_panoptic_segmentation/backbone/swin_checkpoint.py
new file mode 100644
index 00000000..09e6c9f4
--- /dev/null
+++ b/modelscope/models/cv/video_panoptic_segmentation/backbone/swin_checkpoint.py
@@ -0,0 +1,262 @@
+# The implementation is adopted from Video-K-Net,
+# made publicly available at https://github.com/lxtGH/Video-K-Net
+
+import os.path as osp
+import pkgutil
+from collections import OrderedDict
+from importlib import import_module
+
+import torch
+import torchvision
+from torch.nn import functional as F
+
+
+def load_state_dict(module, state_dict, strict=False, logger=None):
+    """Load state_dict to a module.
+
+    This method is modified from :meth:`torch.nn.Module.load_state_dict`.
+    Default value for ``strict`` is set to ``False`` and the message for
+    param mismatch will be shown even if strict is False.
+    Args:
+        module (Module): Module that receives the state_dict.
+        state_dict (OrderedDict): Weights.
+        strict (bool): whether to strictly enforce that the keys
+            in :attr:`state_dict` match the keys returned by this module's
+            :meth:`~torch.nn.Module.state_dict` function. Default: ``False``.
+        logger (:obj:`logging.Logger`, optional): Logger to log the error
+            message. If not specified, print function will be used.
+    """
+    unexpected_keys = []
+    all_missing_keys = []
+    err_msg = []
+
+    metadata = getattr(state_dict, '_metadata', None)
+    state_dict = state_dict.copy()
+    if metadata is not None:
+        state_dict._metadata = metadata
+
+    # use _load_from_state_dict to enable checkpoint version control
+    def load(module, prefix=''):
+        local_metadata = {} if metadata is None else metadata.get(
+            prefix[:-1], {})
+        module._load_from_state_dict(state_dict, prefix, local_metadata, True,
+                                     all_missing_keys, unexpected_keys,
+                                     err_msg)
+        for name, child in module._modules.items():
+            if child is not None:
+                load(child, prefix + name + '.')
+
+    load(module)
+    load = None  # break load->load reference cycle
+
+    # ignore "num_batches_tracked" of BN layers
+    missing_keys = [
+        key for key in all_missing_keys if 'num_batches_tracked' not in key
+    ]
+
+    if unexpected_keys:
+        err_msg.append('unexpected key in source '
+                       f'state_dict: {", ".join(unexpected_keys)}\n')
+    if missing_keys:
+        err_msg.append(
+            f'missing keys in source state_dict: {", ".join(missing_keys)}\n')
+
+    if len(err_msg) > 0:
+        err_msg.insert(
+            0, 'The model and loaded state dict do not match exactly\n')
+        err_msg = '\n'.join(err_msg)
+        if strict:
+            raise RuntimeError(err_msg)
+        elif logger is not None:
+            logger.warning(err_msg)
+        else:
+            print(err_msg)
+
+
+def get_torchvision_models():
+    model_urls = dict()
+    for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__):
+        if ispkg:
+            continue
+        _zoo = import_module(f'torchvision.models.{name}')
+        if hasattr(_zoo, 'model_urls'):
+            _urls = getattr(_zoo, 'model_urls')
+            model_urls.update(_urls)
+    return model_urls
+
+
+def _process_mmcls_checkpoint(checkpoint):
+    state_dict = checkpoint['state_dict']
+    new_state_dict = OrderedDict()
+    for k, v in state_dict.items():
+        if k.startswith('backbone.'):
+            new_state_dict[k[9:]] = v
+    new_checkpoint = dict(state_dict=new_state_dict)
+
+    return new_checkpoint
+
+
+def _load_checkpoint(filename, map_location=None):
+    """Load checkpoint from somewhere (modelzoo, file, url).
+
+    Args:
+        filename (str): Accept local filepath, URL, ``torchvision://xxx``,
+            ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
+            details.
+        map_location (str | None): Same as :func:`torch.load`. Default: None.
+    Returns:
+        dict | OrderedDict: The loaded checkpoint. It can be either an
+            OrderedDict storing model weights or a dict containing other
+            information, which depends on the checkpoint.
+    """
+    if not osp.isfile(filename):
+        raise IOError(f'{filename} is not a checkpoint file')
+    checkpoint = torch.load(filename, map_location=map_location)
+    return checkpoint
+
+
+def load_checkpoint(model,
+                    filename,
+                    map_location='cpu',
+                    strict=False,
+                    logger=None):
+    """Load checkpoint from a file or URI.
+
+    Args:
+        model (Module): Module to load checkpoint.
+        filename (str): Accept local filepath, URL, ``torchvision://xxx``,
+            ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
+            details.
+        map_location (str): Same as :func:`torch.load`.
+        strict (bool): Whether to allow different params for the model and
+            checkpoint.
+        logger (:mod:`logging.Logger` or None): The logger for error message.
+    Returns:
+        dict or OrderedDict: The loaded checkpoint.
+    """
+    checkpoint = _load_checkpoint(filename, map_location)
+    # OrderedDict is a subclass of dict
+    if not isinstance(checkpoint, dict):
+        raise RuntimeError(
+            f'No state_dict found in checkpoint file {filename}')
+    # get state_dict from checkpoint
+    if 'state_dict' in checkpoint:
+        state_dict = checkpoint['state_dict']
+    elif 'model' in checkpoint:
+        state_dict = checkpoint['model']
+    else:
+        state_dict = checkpoint
+    # strip prefix of state_dict
+    if list(state_dict.keys())[0].startswith('module.'):
+        state_dict = {k[7:]: v for k, v in state_dict.items()}
+
+    # reshape absolute position embedding
+    if state_dict.get('absolute_pos_embed') is not None:
+        absolute_pos_embed = state_dict['absolute_pos_embed']
+        N1, L, C1 = absolute_pos_embed.size()
+        N2, C2, H, W = model.absolute_pos_embed.size()
+        if N1 != N2 or C1 != C2 or L != H * W:
+            logger.warning('Error in loading absolute_pos_embed, pass')
+        else:
+            state_dict['absolute_pos_embed'] = absolute_pos_embed.view(
+                N2, H, W, C2).permute(0, 3, 1, 2)
+
+    # interpolate position bias table if needed
+    relative_position_bias_table_keys = [
+        k for k in state_dict.keys() if 'relative_position_bias_table' in k
+    ]
+    for table_key in relative_position_bias_table_keys:
+        table_pretrained = state_dict[table_key]
+        table_current = model.state_dict()[table_key]
+        L1, nH1 = table_pretrained.size()
+        L2, nH2 = table_current.size()
+        if nH1 != nH2:
+            logger.warning(f'Error in loading {table_key}, pass')
+        else:
+            if L1 != L2:
+                S1 = int(L1**0.5)
+                S2 = int(L2**0.5)
+                table_pretrained_resized = F.interpolate(
+                    table_pretrained.permute(1, 0).view(1, nH1, S1, S1),
+                    size=(S2, S2),
+                    mode='bicubic')
+                state_dict[table_key] = table_pretrained_resized.view(
+                    nH2, L2).permute(1, 0)
+
+    # load state_dict
+    load_state_dict(model, state_dict, strict, logger)
+    return checkpoint
+
+
+def weights_to_cpu(state_dict):
+    """Copy a model state_dict to cpu.
+
+    Args:
+        state_dict (OrderedDict): Model weights on GPU.
+    Returns:
+        OrderedDict: Model weights on GPU.
+    """
+    state_dict_cpu = OrderedDict()
+    for key, val in state_dict.items():
+        state_dict_cpu[key] = val.cpu()
+    return state_dict_cpu
+
+
+def _save_to_state_dict(module, destination, prefix, keep_vars):
+    """Saves module state to `destination` dictionary.
+
+    This method is modified from :meth:`torch.nn.Module._save_to_state_dict`.
+    Args:
+        module (nn.Module): The module to generate state_dict.
+        destination (dict): A dict where state will be stored.
+        prefix (str): The prefix for parameters and buffers used in this
+            module.
+    """
+    for name, param in module._parameters.items():
+        if param is not None:
+            destination[prefix + name] = param if keep_vars else param.detach()
+    for name, buf in module._buffers.items():
+        # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d
+        if buf is not None:
+            destination[prefix + name] = buf if keep_vars else buf.detach()
+
+
+def get_state_dict(module, destination=None, prefix='', keep_vars=False):
+    """Returns a dictionary containing a whole state of the module.
+
+    Both parameters and persistent buffers (e.g. running averages) are
+    included. Keys are corresponding parameter and buffer names.
+    This method is modified from :meth:`torch.nn.Module.state_dict` to
+    recursively check parallel module in case that the model has a complicated
+    structure, e.g., nn.Module(nn.Module(DDP)).
+    Args:
+        module (nn.Module): The module to generate state_dict.
+        destination (OrderedDict): Returned dict for the state of the
+            module.
+        prefix (str): Prefix of the key.
+        keep_vars (bool): Whether to keep the variable property of the
+            parameters. Default: False.
+    Returns:
+        dict: A dictionary containing a whole state of the module.
+    """
+    # recursively check parallel module in case that the model has a
+    # complicated structure, e.g., nn.Module(nn.Module(DDP))
+    # if is_module_wrapper(module):
+    #     module = module.module
+
+    # below is the same as torch.nn.Module.state_dict()
+    if destination is None:
+        destination = OrderedDict()
+        destination._metadata = OrderedDict()
+    destination._metadata[prefix[:-1]] = local_metadata = dict(
+        version=module._version)
+    _save_to_state_dict(module, destination, prefix, keep_vars)
+    for name, child in module._modules.items():
+        if child is not None:
+            get_state_dict(
+                child, destination, prefix + name + '.', keep_vars=keep_vars)
+    for hook in module._state_dict_hooks.values():
+        hook_result = hook(module, destination, prefix, local_metadata)
+        if hook_result is not None:
+            destination = hook_result
+    return destination
diff --git a/modelscope/models/cv/video_panoptic_segmentation/backbone/swin_transformer.py b/modelscope/models/cv/video_panoptic_segmentation/backbone/swin_transformer.py
new file mode 100644
index 00000000..10af1389
--- /dev/null
+++ b/modelscope/models/cv/video_panoptic_segmentation/backbone/swin_transformer.py
@@ -0,0 +1,718 @@
+# The implementation is adopted from Video-K-Net,
+# made publicly available at https://github.com/lxtGH/Video-K-Net
+# --------------------------------------------------------
+# Swin Transformer
+# Copyright (c) 2021 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ze Liu, Yutong Lin, Yixuan Wei
+# --------------------------------------------------------
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+from mmdet.utils import get_root_logger
+from timm.models.layers import DropPath, to_2tuple, trunc_normal_
+
+from .swin_checkpoint import load_checkpoint
+
+
+class Mlp(nn.Module):
+    """Multilayer perceptron."""
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+def window_partition(x, window_size):
+    """
+    Args:
+        x: (B, H, W, C)
+        window_size (int): window size
+    Returns:
+        windows: (num_windows*B, window_size, window_size, C)
+    """
+    B, H, W, C = x.shape
+    x = x.view(B, H // window_size, window_size, W // window_size, window_size,
+               C)
+    windows = x.permute(0, 1, 3, 2, 4,
+                        5).contiguous().view(-1, window_size, window_size, C)
+    return windows
+
+
+def window_reverse(windows, window_size, H, W):
+    """
+    Args:
+        windows: (num_windows*B, window_size, window_size, C)
+        window_size (int): Window size
+        H (int): Height of image
+        W (int): Width of image
+    Returns:
+        x: (B, H, W, C)
+    """
+    B = int(windows.shape[0] / (H * W / window_size / window_size))
+    x = windows.view(B, H // window_size, W // window_size, window_size,
+                     window_size, -1)
+    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+    return x
+
+
+class WindowAttention(nn.Module):
+    """Window based multi-head self attention (W-MSA) module with relative
+    position bias.
+
+    It supports both of shifted and non-shifted window.
+    Args:
+        dim (int): Number of input channels.
+        window_size (tuple[int]): The height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
+        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+    """
+
+    def __init__(self,
+                 dim,
+                 window_size,
+                 num_heads,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.):
+
+        super().__init__()
+        self.dim = dim
+        self.window_size = window_size  # Wh, Ww
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+
+        # define a parameter table of relative position bias
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
+                        num_heads))  # 2*Wh-1 * 2*Ww-1, nH
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(self.window_size[0])
+        coords_w = torch.arange(self.window_size[1])
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+        relative_coords = coords_flatten[:, :,
+                                         None] - coords_flatten[:,
+                                                                None, :]  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.permute(
+            1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :,
+                        0] += self.window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+        self.register_buffer('relative_position_index',
+                             relative_position_index)
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        trunc_normal_(self.relative_position_bias_table, std=.02)
+        self.softmax = nn.Softmax(dim=-1)
+
+    def forward(self, x, mask=None):
+        """Forward function.
+
+        Args:
+            x: input features with shape of (num_windows*B, N, C)
+            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
+        """
+        B_, N, C = x.shape
+        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads,
+                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[
+            2]  # make torchscript happy (cannot use tensor as tuple)
+
+        q = q * self.scale
+        attn = (q @ k.transpose(-2, -1))
+
+        relative_position_bias = self.relative_position_bias_table[
+            self.relative_position_index.view(-1)].view(
+                self.window_size[0] * self.window_size[1],
+                self.window_size[0] * self.window_size[1],
+                -1)  # Wh*Ww,Wh*Ww,nH
+        relative_position_bias = relative_position_bias.permute(
+            2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+        attn = attn + relative_position_bias.unsqueeze(0)
+
+        if mask is not None:
+            nW = mask.shape[0]
+            attn = attn.view(B_ // nW, nW, self.num_heads, N,
+                             N) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, N, N)
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class SwinTransformerBlock(nn.Module):
+    """Swin Transformer Block.
+
+    Args:
+        dim (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (int): Window size.
+        shift_size (int): Shift size for SW-MSA.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 window_size=7,
+                 shift_size=0,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        assert 0 <= self.shift_size < self.window_size, 'shift_size must in 0-window_size'
+
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention(
+            dim,
+            window_size=to_2tuple(self.window_size),
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop)
+
+        self.drop_path = DropPath(
+            drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop)
+
+        self.H = None
+        self.W = None
+
+    def forward(self, x, mask_matrix):
+        """Forward function.
+
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+            mask_matrix: Attention mask for cyclic shift.
+        """
+        B, L, C = x.shape
+        H, W = self.H, self.W
+        assert L == H * W, 'input feature has wrong size'
+
+        shortcut = x
+        x = self.norm1(x)
+        x = x.view(B, H, W, C)
+
+        # pad feature maps to multiples of window size
+        pad_l = pad_t = 0
+        pad_r = (self.window_size - W % self.window_size) % self.window_size
+        pad_b = (self.window_size - H % self.window_size) % self.window_size
+        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
+        _, Hp, Wp, _ = x.shape
+
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_x = torch.roll(
+                x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
+            attn_mask = mask_matrix
+        else:
+            shifted_x = x
+            attn_mask = None
+
+        # partition windows
+        x_windows = window_partition(
+            shifted_x, self.window_size)  # nW*B, window_size, window_size, C
+        x_windows = x_windows.view(-1, self.window_size * self.window_size,
+                                   C)  # nW*B, window_size*window_size, C
+
+        # W-MSA/SW-MSA
+        attn_windows = self.attn(
+            x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C
+
+        # merge windows
+        attn_windows = attn_windows.view(-1, self.window_size,
+                                         self.window_size, C)
+        shifted_x = window_reverse(attn_windows, self.window_size, Hp,
+                                   Wp)  # B H' W' C
+
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            x = torch.roll(
+                shifted_x,
+                shifts=(self.shift_size, self.shift_size),
+                dims=(1, 2))
+        else:
+            x = shifted_x
+
+        if pad_r > 0 or pad_b > 0:
+            x = x[:, :H, :W, :].contiguous()
+
+        x = x.view(B, H * W, C)
+
+        # FFN
+        x = shortcut + self.drop_path(x)
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+
+        return x
+
+
+class PatchMerging(nn.Module):
+    """ Patch Merging Layer
+    Args:
+        dim (int): Number of input channels.
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
+        self.norm = norm_layer(4 * dim)
+
+    def forward(self, x, H, W):
+        """Forward function.
+
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        """
+        B, L, C = x.shape
+        assert L == H * W, 'input feature has wrong size'
+
+        x = x.view(B, H, W, C)
+
+        # padding
+        pad_input = (H % 2 == 1) or (W % 2 == 1)
+        if pad_input:
+            x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
+
+        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
+        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
+        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
+        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
+        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
+        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C
+
+        x = self.norm(x)
+        x = self.reduction(x)
+
+        return x
+
+
+class BasicLayer(nn.Module):
+    """A basic Swin Transformer layer for one stage.
+
+    Args:
+        dim (int): Number of feature channels
+        depth (int): Depths of this stage.
+        num_heads (int): Number of attention head.
+        window_size (int): Local window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
+        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
+        with_cp (bool): Whether to use checkpointing to save memory. Default: False.
+    """
+
+    def __init__(self,
+                 dim,
+                 depth,
+                 num_heads,
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 norm_layer=nn.LayerNorm,
+                 downsample=None,
+                 with_cp=False):
+        super().__init__()
+        self.window_size = window_size
+        self.shift_size = window_size // 2
+        self.depth = depth
+        self.with_cp = with_cp
+
+        # build blocks
+        self.blocks = nn.ModuleList([
+            SwinTransformerBlock(
+                dim=dim,
+                num_heads=num_heads,
+                window_size=window_size,
+                shift_size=0 if (i % 2 == 0) else window_size // 2,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop,
+                attn_drop=attn_drop,
+                drop_path=drop_path[i]
+                if isinstance(drop_path, list) else drop_path,
+                norm_layer=norm_layer) for i in range(depth)
+        ])
+
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
+        else:
+            self.downsample = None
+
+    def forward(self, x, H, W):
+        """Forward function.
+
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        """
+
+        # calculate attention mask for SW-MSA
+        Hp = int(np.ceil(H / self.window_size)) * self.window_size
+        Wp = int(np.ceil(W / self.window_size)) * self.window_size
+        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
+        h_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size,
+                          -self.shift_size), slice(-self.shift_size, None))
+        w_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size,
+                          -self.shift_size), slice(-self.shift_size, None))
+        cnt = 0
+        for h in h_slices:
+            for w in w_slices:
+                img_mask[:, h, w, :] = cnt
+                cnt += 1
+
+        mask_windows = window_partition(
+            img_mask, self.window_size)  # nW, window_size, window_size, 1
+        mask_windows = mask_windows.view(-1,
+                                         self.window_size * self.window_size)
+        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+        attn_mask = attn_mask.masked_fill(attn_mask != 0,
+                                          float(-100.0)).masked_fill(
+                                              attn_mask == 0, float(0.0))
+        attn_mask = attn_mask.to(dtype=x.dtype)
+        for blk in self.blocks:
+            blk.H, blk.W = H, W
+            if self.with_cp:
+                x = checkpoint.checkpoint(blk, x, attn_mask)
+            else:
+                x = blk(x, attn_mask)
+        if self.downsample is not None:
+            x_down = self.downsample(x, H, W)
+            Wh, Ww = (H + 1) // 2, (W + 1) // 2
+            return x, H, W, x_down, Wh, Ww
+        else:
+            return x, H, W, x, H, W
+
+
+class PatchEmbed(nn.Module):
+    """ Image to Patch Embedding
+    Args:
+        patch_size (int): Patch token size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dims (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Module, optional): Normalization layer. Default: None
+    """
+
+    def __init__(self,
+                 patch_size=4,
+                 in_chans=3,
+                 embed_dims=96,
+                 norm_layer=None):
+        super().__init__()
+        patch_size = to_2tuple(patch_size)
+        self.patch_size = patch_size
+
+        self.in_chans = in_chans
+        self.embed_dims = embed_dims
+
+        self.proj = nn.Conv2d(
+            in_chans, embed_dims, kernel_size=patch_size, stride=patch_size)
+        if norm_layer is not None:
+            self.norm = norm_layer(embed_dims)
+        else:
+            self.norm = None
+
+    def forward(self, x):
+        """Forward function."""
+        # padding
+        _, _, H, W = x.size()
+        if W % self.patch_size[1] != 0:
+            x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
+        if H % self.patch_size[0] != 0:
+            x = F.pad(x,
+                      (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
+
+        x = self.proj(x)  # B C Wh Ww
+        if self.norm is not None:
+            Wh, Ww = x.size(2), x.size(3)
+            x = x.flatten(2).transpose(1, 2)
+            x = self.norm(x)
+            x = x.transpose(1, 2).view(-1, self.embed_dims, Wh, Ww)
+
+        return x
+
+
+class SwinTransformerDIY(nn.Module):
+    """ Swin Transformer backbone.
+        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
+          https://arxiv.org/pdf/2103.14030
+    Args:
+        pretrain_img_size (int): Input image size for training the pretrained model,
+            used in absolute postion embedding. Default 224.
+        patch_size (int | tuple(int)): Patch size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dims (int): Number of linear projection output channels. Default: 96.
+        depths (tuple[int]): Depths of each Swin Transformer stage.
+        num_heads (tuple[int]): Number of attention head of each stage.
+        window_size (int): Window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
+        drop_rate (float): Dropout rate.
+        attn_drop_rate (float): Attention dropout rate. Default: 0.
+        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
+        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
+        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
+        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
+        out_indices (Sequence[int]): Output from which stages.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+        with_cp (bool): Whether to use checkpointing to save memory. Default: False.
+    """
+
+    def __init__(self,
+                 pretrain_img_size=224,
+                 patch_size=4,
+                 in_chans=3,
+                 embed_dims=96,
+                 depths=[2, 2, 6, 2],
+                 num_heads=[3, 6, 12, 24],
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.2,
+                 norm_layer=nn.LayerNorm,
+                 use_abs_pos_embed=False,
+                 patch_norm=True,
+                 out_indices=(0, 1, 2, 3),
+                 frozen_stages=-1,
+                 with_cp=False,
+                 output_img=False,
+                 pretrained=None):
+        super().__init__()
+        self.output_img = output_img
+
+        self.pretrain_img_size = pretrain_img_size
+        self.num_layers = len(depths)
+        self.embed_dims = embed_dims
+        self.ape = use_abs_pos_embed
+        self.patch_norm = patch_norm
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.pretrained = pretrained
+
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dims=embed_dims,
+            norm_layer=norm_layer if self.patch_norm else None)
+
+        # absolute position embedding
+        if self.ape:
+            pretrain_img_size = to_2tuple(pretrain_img_size)
+            patch_size = to_2tuple(patch_size)
+            patches_resolution = [
+                pretrain_img_size[0] // patch_size[0],
+                pretrain_img_size[1] // patch_size[1]
+            ]
+
+            self.absolute_pos_embed = nn.Parameter(
+                torch.zeros(1, embed_dims, patches_resolution[0],
+                            patches_resolution[1]))
+            trunc_normal_(self.absolute_pos_embed, std=.02)
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # stochastic depth
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
+        ]  # stochastic depth decay rule
+
+        # build layers
+        self.layers = nn.ModuleList()
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                dim=int(embed_dims * 2**i_layer),
+                depth=depths[i_layer],
+                num_heads=num_heads[i_layer],
+                window_size=window_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                norm_layer=norm_layer,
+                downsample=PatchMerging if
+                (i_layer < self.num_layers - 1) else None,
+                with_cp=with_cp)
+            self.layers.append(layer)
+
+        num_features = [int(embed_dims * 2**i) for i in range(self.num_layers)]
+        self.num_features = num_features
+
+        # add a norm layer for each output
+        for i_layer in out_indices:
+            layer = norm_layer(num_features[i_layer])
+            layer_name = f'norm{i_layer}'
+            self.add_module(layer_name, layer)
+
+        self._freeze_stages()
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.requires_grad = False
+
+        if self.frozen_stages >= 1 and self.ape:
+            self.absolute_pos_embed.requires_grad = False
+
+        if self.frozen_stages >= 2:
+            self.pos_drop.eval()
+            for i in range(0, self.frozen_stages - 1):
+                m = self.layers[i]
+                m.eval()
+                for param in m.parameters():
+                    param.requires_grad = False
+
+    def init_weights(self, pretrained=None):
+        """Initialize the weights in backbone.
+
+        Args:
+            pretrained (str, optional): Path to pre-trained weights.
+                Defaults to None.
+        """
+        if pretrained is None and self.pretrained is not None:
+            pretrained = self.pretrained
+
+        def _init_weights(m):
+            if isinstance(m, nn.Linear):
+                trunc_normal_(m.weight, std=.02)
+                if isinstance(m, nn.Linear) and m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.LayerNorm):
+                nn.init.constant_(m.bias, 0)
+                nn.init.constant_(m.weight, 1.0)
+
+        if isinstance(pretrained, str):
+            self.apply(_init_weights)
+            logger = get_root_logger()
+            load_checkpoint(self, pretrained, strict=False, logger=logger)
+        elif pretrained is None:
+            self.apply(_init_weights)
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+    def forward(self, x):
+        """Forward function."""
+        x_idty = x
+        x = self.patch_embed(x)
+
+        Wh, Ww = x.size(2), x.size(3)
+        if self.ape:
+            # interpolate the position embedding to the corresponding size
+            absolute_pos_embed = F.interpolate(
+                self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
+            x = (x + absolute_pos_embed).flatten(2).transpose(1,
+                                                              2)  # B Wh*Ww C
+        else:
+            x = x.flatten(2).transpose(1, 2)
+        x = self.pos_drop(x)
+
+        outs = []
+        for i in range(self.num_layers):
+            layer = self.layers[i]
+            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
+
+            if i in self.out_indices:
+                norm_layer = getattr(self, f'norm{i}')
+                x_out = norm_layer(x_out)
+
+                out = x_out.view(-1, H, W,
+                                 self.num_features[i]).permute(0, 3, 1,
+                                                               2).contiguous()
+                outs.append(out)
+
+        if self.output_img:
+            outs.insert(0, x_idty)
+        return tuple(outs)
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keep layers freezed."""
+        super().train(mode)
+        self._freeze_stages()
diff --git a/modelscope/models/cv/video_panoptic_segmentation/head/__init__.py b/modelscope/models/cv/video_panoptic_segmentation/head/__init__.py
new file mode 100644
index 00000000..81693b85
--- /dev/null
+++ b/modelscope/models/cv/video_panoptic_segmentation/head/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .kernel_update_head import (
+        VideoKernelUpdateHead, )
+
+else:
+    _import_structure = {'kernel_update_head': ['VideoKernelUpdateHead']}
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/video_panoptic_segmentation/head/kernel_head.py b/modelscope/models/cv/video_panoptic_segmentation/head/kernel_head.py
new file mode 100644
index 00000000..443c826f
--- /dev/null
+++ b/modelscope/models/cv/video_panoptic_segmentation/head/kernel_head.py
@@ -0,0 +1,223 @@
+# The implementation is adopted from Video-K-Net,
+# made publicly available at https://github.com/lxtGH/Video-K-Net
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+
+from .semantic_fpn_wrapper import SemanticFPNWrapper
+
+
+class ConvKernelHead(nn.Module):
+
+    def __init__(self,
+                 num_proposals=100,
+                 in_channels=256,
+                 out_channels=256,
+                 num_heads=8,
+                 num_cls_fcs=1,
+                 num_seg_convs=1,
+                 num_loc_convs=1,
+                 att_dropout=False,
+                 conv_kernel_size=1,
+                 norm_cfg=dict(type='GN', num_groups=32),
+                 semantic_fpn=True,
+                 train_cfg=None,
+                 num_classes=80,
+                 xavier_init_kernel=False,
+                 kernel_init_std=0.01,
+                 use_binary=False,
+                 proposal_feats_with_obj=False,
+                 feat_downsample_stride=1,
+                 feat_refine_stride=1,
+                 feat_refine=True,
+                 with_embed=False,
+                 feat_embed_only=False,
+                 conv_normal_init=False,
+                 mask_out_stride=4,
+                 hard_target=False,
+                 num_thing_classes=80,
+                 num_stuff_classes=53,
+                 mask_assign_stride=4,
+                 ignore_label=255,
+                 thing_label_in_seg=0,
+                 cat_stuff_mask=False,
+                 **kwargs):
+        super(ConvKernelHead, self).__init__()
+        self.num_proposals = num_proposals
+        self.num_cls_fcs = num_cls_fcs
+        self.train_cfg = train_cfg
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_classes = num_classes
+        self.proposal_feats_with_obj = proposal_feats_with_obj
+        self.sampling = False
+        self.localization_fpn = SemanticFPNWrapper(
+            in_channels=256,
+            feat_channels=256,
+            out_channels=256,
+            start_level=0,
+            end_level=3,
+            upsample_times=2,
+            positional_encoding=dict(
+                type='SinePositionalEncoding', num_feats=128, normalize=True),
+            cat_coors=False,
+            cat_coors_level=3,
+            fuse_by_cat=False,
+            return_list=False,
+            num_aux_convs=1,
+            norm_cfg=dict(type='GN', num_groups=32, requires_grad=True))
+        self.semantic_fpn = semantic_fpn
+        self.norm_cfg = norm_cfg
+        self.num_heads = num_heads
+        self.att_dropout = att_dropout
+        self.mask_out_stride = mask_out_stride
+        self.hard_target = hard_target
+        self.conv_kernel_size = conv_kernel_size
+        self.xavier_init_kernel = xavier_init_kernel
+        self.kernel_init_std = kernel_init_std
+        self.feat_downsample_stride = feat_downsample_stride
+        self.feat_refine_stride = feat_refine_stride
+        self.conv_normal_init = conv_normal_init
+        self.feat_refine = feat_refine
+        self.with_embed = with_embed
+        self.feat_embed_only = feat_embed_only
+        self.num_loc_convs = num_loc_convs
+        self.num_seg_convs = num_seg_convs
+        self.use_binary = use_binary
+        self.num_thing_classes = num_thing_classes
+        self.num_stuff_classes = num_stuff_classes
+        self.mask_assign_stride = mask_assign_stride
+        self.ignore_label = ignore_label
+        self.thing_label_in_seg = thing_label_in_seg
+        self.cat_stuff_mask = cat_stuff_mask
+        self._init_layers()
+
+    def _init_layers(self):
+        """Initialize a sparse set of proposal boxes and proposal features."""
+        self.init_kernels = nn.Conv2d(
+            self.out_channels,
+            self.num_proposals,
+            self.conv_kernel_size,
+            padding=int(self.conv_kernel_size // 2),
+            bias=False)  # (N, C)
+
+        if self.semantic_fpn:
+            self.conv_seg = nn.Conv2d(self.out_channels, self.num_classes, 1)
+
+        if self.feat_downsample_stride > 1 and self.feat_refine:
+            self.ins_downsample = ConvModule(
+                self.in_channels,
+                self.out_channels,
+                3,
+                stride=self.feat_refine_stride,  # 2
+                padding=1,
+                norm_cfg=self.norm_cfg)
+            self.seg_downsample = ConvModule(
+                self.in_channels,
+                self.out_channels,
+                3,
+                stride=self.feat_refine_stride,  # 2
+                padding=1,
+                norm_cfg=self.norm_cfg)
+
+        self.loc_convs = nn.ModuleList()
+        for i in range(self.num_loc_convs):
+            self.loc_convs.append(
+                ConvModule(
+                    self.in_channels,
+                    self.out_channels,
+                    1,
+                    norm_cfg=self.norm_cfg))
+
+        self.seg_convs = nn.ModuleList()
+        for i in range(self.num_seg_convs):
+            self.seg_convs.append(
+                ConvModule(
+                    self.in_channels,
+                    self.out_channels,
+                    1,
+                    norm_cfg=self.norm_cfg))
+
+    def _decode_init_proposals(self, img, img_metas):
+        num_imgs = len(img_metas)
+
+        localization_feats = self.localization_fpn(img)
+
+        # thing branch
+        if isinstance(localization_feats, list):
+            loc_feats = localization_feats[0]
+        else:
+            loc_feats = localization_feats
+        for conv in self.loc_convs:
+            loc_feats = conv(loc_feats)
+        if self.feat_downsample_stride > 1 and self.feat_refine:
+            loc_feats = self.ins_downsample(loc_feats)
+
+        # init kernel prediction
+        mask_preds = self.init_kernels(loc_feats)
+
+        # stuff branch
+        if self.semantic_fpn:
+            if isinstance(localization_feats, list):
+                semantic_feats = localization_feats[1]
+            else:
+                semantic_feats = localization_feats
+            for conv in self.seg_convs:
+                semantic_feats = conv(semantic_feats)
+            if self.feat_downsample_stride > 1 and self.feat_refine:
+                semantic_feats = self.seg_downsample(semantic_feats)
+        else:
+            semantic_feats = None
+
+        if semantic_feats is not None:
+            seg_preds = self.conv_seg(semantic_feats)
+        else:
+            seg_preds = None
+
+        proposal_feats = self.init_kernels.weight.clone()
+        proposal_feats = proposal_feats[None].expand(num_imgs,
+                                                     *proposal_feats.size())
+
+        if semantic_feats is not None:
+            x_feats = semantic_feats + loc_feats
+        else:
+            x_feats = loc_feats
+
+        if self.proposal_feats_with_obj:
+            sigmoid_masks = mask_preds.sigmoid()
+            nonzero_inds = sigmoid_masks > 0.5
+            if self.use_binary:
+                sigmoid_masks = nonzero_inds.float()
+            else:
+                sigmoid_masks = nonzero_inds.float() * sigmoid_masks
+            obj_feats = torch.einsum('bnhw, bchw->bnc', sigmoid_masks, x_feats)
+
+        cls_scores = None
+
+        if self.proposal_feats_with_obj:  # important use
+            proposal_feats = proposal_feats + obj_feats.view(
+                num_imgs, self.num_proposals, self.out_channels, 1, 1)
+
+        if self.cat_stuff_mask and not self.training:
+            mask_preds = torch.cat(
+                [mask_preds, seg_preds[:, self.num_thing_classes:]], dim=1)
+            stuff_kernels = self.conv_seg.weight[self.
+                                                 num_thing_classes:].clone()
+            stuff_kernels = stuff_kernels[None].expand(num_imgs,
+                                                       *stuff_kernels.size())
+            proposal_feats = torch.cat([proposal_feats, stuff_kernels],
+                                       dim=1)  # (b, N_{st}+N_{th}, c)
+
+        return proposal_feats, x_feats, mask_preds, cls_scores, seg_preds
+
+    def simple_test_rpn(self, img, img_metas):
+        """Forward function in testing stage."""
+        return self._decode_init_proposals(img, img_metas)
+
+    def forward_dummy(self, img, img_metas):
+        """Dummy forward function.
+
+        Used in flops calculation.
+        """
+        return self._decode_init_proposals(img, img_metas)
diff --git a/modelscope/models/cv/video_panoptic_segmentation/head/kernel_iter_head.py b/modelscope/models/cv/video_panoptic_segmentation/head/kernel_iter_head.py
new file mode 100644
index 00000000..0eeb3e0a
--- /dev/null
+++ b/modelscope/models/cv/video_panoptic_segmentation/head/kernel_iter_head.py
@@ -0,0 +1,516 @@
+# The implementation is adopted from Video-K-Net,
+# made publicly available at https://github.com/lxtGH/Video-K-Net
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmdet.models.builder import HEADS, build_head
+from mmdet.models.roi_heads import BaseRoIHead
+
+from .kernel_update_head import VideoKernelUpdateHead
+from .kernel_updator import KernelUpdator
+
+
+@HEADS.register_module()
+class VideoKernelIterHead(BaseRoIHead):
+
+    def __init__(self,
+                 num_stages=6,
+                 recursive=False,
+                 assign_stages=5,
+                 stage_loss_weights=(1, 1, 1, 1, 1, 1),
+                 proposal_feature_channel=256,
+                 merge_cls_scores=False,
+                 do_panoptic=False,
+                 post_assign=False,
+                 hard_target=False,
+                 merge_joint=False,
+                 num_proposals=100,
+                 num_thing_classes=80,
+                 num_stuff_classes=53,
+                 mask_assign_stride=4,
+                 ignore_label=255,
+                 thing_label_in_seg=0,
+                 with_track=False,
+                 mask_head=dict(
+                     type='KernelUpdateHead',
+                     num_classes=80,
+                     num_fcs=2,
+                     num_heads=8,
+                     num_cls_fcs=1,
+                     num_reg_fcs=3,
+                     feedforward_channels=2048,
+                     hidden_channels=256,
+                     dropout=0.0,
+                     roi_feat_size=7,
+                     ffn_act_cfg=dict(type='ReLU', inplace=True)),
+                 mask_out_stride=4,
+                 train_cfg=None,
+                 test_cfg=None,
+                 **kwargs):
+        assert mask_head is not None
+        assert len(stage_loss_weights) == num_stages
+        self.num_stages = num_stages
+        self.stage_loss_weights = stage_loss_weights
+        self.proposal_feature_channel = proposal_feature_channel
+        self.merge_cls_scores = merge_cls_scores
+        self.recursive = recursive
+        self.post_assign = post_assign
+        self.mask_out_stride = mask_out_stride
+        self.hard_target = hard_target
+        self.merge_joint = merge_joint
+        self.assign_stages = assign_stages
+        self.do_panoptic = do_panoptic
+        self.num_thing_classes = num_thing_classes
+        self.num_stuff_classes = num_stuff_classes
+        self.mask_assign_stride = mask_assign_stride
+        self.thing_label_in_seg = thing_label_in_seg
+        self.num_proposals = num_proposals
+        self.ignore_label = ignore_label
+        self.with_track = with_track
+        super(VideoKernelIterHead, self).__init__(
+            mask_head=mask_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            **kwargs)
+
+    def init_bbox_head(self, mask_roi_extractor, mask_head):
+        """Initialize box head and box roi extractor.
+
+        Args:
+            mask_roi_extractor (dict): Config of box roi extractor.
+            mask_head (dict): Config of box in box head.
+        """
+        pass
+
+    def init_weights(self):
+        for i in range(self.num_stages):
+            self.mask_head[i].init_weights()
+
+    def init_assigner_sampler(self):
+        pass
+
+    def forward_train(self,
+                      x,
+                      proposal_feats,
+                      mask_preds,
+                      cls_score,
+                      img_metas,
+                      gt_masks,
+                      gt_labels,
+                      gt_pids=None,
+                      gt_bboxes_ignore=None,
+                      imgs_whwh=None,
+                      gt_bboxes=None,
+                      gt_sem_seg=None,
+                      gt_sem_cls=None):
+        pass
+
+    def init_mask_head(self, mask_roi_extractor, mask_head):
+        """Initialize mask head and mask roi extractor.
+
+        Args:
+            mask_roi_extractor (dict): Config of mask roi extractor.
+            mask_head (dict): Config of mask in mask head.
+        """
+        self.mask_head = nn.ModuleList()
+        if not isinstance(mask_head, list):
+            mask_head = [mask_head for _ in range(self.num_stages)]
+        assert len(mask_head) == self.num_stages
+        for head in mask_head:
+            self.mask_head.append(build_head(head))
+        if self.recursive:
+            for i in range(self.num_stages):
+                self.mask_head[i] = self.mask_head[0]
+
+    def _mask_forward(self,
+                      stage,
+                      x,
+                      object_feats,
+                      mask_preds,
+                      img_metas,
+                      previous_obj_feats=None,
+                      previous_mask_preds=None,
+                      previous_x_feats=None):
+        mask_head = self.mask_head[stage]
+        cls_score, mask_preds, object_feats, x_feats, object_feats_track = mask_head(
+            x,
+            object_feats,
+            mask_preds,
+            img_metas=img_metas,
+            previous_obj_feats=previous_obj_feats,
+            previous_mask_preds=previous_mask_preds,
+            previous_x_feats=previous_x_feats)
+        if mask_head.mask_upsample_stride > 1 and (stage == self.num_stages - 1
+                                                   or self.training):
+            scaled_mask_preds = F.interpolate(
+                mask_preds,
+                scale_factor=mask_head.mask_upsample_stride,
+                align_corners=False,
+                mode='bilinear')
+        else:
+            scaled_mask_preds = mask_preds
+        mask_results = dict(
+            cls_score=cls_score,
+            mask_preds=mask_preds,
+            scaled_mask_preds=scaled_mask_preds,
+            object_feats=object_feats,
+            object_feats_track=object_feats_track,
+            x_feats=x_feats,
+        )
+
+        return mask_results
+
+    def simple_test(self, x, proposal_feats, mask_preds, cls_score, img_metas):
+
+        # Decode initial proposals
+        num_imgs = len(img_metas)
+
+        object_feats = proposal_feats
+        for stage in range(self.num_stages):
+            mask_results = self._mask_forward(stage, x, object_feats,
+                                              mask_preds, img_metas)
+            object_feats = mask_results['object_feats']
+            cls_score = mask_results['cls_score']
+            mask_preds = mask_results['mask_preds']
+            scaled_mask_preds = mask_results['scaled_mask_preds']
+
+        num_classes = self.mask_head[-1].num_classes
+        results = []
+
+        if self.mask_head[-1].loss_cls.use_sigmoid:
+            cls_score = cls_score.sigmoid()
+        else:
+            cls_score = cls_score.softmax(-1)[..., :-1]
+
+        if self.do_panoptic:
+            for img_id in range(num_imgs):
+                single_result = self.get_panoptic(cls_score[img_id],
+                                                  scaled_mask_preds[img_id],
+                                                  self.test_cfg,
+                                                  img_metas[img_id],
+                                                  object_feats[img_id])
+                results.append(single_result)
+        else:
+            for img_id in range(num_imgs):
+                cls_score_per_img = cls_score[img_id]
+                scores_per_img, topk_indices = cls_score_per_img.flatten(
+                    0, 1).topk(
+                        self.test_cfg['max_per_img'], sorted=True)
+                mask_indices = topk_indices // num_classes
+                labels_per_img = topk_indices % num_classes
+                masks_per_img = scaled_mask_preds[img_id][mask_indices]
+                single_result = self.mask_head[-1].get_seg_masks(
+                    masks_per_img, labels_per_img, scores_per_img,
+                    self.test_cfg, img_metas[img_id])
+                results.append(single_result)
+
+        if self.with_track:
+            return results, object_feats, cls_score, mask_preds, scaled_mask_preds
+        else:
+            return results
+
+    def simple_test_with_previous(
+        self,
+        x,
+        proposal_feats,
+        mask_preds,
+        cls_score,
+        img_metas,
+        previous_obj_feats=None,
+        previous_mask_preds=None,
+        previous_x_feats=None,
+        is_first=False,
+    ):
+
+        # Decode initial proposals
+        num_imgs = len(img_metas)
+
+        object_feats = proposal_feats
+        for stage in range(self.num_stages):
+            # only link the last stage inputs
+            previous_obj_feats_cur = previous_obj_feats if stage == self.num_stages - 1 else None
+            previous_mask_preds_cur = previous_mask_preds if stage == self.num_stages - 1 else None
+            previous_x_feats_cur = previous_x_feats if stage == self.num_stages - 1 else None
+
+            mask_results = self._mask_forward(
+                stage,
+                x,
+                object_feats,
+                mask_preds,
+                img_metas,
+                previous_obj_feats=previous_obj_feats_cur,
+                previous_mask_preds=previous_mask_preds_cur,
+                previous_x_feats=previous_x_feats_cur)
+            object_feats = mask_results['object_feats']
+            cls_score = mask_results['cls_score']
+            mask_preds = mask_results['mask_preds']
+            scaled_mask_preds = mask_results['scaled_mask_preds']
+            object_feats_track = mask_results['object_feats_track']
+
+        num_classes = self.mask_head[-1].num_classes
+        results = []
+
+        if self.mask_head[-1].loss_cls.use_sigmoid:
+            cls_score = cls_score.sigmoid()
+        else:
+            cls_score = cls_score.softmax(-1)[..., :-1]
+
+        if is_first:
+            object_feats_track = object_feats
+
+        if self.do_panoptic:
+            for img_id in range(num_imgs):
+                single_result = self.get_panoptic(cls_score[img_id],
+                                                  scaled_mask_preds[img_id],
+                                                  self.test_cfg,
+                                                  img_metas[img_id],
+                                                  object_feats_track[img_id])
+                results.append(single_result)
+        else:
+            for img_id in range(num_imgs):
+                cls_score_per_img = cls_score[img_id]
+                scores_per_img, topk_indices = cls_score_per_img.flatten(
+                    0, 1).topk(
+                        self.test_cfg['max_per_img'], sorted=True)
+                mask_indices = topk_indices // num_classes
+                labels_per_img = topk_indices % num_classes
+                masks_per_img = scaled_mask_preds[img_id][mask_indices]
+                single_result = self.mask_head[-1].get_seg_masks(
+                    masks_per_img, labels_per_img, scores_per_img,
+                    self.test_cfg, img_metas[img_id])
+                results.append(single_result)
+
+        if self.with_track:
+            return results, object_feats, cls_score, mask_preds, scaled_mask_preds
+        else:
+            return results
+
+    def aug_test(self, features, proposal_list, img_metas, rescale=False):
+        raise NotImplementedError('SparseMask does not support `aug_test`')
+
+    def forward_dummy(self, x, proposal_boxes, proposal_feats, img_metas):
+        """Dummy forward function when do the flops computing."""
+        all_stage_mask_results = []
+        num_imgs = len(img_metas)
+        num_proposals = proposal_feats.size(1)
+        C, H, W = x.shape[-3:]
+        mask_preds = proposal_feats.bmm(x.view(num_imgs, C, -1)).view(
+            num_imgs, num_proposals, H, W)
+        object_feats = proposal_feats
+        for stage in range(self.num_stages):
+            mask_results = self._mask_forward(stage, x, object_feats,
+                                              mask_preds, img_metas)
+            all_stage_mask_results.append(mask_results)
+        return all_stage_mask_results
+
+    def get_panoptic(self,
+                     cls_scores,
+                     mask_preds,
+                     test_cfg,
+                     img_meta,
+                     obj_feat=None):
+        # resize mask predictions back
+        thing_scores = cls_scores[:self.num_proposals][:, :self.
+                                                       num_thing_classes]
+        thing_mask_preds = mask_preds[:self.num_proposals]
+        thing_scores, topk_indices = thing_scores.flatten(0, 1).topk(
+            self.test_cfg['max_per_img'], sorted=True)
+        mask_indices = topk_indices // self.num_thing_classes
+        thing_labels = topk_indices % self.num_thing_classes
+        masks_per_img = thing_mask_preds[mask_indices]
+        thing_masks = self.mask_head[-1].rescale_masks(masks_per_img, img_meta)
+
+        # thing obj_feat
+        thing_obj_feat = obj_feat[:self.num_proposals]
+        thing_obj_feat = thing_obj_feat[mask_indices]
+
+        if not self.merge_joint:
+            thing_masks = thing_masks > test_cfg['mask_thr']
+        bbox_result, segm_result, thing_mask_preds = self.mask_head[
+            -1].segm2result(thing_masks, thing_labels, thing_scores)
+
+        stuff_scores = cls_scores[
+            self.num_proposals:][:, self.num_thing_classes:].diag()
+        stuff_scores, stuff_inds = torch.sort(stuff_scores, descending=True)
+        stuff_masks = mask_preds[self.num_proposals:][stuff_inds]
+        stuff_masks = self.mask_head[-1].rescale_masks(stuff_masks, img_meta)
+
+        # stuff obj_feat
+        stuff_obj_feat = obj_feat[self.num_proposals:][stuff_inds]
+
+        if not self.merge_joint:
+            stuff_masks = stuff_masks > test_cfg['mask_thr']
+
+        if self.merge_joint:
+            stuff_labels = stuff_inds + self.num_thing_classes
+            panoptic_result, thing_obj_feat = self.merge_stuff_thing_stuff_joint(
+                thing_masks, thing_labels, thing_scores, stuff_masks,
+                stuff_labels, stuff_scores, test_cfg['merge_stuff_thing'],
+                thing_obj_feat, stuff_obj_feat)
+        else:
+            stuff_labels = stuff_inds + 1
+            panoptic_result, thing_obj_feat = self.merge_stuff_thing_thing_first(
+                thing_masks, thing_labels, thing_scores, stuff_masks,
+                stuff_labels, stuff_scores, test_cfg['merge_stuff_thing'],
+                thing_obj_feat, stuff_obj_feat)
+
+        return bbox_result, segm_result, thing_mask_preds, panoptic_result, thing_obj_feat
+
+    def merge_stuff_thing_thing_first(self,
+                                      thing_masks,
+                                      thing_labels,
+                                      thing_scores,
+                                      stuff_masks,
+                                      stuff_labels,
+                                      stuff_scores,
+                                      merge_cfg=None,
+                                      thing_obj_feat=None,
+                                      stuff_obj_feat=None):
+
+        H, W = thing_masks.shape[-2:]
+        panoptic_seg = thing_masks.new_zeros((H, W), dtype=torch.int32)
+        thing_masks = thing_masks.to(
+            dtype=torch.bool, device=panoptic_seg.device)
+        stuff_masks = stuff_masks.to(
+            dtype=torch.bool, device=panoptic_seg.device)
+
+        # sort instance outputs by scores
+        sorted_inds = torch.argsort(-thing_scores)
+        thing_obj_feat = thing_obj_feat[sorted_inds]
+        current_segment_id = 0
+        segments_info = []
+        instance_ids = []
+
+        # Add instances one-by-one, check for overlaps with existing ones
+        for inst_id in sorted_inds:
+            score = thing_scores[inst_id].item()
+            if score < merge_cfg['instance_score_thr']:
+                break
+            mask = thing_masks[inst_id]  # H,W
+            mask_area = mask.sum().item()
+
+            if mask_area == 0:
+                continue
+
+            intersect = (mask > 0) & (panoptic_seg > 0)
+            intersect_area = intersect.sum().item()
+
+            if intersect_area * 1.0 / mask_area > merge_cfg['iou_thr']:
+                continue
+
+            if intersect_area > 0:
+                mask = mask & (panoptic_seg == 0)
+
+            mask_area = mask.sum().item()
+            if mask_area == 0:
+                continue
+
+            current_segment_id += 1
+            panoptic_seg[mask.bool()] = current_segment_id
+            segments_info.append({
+                'id': current_segment_id,
+                'isthing': True,
+                'score': score,
+                'category_id': thing_labels[inst_id].item(),
+                'instance_id': inst_id.item(),
+            })
+            instance_ids.append(inst_id.item())
+
+        # Add semantic results to remaining empty areas
+        sorted_inds = torch.argsort(-stuff_scores)
+        sorted_stuff_labels = stuff_labels[sorted_inds]
+        # paste semantic masks following the order of scores
+        processed_label = []
+        for semantic_label in sorted_stuff_labels:
+            semantic_label = semantic_label.item()
+            if semantic_label in processed_label:
+                continue
+            processed_label.append(semantic_label)
+            sem_inds = stuff_labels == semantic_label
+            sem_masks = stuff_masks[sem_inds].sum(0).bool()
+            mask = sem_masks & (panoptic_seg == 0)
+            mask_area = mask.sum().item()
+            if mask_area < merge_cfg['stuff_max_area']:
+                continue
+
+            current_segment_id += 1
+            panoptic_seg[mask] = current_segment_id
+            segments_info.append({
+                'id': current_segment_id,
+                'isthing': False,
+                'category_id': semantic_label,
+                'area': mask_area,
+            })
+        return (panoptic_seg.cpu().numpy(),
+                segments_info), thing_obj_feat[instance_ids]
+
+    def merge_stuff_thing_stuff_joint(self,
+                                      thing_masks,
+                                      thing_labels,
+                                      thing_scores,
+                                      stuff_masks,
+                                      stuff_labels,
+                                      stuff_scores,
+                                      merge_cfg=None,
+                                      thing_obj=None,
+                                      stuff_obj=None):
+
+        H, W = thing_masks.shape[-2:]
+        panoptic_seg = thing_masks.new_zeros((H, W), dtype=torch.int32)
+
+        total_masks = torch.cat([thing_masks, stuff_masks], dim=0)
+        total_scores = torch.cat([thing_scores, stuff_scores], dim=0)
+        total_labels = torch.cat([thing_labels, stuff_labels], dim=0)
+        obj_fea = torch.cat([thing_obj, stuff_obj], dim=0)
+
+        cur_prob_masks = total_scores.view(-1, 1, 1) * total_masks
+        segments_info = []
+        cur_mask_ids = cur_prob_masks.argmax(0)
+
+        # sort instance outputs by scores
+        sorted_inds = torch.argsort(-total_scores)
+        current_segment_id = 0
+        sort_obj_fea = obj_fea
+        things_ids = []
+        for k in sorted_inds:
+            pred_class = total_labels[k].item()
+            isthing = pred_class < self.num_thing_classes
+            if isthing and total_scores[k] < merge_cfg['instance_score_thr']:
+                continue
+
+            mask = cur_mask_ids == k
+            mask_area = mask.sum().item()
+            original_area = (total_masks[k] >= 0.5).sum().item()
+
+            if mask_area > 0 and original_area > 0:
+                if mask_area / original_area < merge_cfg['overlap_thr']:
+                    continue
+                current_segment_id += 1
+
+                panoptic_seg[mask] = current_segment_id
+
+                if isthing:
+                    segments_info.append({
+                        'id': current_segment_id,
+                        'isthing': isthing,
+                        'score': total_scores[k].item(),
+                        'category_id': pred_class,  # 0, num_thing - 1
+                        'instance_id': k.item(),
+                    })
+                    things_ids.append(k.item())
+                else:
+                    segments_info.append({
+                        'id':
+                        current_segment_id,
+                        'isthing':
+                        isthing,
+                        'category_id':
+                        pred_class - self.num_thing_classes
+                        + 1,  # 1, num_stuff
+                        'area':
+                        mask_area,
+                    })
+
+        return (panoptic_seg.cpu().numpy(),
+                segments_info), sort_obj_fea[things_ids]
diff --git a/modelscope/models/cv/video_panoptic_segmentation/head/kernel_update_head.py b/modelscope/models/cv/video_panoptic_segmentation/head/kernel_update_head.py
new file mode 100644
index 00000000..af874021
--- /dev/null
+++ b/modelscope/models/cv/video_panoptic_segmentation/head/kernel_update_head.py
@@ -0,0 +1,664 @@
+# The implementation is adopted from Video-K-Net,
+# made publicly available at https://github.com/lxtGH/Video-K-Net
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import (ConvModule, bias_init_with_prob, build_activation_layer,
+                      build_norm_layer)
+from mmcv.cnn.bricks.transformer import (FFN, MultiheadAttention,
+                                         build_transformer_layer)
+from mmcv.runner import force_fp32
+from mmdet.core import multi_apply
+from mmdet.models.builder import HEADS, build_loss
+from mmdet.models.dense_heads.atss_head import reduce_mean
+from mmdet.models.losses import accuracy
+from mmdet.utils import get_root_logger
+
+from .mask import tensor_mask2box
+
+
+@HEADS.register_module()
+class VideoKernelUpdateHead(nn.Module):
+
+    def __init__(
+        self,
+        num_classes=80,
+        num_ffn_fcs=2,
+        num_heads=8,
+        num_cls_fcs=1,
+        num_mask_fcs=3,
+        feedforward_channels=2048,
+        in_channels=256,
+        out_channels=256,
+        dropout=0.0,
+        mask_thr=0.5,
+        act_cfg=dict(type='ReLU', inplace=True),
+        ffn_act_cfg=dict(type='ReLU', inplace=True),
+        conv_kernel_size=3,
+        feat_transform_cfg=None,
+        hard_mask_thr=0.5,
+        kernel_init=False,
+        with_ffn=True,
+        mask_out_stride=4,
+        relative_coors=False,
+        relative_coors_off=False,
+        feat_gather_stride=1,
+        mask_transform_stride=1,
+        mask_upsample_stride=1,
+        num_thing_classes=80,
+        num_stuff_classes=53,
+        mask_assign_stride=4,
+        ignore_label=255,
+        thing_label_in_seg=0,
+        previous=None,
+        previous_x_feat=None,
+        previous_link=None,  # seg/cls embeddings
+        previous_type=None,  # tracking embeddings
+        previous_detach=False,
+        previous_detach_link=False,  # whether detach linl query
+        previous_link_detach=False,
+        kernel_updator_cfg=dict(
+            type='DynamicConv',
+            in_channels=256,
+            feat_channels=64,
+            out_channels=256,
+            input_feat_shape=1,
+            act_cfg=dict(type='ReLU', inplace=True),
+            norm_cfg=dict(type='LN')),
+        loss_rank=None,
+        loss_mask=dict(
+            type='CrossEntropyLoss', use_mask=True, loss_weight=1.0),
+        loss_dice=dict(type='DiceLoss', loss_weight=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0)):
+        super(VideoKernelUpdateHead, self).__init__()
+        self.num_classes = num_classes
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_mask = build_loss(loss_mask)
+        self.loss_dice = build_loss(loss_dice)
+        if loss_rank is not None:
+            self.loss_rank = build_loss(loss_rank)
+        else:
+            self.loss_rank = loss_rank
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.mask_thr = mask_thr
+        self.fp16_enabled = False
+        self.dropout = dropout
+
+        self.num_heads = num_heads
+        self.hard_mask_thr = hard_mask_thr
+        self.kernel_init = kernel_init
+        self.with_ffn = with_ffn
+        self.mask_out_stride = mask_out_stride
+        self.relative_coors = relative_coors
+        self.relative_coors_off = relative_coors_off
+        self.conv_kernel_size = conv_kernel_size
+        self.feat_gather_stride = feat_gather_stride
+        self.mask_transform_stride = mask_transform_stride
+        self.mask_upsample_stride = mask_upsample_stride
+
+        self.num_thing_classes = num_thing_classes
+        self.num_stuff_classes = num_stuff_classes
+        self.mask_assign_stride = mask_assign_stride
+        self.ignore_label = ignore_label
+        self.thing_label_in_seg = thing_label_in_seg
+
+        self.attention = MultiheadAttention(in_channels * conv_kernel_size**2,
+                                            num_heads, dropout)
+        self.attention_norm = build_norm_layer(
+            dict(type='LN'), in_channels * conv_kernel_size**2)[1]
+
+        self.kernel_update_conv = build_transformer_layer(kernel_updator_cfg)
+
+        if feat_transform_cfg is not None:
+            kernel_size = feat_transform_cfg.pop('kernel_size', 1)
+            self.feat_transform = ConvModule(
+                in_channels,
+                in_channels,
+                kernel_size,
+                stride=feat_gather_stride,
+                padding=int(feat_gather_stride // 2),
+                **feat_transform_cfg)
+        else:
+            self.feat_transform = None
+
+        if self.with_ffn:
+            self.ffn = FFN(
+                in_channels,
+                feedforward_channels,
+                num_ffn_fcs,
+                act_cfg=ffn_act_cfg,
+                dropout=dropout)
+            self.ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1]
+
+        self.cls_fcs = nn.ModuleList()
+        for _ in range(num_cls_fcs):
+            self.cls_fcs.append(
+                nn.Linear(in_channels, in_channels, bias=False))
+            self.cls_fcs.append(
+                build_norm_layer(dict(type='LN'), in_channels)[1])
+            self.cls_fcs.append(build_activation_layer(act_cfg))
+
+        if self.loss_cls.use_sigmoid:
+            self.fc_cls = nn.Linear(in_channels, self.num_classes)
+        else:
+            self.fc_cls = nn.Linear(in_channels, self.num_classes + 1)
+
+        self.mask_fcs = nn.ModuleList()
+        for _ in range(num_mask_fcs):
+            self.mask_fcs.append(
+                nn.Linear(in_channels, in_channels, bias=False))
+            self.mask_fcs.append(
+                build_norm_layer(dict(type='LN'), in_channels)[1])
+            self.mask_fcs.append(build_activation_layer(act_cfg))
+
+        self.fc_mask = nn.Linear(in_channels, out_channels)
+
+        self.previous = previous
+        self.previous_type = previous_type
+        self.previous_link = previous_link
+        self.previous_x_feat = previous_x_feat
+        self.previous_detach = previous_detach
+        self.previous_detach_link = previous_detach_link
+        self.previous_link_detach = previous_link_detach
+
+        if self.previous is not None:
+            _in_channels = self.in_channels
+            _conv_kernel_size = self.conv_kernel_size
+            _num_head = 8
+            _dropout = 0.
+            # tracking embedding
+            if self.previous_type == 'ffn':
+                self.attention_previous = MultiheadAttention(
+                    _in_channels * _conv_kernel_size**2,
+                    _num_head,
+                    _dropout,
+                )
+                _, self.attention_previous_norm = build_norm_layer(
+                    dict(type='LN'), _in_channels * _conv_kernel_size**2)
+                # add link ffn
+                self.link_ffn = FFN(
+                    in_channels,
+                    feedforward_channels,
+                    num_ffn_fcs,
+                    act_cfg=ffn_act_cfg,
+                    dropout=dropout)
+                self.link_ffn_norm = build_norm_layer(
+                    dict(type='LN'), in_channels)[1]
+
+            elif self.previous_type == 'update' or self.previous_type == 'update_obj':
+
+                self.attention_previous_update_track = build_transformer_layer(
+                    kernel_updator_cfg)
+
+                self.attention_previous_track = MultiheadAttention(
+                    _in_channels * _conv_kernel_size**2,
+                    _num_head,
+                    _dropout,
+                )
+                _, self.attention_previous_norm_track = build_norm_layer(
+                    dict(type='LN'), _in_channels * _conv_kernel_size**2)
+                # add link ffn
+                self.link_ffn_track = FFN(
+                    in_channels,
+                    feedforward_channels,
+                    num_ffn_fcs,
+                    act_cfg=ffn_act_cfg,
+                    dropout=dropout)
+                self.link_ffn_norm_track = build_norm_layer(
+                    dict(type='LN'), in_channels)[1]
+
+            # seg and cls embedding Link
+            if self.previous_link == 'update_dynamic_cov':
+                _in_channels = self.in_channels
+                _conv_kernel_size = self.conv_kernel_size
+                _num_head = 8
+                _dropout = 0.
+                self.attention_previous_update_link = build_transformer_layer(
+                    kernel_updator_cfg)
+                self.attention_previous_link = MultiheadAttention(
+                    _in_channels * _conv_kernel_size**2,
+                    _num_head,
+                    _dropout,
+                )
+                _, self.attention_previous_norm_link = build_norm_layer(
+                    dict(type='LN'), _in_channels * _conv_kernel_size**2)
+                # add link ffn
+                self.link_ffn_link = FFN(
+                    in_channels,
+                    feedforward_channels,
+                    num_ffn_fcs,
+                    act_cfg=ffn_act_cfg,
+                    dropout=dropout)
+                self.link_ffn_norm_link = build_norm_layer(
+                    dict(type='LN'), in_channels)[1]
+
+            elif self.previous_link == 'link_atten':
+                _in_channels = self.in_channels
+                _conv_kernel_size = self.conv_kernel_size
+                _num_head = 8
+                _dropout = 0.
+                self.attention_previous_link = MultiheadAttention(
+                    _in_channels * _conv_kernel_size**2,
+                    _num_head,
+                    _dropout,
+                )
+                _, self.attention_previous_norm_link = build_norm_layer(
+                    dict(type='LN'), _in_channels * _conv_kernel_size**2)
+                # add link ffn
+                self.link_ffn_link = FFN(
+                    in_channels,
+                    feedforward_channels,
+                    num_ffn_fcs,
+                    act_cfg=ffn_act_cfg,
+                    dropout=dropout)
+                self.link_ffn_norm_link = build_norm_layer(
+                    dict(type='LN'), in_channels)[1]
+
+    def forward(self,
+                x,
+                proposal_feat,
+                mask_preds,
+                prev_cls_score=None,
+                mask_shape=None,
+                img_metas=None,
+                previous_obj_feats=None,
+                previous_mask_preds=None,
+                previous_x_feats=None):
+
+        N, num_proposals = proposal_feat.shape[:2]
+        if self.feat_transform is not None:
+            x = self.feat_transform(x)
+            if previous_x_feats is not None:
+                previous_x_feats = self.feat_transform(previous_x_feats)
+        C, H, W = x.shape[-3:]
+
+        mask_h, mask_w = mask_preds.shape[-2:]
+        if mask_h != H or mask_w != W:
+            gather_mask = F.interpolate(
+                mask_preds, (H, W), align_corners=False, mode='bilinear')
+        else:
+            gather_mask = mask_preds
+
+        sigmoid_masks = gather_mask.sigmoid()
+        nonzero_inds = sigmoid_masks > self.hard_mask_thr
+        sigmoid_masks = nonzero_inds.float()
+
+        # einsum is faster than bmm by 30%
+        x_feat = torch.einsum('bnhw,bchw->bnc', sigmoid_masks, x)
+
+        # obj_feat in shape [B, N, C, K, K] -> [B, N, C, K*K] -> [B, N, K*K, C]
+        proposal_feat = proposal_feat.reshape(N, num_proposals,
+                                              self.in_channels,
+                                              -1).permute(0, 1, 3, 2)
+
+        # whether to detach the previous outputs
+        if self.training and self.previous_detach:
+            previous_obj_feats = previous_obj_feats.detach()
+
+        # update previous with link object query
+        if previous_obj_feats is not None and self.previous_link == 'update_dynamic_cov':
+            previous_obj_feats_link = previous_obj_feats.reshape(
+                N, num_proposals, self.in_channels, -1).permute(0, 1, 3, 2)
+
+            if self.training and self.previous_detach_link:
+                previous_obj_feats_link = previous_obj_feats_link.detach()
+
+            previous_obj_feats_update = self.attention_previous_update_link(
+                x_feat, previous_obj_feats_link)
+
+            previous_obj_feats_update = previous_obj_feats_update.reshape(
+                N, num_proposals, -1).permute(1, 0, 2)
+            cur_obj_feat = proposal_feat.reshape(N, num_proposals, self.in_channels * self.conv_kernel_size ** 2). \
+                permute(1, 0, 2)
+            cur_obj_feat = self.attention_previous_norm_link(
+                self.attention_previous_link(
+                    query=cur_obj_feat,
+                    key=previous_obj_feats_update,
+                    value=previous_obj_feats_update,
+                    identity=cur_obj_feat), )
+            cur_obj_feat = cur_obj_feat.permute(1, 0, 2)
+            cur_obj_feat = cur_obj_feat.reshape(N, num_proposals, -1,
+                                                self.in_channels)
+            # pre_obj_feat in shape [B, N, K*K*C] -> [B, N, K*K, C]
+            proposal_feat = self.link_ffn_norm_link(
+                self.link_ffn_link(cur_obj_feat))
+
+        if previous_obj_feats is not None and self.previous_link == 'link_atten':
+            previous_obj_feats_link = previous_obj_feats.reshape(
+                N, num_proposals, self.in_channels, -1).permute(0, 1, 3, 2)
+
+            previous_obj_feats_update = previous_obj_feats_link.reshape(
+                N, num_proposals, -1).permute(1, 0, 2)
+            cur_obj_feat = proposal_feat.reshape(N, num_proposals, self.in_channels * self.conv_kernel_size ** 2). \
+                permute(1, 0, 2)
+            cur_obj_feat = self.attention_previous_norm_link(
+                self.attention_previous_link(
+                    query=cur_obj_feat,
+                    key=previous_obj_feats_update,
+                    value=previous_obj_feats_update,
+                    identity=cur_obj_feat), )
+            cur_obj_feat = cur_obj_feat.permute(1, 0, 2)
+            cur_obj_feat = cur_obj_feat.reshape(N, num_proposals, -1,
+                                                self.in_channels)
+            # pre_obj_feat in shape [B, N, K*K*C] -> [B, N, K*K, C]
+            proposal_feat = self.link_ffn_norm_link(
+                self.link_ffn_link(cur_obj_feat))
+
+        # update current
+        obj_feat = self.kernel_update_conv(x_feat, proposal_feat)
+
+        # [B, N, K*K, C] -> [B, N, K*K*C] -> [N, B, K*K*C]
+        obj_feat = obj_feat.reshape(N, num_proposals, -1).permute(1, 0, 2)
+        obj_feat = self.attention_norm(self.attention(obj_feat))
+        # [N, B, K*K*C] -> [B, N, K*K*C]
+        obj_feat = obj_feat.permute(1, 0, 2)
+
+        # obj_feat in shape [B, N, K*K*C] -> [B, N, K*K, C]
+        obj_feat = obj_feat.reshape(N, num_proposals, -1, self.in_channels)
+
+        # FFN
+        if self.with_ffn:
+            obj_feat = self.ffn_norm(self.ffn(obj_feat))
+
+        # For Tracking Parts
+        # Link previous and cur if previous obj feat is Not None
+        if previous_obj_feats is not None:
+            # previous_obj_feats (b, n, c, k, k) -> (b,n,c,k*k) -> (b,,n, k*k, c)
+            # permute to correct dimension
+
+            if self.previous_type == 'ffn':
+                previous_obj_feats = previous_obj_feats.reshape(
+                    N, num_proposals, self.in_channels,
+                    -1).permute(0, 1, 3, 2)
+                cur_obj_feat = obj_feat.reshape(N, num_proposals, self.in_channels * self.conv_kernel_size ** 2). \
+                    permute(1, 0, 2)
+                previous_obj_feats = previous_obj_feats.reshape(
+                    N, num_proposals,
+                    self.in_channels * self.conv_kernel_size**2).permute(
+                        1, 0, 2)
+
+                previous_obj_feat = self.attention_previous_norm(
+                    self.attention_previous(
+                        query=cur_obj_feat,
+                        key=previous_obj_feats,
+                        value=previous_obj_feats,
+                        identity=cur_obj_feat), )
+                previous_obj_feat = previous_obj_feat.permute(1, 0, 2)
+                previous_obj_feat_track = previous_obj_feat.reshape(
+                    N, num_proposals, -1, self.in_channels)
+                # pre_obj_feat in shape [B, N, K*K*C] -> [B, N, K*K, C]
+                previous_obj_feat_track = self.link_ffn_norm(
+                    self.link_ffn(previous_obj_feat_track))
+
+            elif self.previous_type == 'update':
+                # not work
+                previous_obj_feats = previous_obj_feats.reshape(
+                    N, num_proposals, self.in_channels,
+                    -1).permute(0, 1, 3, 2)
+                previous_obj_feats_track = self.attention_previous_update_track(
+                    x_feat, previous_obj_feats)
+
+                previous_obj_feats_track = previous_obj_feats_track.reshape(
+                    N, num_proposals, self.in_channels,
+                    -1).permute(0, 1, 3, 2)
+                cur_obj_feat = obj_feat.reshape(N, num_proposals, self.in_channels * self.conv_kernel_size ** 2). \
+                    permute(1, 0, 2)
+                previous_obj_feats_track = previous_obj_feats_track.reshape(
+                    N, num_proposals,
+                    self.in_channels * self.conv_kernel_size**2).permute(
+                        1, 0, 2)
+
+                previous_obj_feats_track = self.attention_previous_norm_track(
+                    self.attention_previous_track(
+                        query=cur_obj_feat,
+                        key=previous_obj_feats_track,
+                        value=previous_obj_feats_track,
+                        identity=cur_obj_feat), )
+                previous_obj_feats_track = previous_obj_feats_track.permute(
+                    1, 0, 2)
+                previous_obj_feats_track = previous_obj_feats_track.reshape(
+                    N, num_proposals, -1, self.in_channels)
+                # pre_obj_feat in shape [B, N, K*K*C] -> [B, N, K*K, C]
+                previous_obj_feat_track = self.link_ffn_norm_track(
+                    self.link_ffn_track(previous_obj_feats_track))
+
+            elif self.previous_type == 'update_obj':
+                # not work
+                previous_obj_feats = previous_obj_feats.reshape(
+                    N, num_proposals, self.in_channels,
+                    -1).permute(0, 1, 3, 2)
+                previous_obj_feats_track = self.attention_previous_update_track(
+                    obj_feat.squeeze(2), previous_obj_feats)
+
+                previous_obj_feats_track = previous_obj_feats_track.reshape(
+                    N, num_proposals, self.in_channels,
+                    -1).permute(0, 1, 3, 2)
+                cur_obj_feat = obj_feat.reshape(N, num_proposals, self.in_channels * self.conv_kernel_size ** 2). \
+                    permute(1, 0, 2)
+                previous_obj_feats_track = previous_obj_feats_track.reshape(
+                    N, num_proposals,
+                    self.in_channels * self.conv_kernel_size**2).permute(
+                        1, 0, 2)
+
+                previous_obj_feats_track = self.attention_previous_norm_track(
+                    self.attention_previous_track(
+                        query=cur_obj_feat,
+                        key=previous_obj_feats_track,
+                        value=previous_obj_feats_track,
+                        identity=cur_obj_feat), )
+                previous_obj_feats_track = previous_obj_feats_track.permute(
+                    1, 0, 2)
+                previous_obj_feats_track = previous_obj_feats_track.reshape(
+                    N, num_proposals, -1, self.in_channels)
+                # pre_obj_feat in shape [B, N, K*K*C] -> [B, N, K*K, C]
+                previous_obj_feat_track = self.link_ffn_norm_track(
+                    self.link_ffn_track(previous_obj_feats_track))
+            else:
+                previous_obj_feat_track = None
+
+        cls_feat = obj_feat.sum(-2)
+        mask_feat = obj_feat
+
+        for cls_layer in self.cls_fcs:
+            cls_feat = cls_layer(cls_feat)
+        for reg_layer in self.mask_fcs:
+            mask_feat = reg_layer(mask_feat)
+
+        cls_score = self.fc_cls(cls_feat).view(N, num_proposals, -1)
+        # [B, N, K*K, C] -> [B, N, C, K*K]
+        mask_feat = self.fc_mask(mask_feat).permute(0, 1, 3, 2)
+
+        if (self.mask_transform_stride == 2 and self.feat_gather_stride == 1):
+            mask_x = F.interpolate(
+                x, scale_factor=0.5, mode='bilinear', align_corners=False)
+            H, W = mask_x.shape[-2:]
+        else:
+            mask_x = x
+        # [B, N, C, K*K] -> [B*N, C, K, K]
+        mask_feat = mask_feat.reshape(N, num_proposals, C,
+                                      self.conv_kernel_size,
+                                      self.conv_kernel_size)
+        # [B, C, H, W] -> [1, B*C, H, W]
+        new_mask_preds = []
+        for i in range(N):
+            new_mask_preds.append(
+                F.conv2d(
+                    mask_x[i:i + 1],
+                    mask_feat[i],
+                    padding=int(self.conv_kernel_size // 2)))
+
+        new_mask_preds = torch.cat(new_mask_preds, dim=0)
+        new_mask_preds = new_mask_preds.reshape(N, num_proposals, H, W)
+        if self.mask_transform_stride == 2:
+            new_mask_preds = F.interpolate(
+                new_mask_preds,
+                scale_factor=2,
+                mode='bilinear',
+                align_corners=False)
+
+        if mask_shape is not None and mask_shape[0] != H:
+            new_mask_preds = F.interpolate(
+                new_mask_preds,
+                mask_shape,
+                align_corners=False,
+                mode='bilinear')
+
+        if previous_obj_feats is not None and previous_obj_feat_track is not None:
+            obj_feat = obj_feat.permute(0, 1, 3,
+                                        2).reshape(N, num_proposals,
+                                                   self.in_channels,
+                                                   self.conv_kernel_size,
+                                                   self.conv_kernel_size)
+            previous_obj_feat_track = previous_obj_feat_track.permute(
+                0, 1, 3,
+                2).reshape(N, num_proposals, self.in_channels,
+                           self.conv_kernel_size, self.conv_kernel_size)
+            return cls_score, new_mask_preds, obj_feat, x_feat, previous_obj_feat_track
+        else:
+            return cls_score, new_mask_preds, obj_feat.permute(
+                0, 1, 3, 2).reshape(N, num_proposals, self.in_channels,
+                                    self.conv_kernel_size,
+                                    self.conv_kernel_size), x_feat, None
+
+    def _get_target_single(self, pos_inds, neg_inds, pos_mask, neg_mask,
+                           pos_gt_mask, pos_gt_labels, gt_sem_seg, gt_sem_cls,
+                           cfg):
+
+        num_pos = pos_mask.size(0)
+        num_neg = neg_mask.size(0)
+        num_samples = num_pos + num_neg
+        H, W = pos_mask.shape[-2:]
+        # original implementation uses new_zeros since BG are set to be 0
+        # now use empty & fill because BG cat_id = num_classes,
+        # FG cat_id = [0, num_classes-1]
+        labels = pos_mask.new_full((num_samples, ),
+                                   self.num_classes,
+                                   dtype=torch.long)
+        label_weights = pos_mask.new_zeros((num_samples, self.num_classes))
+        mask_targets = pos_mask.new_zeros(num_samples, H, W)
+        mask_weights = pos_mask.new_zeros(num_samples, H, W)
+        if num_pos > 0:
+            labels[pos_inds] = pos_gt_labels
+            pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
+            label_weights[pos_inds] = pos_weight
+            pos_mask_targets = pos_gt_mask
+            mask_targets[pos_inds, ...] = pos_mask_targets
+            mask_weights[pos_inds, ...] = 1
+
+        if num_neg > 0:
+            label_weights[neg_inds] = 1.0
+
+        if gt_sem_cls is not None and gt_sem_seg is not None:
+            sem_labels = pos_mask.new_full((self.num_stuff_classes, ),
+                                           self.num_classes,
+                                           dtype=torch.long)
+            sem_targets = pos_mask.new_zeros(self.num_stuff_classes, H, W)
+            sem_weights = pos_mask.new_zeros(self.num_stuff_classes, H, W)
+            sem_stuff_weights = torch.eye(
+                self.num_stuff_classes, device=pos_mask.device)
+            sem_thing_weights = pos_mask.new_zeros(
+                (self.num_stuff_classes, self.num_thing_classes))
+            sem_label_weights = torch.cat(
+                [sem_thing_weights, sem_stuff_weights], dim=-1)
+            if len(gt_sem_cls > 0):
+                sem_inds = gt_sem_cls - self.num_thing_classes
+                sem_inds = sem_inds.long()
+                sem_labels[sem_inds] = gt_sem_cls.long()
+                sem_targets[sem_inds] = gt_sem_seg
+                sem_weights[sem_inds] = 1
+
+            label_weights[:, self.num_thing_classes:] = 0
+            labels = torch.cat([labels, sem_labels])
+            label_weights = torch.cat([label_weights, sem_label_weights])
+            mask_targets = torch.cat([mask_targets, sem_targets])
+            mask_weights = torch.cat([mask_weights, sem_weights])
+
+        return labels, label_weights, mask_targets, mask_weights
+
+    def get_targets(self,
+                    sampling_results,
+                    gt_mask,
+                    gt_labels,
+                    rcnn_train_cfg,
+                    concat=True,
+                    gt_sem_seg=None,
+                    gt_sem_cls=None):
+
+        pos_inds_list = [res.pos_inds for res in sampling_results]
+        neg_inds_list = [res.neg_inds for res in sampling_results]
+        pos_mask_list = [res.pos_masks for res in sampling_results]
+        neg_mask_list = [res.neg_masks for res in sampling_results]
+        pos_gt_mask_list = [res.pos_gt_masks for res in sampling_results]
+        pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results]
+        if gt_sem_seg is None:
+            gt_sem_seg = [None] * 2
+            gt_sem_cls = [None] * 2
+
+        labels, label_weights, mask_targets, mask_weights = multi_apply(
+            self._get_target_single,
+            pos_inds_list,
+            neg_inds_list,
+            pos_mask_list,
+            neg_mask_list,
+            pos_gt_mask_list,
+            pos_gt_labels_list,
+            gt_sem_seg,
+            gt_sem_cls,
+            cfg=rcnn_train_cfg)
+        if concat:
+            labels = torch.cat(labels, 0)
+            label_weights = torch.cat(label_weights, 0)
+            mask_targets = torch.cat(mask_targets, 0)
+            mask_weights = torch.cat(mask_weights, 0)
+        return labels, label_weights, mask_targets, mask_weights
+
+    def rescale_masks(self, masks_per_img, img_meta):
+        h, w, _ = img_meta['img_shape']
+        masks_per_img = F.interpolate(
+            masks_per_img.unsqueeze(0).sigmoid(),
+            size=img_meta['batch_input_shape'],
+            mode='bilinear',
+            align_corners=False)
+
+        masks_per_img = masks_per_img[:, :, :h, :w]
+        ori_shape = img_meta['ori_shape']
+        seg_masks = F.interpolate(
+            masks_per_img,
+            size=ori_shape[:2],
+            mode='bilinear',
+            align_corners=False).squeeze(0)
+        return seg_masks
+
+    def get_seg_masks(self, masks_per_img, labels_per_img, scores_per_img,
+                      test_cfg, img_meta):
+        # resize mask predictions back
+        seg_masks = self.rescale_masks(masks_per_img, img_meta)
+        seg_masks = seg_masks > test_cfg.mask_thr
+        bbox_result, segm_result, mask_preds = self.segm2result(
+            seg_masks, labels_per_img, scores_per_img)
+        return bbox_result, segm_result, mask_preds
+
+    def segm2result(self, mask_preds, det_labels, cls_scores):
+        num_classes = self.num_classes
+        # bbox_result = None
+        segm_result = [[] for _ in range(num_classes)]
+        det_labels = det_labels.cpu().numpy()
+        cls_scores = cls_scores.cpu().numpy()
+        num_ins = mask_preds.shape[0]
+        # fake bboxes mask to bboxes
+        bboxes = np.zeros((num_ins, 5), dtype=np.float32)
+        bboxes[:, -1] = cls_scores
+        bboxes[:, :4] = np.array(tensor_mask2box(mask_preds).clip(min=0))
+
+        for idx in range(num_ins):
+            segm_result[det_labels[idx]].append(mask_preds[idx])
+        return bboxes, segm_result, mask_preds
diff --git a/modelscope/models/cv/video_panoptic_segmentation/head/kernel_updator.py b/modelscope/models/cv/video_panoptic_segmentation/head/kernel_updator.py
new file mode 100644
index 00000000..e5ee1362
--- /dev/null
+++ b/modelscope/models/cv/video_panoptic_segmentation/head/kernel_updator.py
@@ -0,0 +1,96 @@
+# The implementation is adopted from Video-K-Net,
+# made publicly available at https://github.com/lxtGH/Video-K-Net
+
+import torch.nn as nn
+from mmcv.cnn import build_activation_layer, build_norm_layer
+from mmcv.cnn.bricks.transformer import TRANSFORMER_LAYER
+
+
+@TRANSFORMER_LAYER.register_module()
+class KernelUpdator(nn.Module):
+
+    def __init__(self,
+                 in_channels=256,
+                 feat_channels=64,
+                 out_channels=None,
+                 input_feat_shape=3,
+                 gate_sigmoid=True,
+                 gate_norm_act=False,
+                 activate_out=False,
+                 act_cfg=dict(type='ReLU', inplace=True),
+                 norm_cfg=dict(type='LN')):
+        super(KernelUpdator, self).__init__()
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.out_channels_raw = out_channels
+        self.gate_sigmoid = gate_sigmoid
+        self.gate_norm_act = gate_norm_act
+        self.activate_out = activate_out
+        if isinstance(input_feat_shape, int):
+            input_feat_shape = [input_feat_shape] * 2
+        self.input_feat_shape = input_feat_shape
+        self.act_cfg = act_cfg
+        self.norm_cfg = norm_cfg
+        self.out_channels = out_channels if out_channels else in_channels
+
+        self.num_params_in = self.feat_channels
+        self.num_params_out = self.feat_channels
+        self.dynamic_layer = nn.Linear(
+            self.in_channels, self.num_params_in + self.num_params_out)
+        self.input_layer = nn.Linear(self.in_channels,
+                                     self.num_params_in + self.num_params_out,
+                                     1)
+        self.input_gate = nn.Linear(self.in_channels, self.feat_channels, 1)
+        self.update_gate = nn.Linear(self.in_channels, self.feat_channels, 1)
+        if self.gate_norm_act:
+            self.gate_norm = build_norm_layer(norm_cfg, self.feat_channels)[1]
+
+        self.norm_in = build_norm_layer(norm_cfg, self.feat_channels)[1]
+        self.norm_out = build_norm_layer(norm_cfg, self.feat_channels)[1]
+        self.input_norm_in = build_norm_layer(norm_cfg, self.feat_channels)[1]
+        self.input_norm_out = build_norm_layer(norm_cfg, self.feat_channels)[1]
+
+        self.activation = build_activation_layer(act_cfg)
+
+        self.fc_layer = nn.Linear(self.feat_channels, self.out_channels, 1)
+        self.fc_norm = build_norm_layer(norm_cfg, self.out_channels)[1]
+
+    def forward(self, update_feature, input_feature):
+        update_feature = update_feature.reshape(-1, self.in_channels)
+        num_proposals = update_feature.size(0)
+        parameters = self.dynamic_layer(update_feature)
+        param_in = parameters[:, :self.num_params_in].view(
+            -1, self.feat_channels)
+        param_out = parameters[:, -self.num_params_out:].view(
+            -1, self.feat_channels)
+
+        input_feats = self.input_layer(
+            input_feature.reshape(num_proposals, -1, self.feat_channels))
+        input_in = input_feats[..., :self.num_params_in]
+        input_out = input_feats[..., -self.num_params_out:]
+
+        gate_feats = input_in * param_in.unsqueeze(-2)
+        if self.gate_norm_act:
+            gate_feats = self.activation(self.gate_norm(gate_feats))
+
+        input_gate = self.input_norm_in(self.input_gate(gate_feats))
+        update_gate = self.norm_in(self.update_gate(gate_feats))
+        if self.gate_sigmoid:
+            input_gate = input_gate.sigmoid()
+            update_gate = update_gate.sigmoid()
+        param_out = self.norm_out(param_out)
+        input_out = self.input_norm_out(input_out)
+
+        if self.activate_out:
+            param_out = self.activation(param_out)
+            input_out = self.activation(input_out)
+
+        # param_out has shape (batch_size, feat_channels, out_channels)
+        features = update_gate * param_out.unsqueeze(
+            -2) + input_gate * input_out
+
+        features = self.fc_layer(features)
+        features = self.fc_norm(features)
+        features = self.activation(features)
+
+        return features
diff --git a/modelscope/models/cv/video_panoptic_segmentation/head/mask.py b/modelscope/models/cv/video_panoptic_segmentation/head/mask.py
new file mode 100644
index 00000000..5cba620f
--- /dev/null
+++ b/modelscope/models/cv/video_panoptic_segmentation/head/mask.py
@@ -0,0 +1,233 @@
+# The implementation is adopted from Video-K-Net,
+# made publicly available at https://github.com/lxtGH/Video-K-Net
+
+from __future__ import absolute_import, division, print_function
+
+import cv2
+import numpy as np
+import pycocotools.mask as mask_utils
+import torch
+
+
+def coords2bbox(coords, extend=2):
+    """
+    INPUTS:
+     - coords: coordinates of pixels in the next frame
+    """
+    center = torch.mean(coords, dim=0)  # b * 2
+    center = center.view(1, 2)
+    center_repeat = center.repeat(coords.size(0), 1)
+
+    dis_x = torch.sqrt(torch.pow(coords[:, 0] - center_repeat[:, 0], 2))
+    dis_x = max(torch.mean(dis_x, dim=0).detach(), 1)
+    dis_y = torch.sqrt(torch.pow(coords[:, 1] - center_repeat[:, 1], 2))
+    dis_y = max(torch.mean(dis_y, dim=0).detach(), 1)
+
+    left = center[:, 0] - dis_x * extend
+    right = center[:, 0] + dis_x * extend
+    top = center[:, 1] - dis_y * extend
+    bottom = center[:, 1] + dis_y * extend
+
+    return (top.item(), left.item(), bottom.item(), right.item())
+
+
+def coords2bbox_all(coords):
+    left = coords[:, 0].min().item()
+    top = coords[:, 1].min().item()
+    right = coords[:, 0].max().item()
+    bottom = coords[:, 1].max().item()
+    return top, left, bottom, right
+
+
+def coords2bboxTensor(coords, extend=2):
+    """
+    INPUTS:
+     - coords: coordinates of pixels in the next frame
+    """
+    center = torch.mean(coords, dim=0)  # b * 2
+    center = center.view(1, 2)
+    center_repeat = center.repeat(coords.size(0), 1)
+
+    dis_x = torch.sqrt(torch.pow(coords[:, 0] - center_repeat[:, 0], 2))
+    dis_x = max(torch.mean(dis_x, dim=0).detach(), 1)
+    dis_y = torch.sqrt(torch.pow(coords[:, 1] - center_repeat[:, 1], 2))
+    dis_y = max(torch.mean(dis_y, dim=0).detach(), 1)
+
+    left = center[:, 0] - dis_x * extend
+    right = center[:, 0] + dis_x * extend
+    top = center[:, 1] - dis_y * extend
+    bottom = center[:, 1] + dis_y * extend
+
+    return torch.Tensor([top.item(),
+                         left.item(),
+                         bottom.item(),
+                         right.item()]).to(coords.device)
+
+
+def mask2box(masks):
+    boxes = []
+    for mask in masks:
+        m = mask[0].nonzero().float()
+        if m.numel() > 0:
+            box = coords2bbox(m, extend=2)
+        else:
+            box = (-1, -1, 10, 10)
+        boxes.append(box)
+    return np.asarray(boxes)
+
+
+def tensor_mask2box(masks):
+    boxes = []
+    for mask in masks:
+        m = mask.nonzero().float()
+        if m.numel() > 0:
+            box = coords2bbox_all(m)
+        else:
+            box = (-1, -1, 10, 10)
+        boxes.append(box)
+    return np.asarray(boxes)
+
+
+def batch_mask2boxlist(masks):
+    """
+    Args:
+        masks: Tensor b,n,h,w
+
+    Returns: List[List[box]]
+
+    """
+    batch_bbox = []
+    for i, b_masks in enumerate(masks):
+        boxes = []
+        for mask in b_masks:
+            m = mask.nonzero().float()
+            if m.numel() > 0:
+                box = coords2bboxTensor(m, extend=2)
+            else:
+                box = torch.Tensor([0, 0, 0, 0]).to(m.device)
+            boxes.append(box.unsqueeze(0))
+        boxes_t = torch.cat(boxes, 0)
+        batch_bbox.append(boxes_t)
+
+    return batch_bbox
+
+
+def bboxlist2roi(bbox_list):
+    """Convert a list of bboxes to roi format.
+
+    Args:
+        bbox_list (list[Tensor]): a list of bboxes corresponding to a batch
+            of images.
+
+    Returns:
+        Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2]
+    """
+    rois_list = []
+    for img_id, bboxes in enumerate(bbox_list):
+        if bboxes.size(0) > 0:
+            img_inds = bboxes.new_full((bboxes.size(0), 1), img_id)
+            rois = torch.cat([img_inds, bboxes[:, :4]], dim=-1)
+        else:
+            rois = bboxes.new_zeros((0, 5))
+        rois_list.append(rois)
+    rois = torch.cat(rois_list, 0)
+    return rois
+
+
+def bbox2roi(bbox_list):
+    """Convert a list of bboxes to roi format.
+
+    Args:
+        bbox_list (list[Tensor]): a list of bboxes corresponding to a batch
+            of images.
+
+    Returns:
+        Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2]
+    """
+    rois_list = []
+    for img_id, bboxes in enumerate(bbox_list):
+        if bboxes.size(0) > 0:
+            img_inds = bboxes.new_full((bboxes.size(0), 1), img_id)
+            rois = torch.cat([img_inds, bboxes[:, :4]], dim=-1)
+        else:
+            rois = bboxes.new_zeros((0, 5))
+        rois_list.append(rois)
+    rois = torch.cat(rois_list, 0)
+    return rois
+
+
+def temp_interp_mask(maskseq, T):
+    '''
+    maskseq: list of elements (RLE_mask, timestamp)
+    return list of RLE_mask, length of list is T
+    '''
+    size = maskseq[0][0]['size']
+    blank_mask = np.asfortranarray(np.zeros(size).astype(np.uint8))
+    blank_mask = mask_utils.encode(blank_mask)
+    blank_mask['counts'] = blank_mask['counts'].decode('ascii')
+    ret = [
+        blank_mask,
+    ] * T
+    for m, t in maskseq:
+        ret[t] = m
+    return ret
+
+
+def mask_seq_jac(sa, sb):
+    j = np.zeros((len(sa), len(sb)))
+    for ia, a in enumerate(sa):
+        for ib, b in enumerate(sb):
+            ious = [
+                mask_utils.iou([at], [bt], [
+                    False,
+                ]) for (at, bt) in zip(a, b)
+            ]
+            tiou = np.mean(ious)
+            j[ia, ib] = tiou
+    return j
+
+
+def skltn2mask(skltn, size):
+    h, w = size
+    mask = np.zeros((h, w))
+
+    dskltn = dict()
+    for s in skltn:
+        dskltn[s['id'][0]] = (int(s['x'][0]), int(s['y'][0]))
+    if len(dskltn) == 0:
+        return mask
+    trunk_polygon = list()
+    for k in np.array([3, 4, 10, 13, 9]) - 1:
+        p = dskltn.get(k, None)
+        if p is not None:
+            trunk_polygon.append(p)
+    trunk_polygon = np.asarray(trunk_polygon, 'int32')
+    if len(trunk_polygon) > 2:
+        cv2.fillConvexPoly(mask, trunk_polygon, 1)
+
+    xmin = np.min([dskltn[k][0] for k in dskltn])
+    xmax = np.max([dskltn[k][0] for k in dskltn])
+    ymin = np.min([dskltn[k][1] for k in dskltn])
+    ymax = np.max([dskltn[k][1] for k in dskltn])
+    line_width = np.max([int(np.max([xmax - xmin, ymax - ymin, 0]) / 20), 8])
+
+    skeleton = [[10, 11], [11, 12], [9, 8], [8, 7], [10, 13], [9, 13],
+                [13, 15], [10, 4], [4, 5], [5, 6], [9, 3], [3, 2], [2, 1]]
+
+    for sk in skeleton:
+        st = dskltn.get(sk[0] - 1, None)
+        ed = dskltn.get(sk[1] - 1, None)
+        if st is None or ed is None:
+            continue
+        cv2.line(mask, st, ed, color=1, thickness=line_width)
+
+    return mask
+
+
+def pts2array(pts):
+    arr = np.zeros((15, 3))
+    for s in pts:
+        arr[s['id'][0]][0] = int(s['x'][0])
+        arr[s['id'][0]][1] = int(s['y'][0])
+        arr[s['id'][0]][2] = s['score'][0]
+    return arr
diff --git a/modelscope/models/cv/video_panoptic_segmentation/head/semantic_fpn_wrapper.py b/modelscope/models/cv/video_panoptic_segmentation/head/semantic_fpn_wrapper.py
new file mode 100644
index 00000000..0cf487b8
--- /dev/null
+++ b/modelscope/models/cv/video_panoptic_segmentation/head/semantic_fpn_wrapper.py
@@ -0,0 +1,221 @@
+# The implementation is adopted from Video-K-Net,
+# made publicly available at https://github.com/lxtGH/Video-K-Net
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, normal_init
+from mmcv.cnn.bricks.transformer import build_positional_encoding
+
+
+class SemanticFPNWrapper(nn.Module):
+    """
+    Implementation of Semantic FPN used in Panoptic FPN.
+
+    Args:
+        in_channels ([type]): [description]
+        feat_channels ([type]): [description]
+        out_channels ([type]): [description]
+        start_level ([type]): [description]
+        end_level ([type]): [description]
+        cat_coors (bool, optional): [description]. Defaults to False.
+        fuse_by_cat (bool, optional): [description]. Defaults to False.
+        conv_cfg ([type], optional): [description]. Defaults to None.
+        norm_cfg ([type], optional): [description]. Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 feat_channels,
+                 out_channels,
+                 start_level,
+                 end_level,
+                 cat_coors=False,
+                 positional_encoding=None,
+                 cat_coors_level=3,
+                 fuse_by_cat=False,
+                 return_list=False,
+                 upsample_times=3,
+                 with_pred=True,
+                 num_aux_convs=0,
+                 act_cfg=dict(type='ReLU', inplace=True),
+                 out_act_cfg=dict(type='ReLU'),
+                 conv_cfg=None,
+                 norm_cfg=None):
+        super(SemanticFPNWrapper, self).__init__()
+
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.start_level = start_level
+        self.end_level = end_level
+        assert start_level >= 0 and end_level >= start_level
+        self.out_channels = out_channels
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.cat_coors = cat_coors
+        self.cat_coors_level = cat_coors_level
+        self.fuse_by_cat = fuse_by_cat
+        self.return_list = return_list
+        self.upsample_times = upsample_times
+        self.with_pred = with_pred
+        if positional_encoding is not None:
+            self.positional_encoding = build_positional_encoding(
+                positional_encoding)
+        else:
+            self.positional_encoding = None
+
+        self.convs_all_levels = nn.ModuleList()
+        for i in range(self.start_level, self.end_level + 1):
+            convs_per_level = nn.Sequential()
+            if i == 0:
+                if i == self.cat_coors_level and self.cat_coors:
+                    chn = self.in_channels + 2
+                else:
+                    chn = self.in_channels
+                if upsample_times == self.end_level - i:
+                    one_conv = ConvModule(
+                        chn,
+                        self.feat_channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg,
+                        inplace=False)
+                    convs_per_level.add_module('conv' + str(i), one_conv)
+                else:
+                    for i in range(self.end_level - upsample_times):
+                        one_conv = ConvModule(
+                            chn,
+                            self.feat_channels,
+                            3,
+                            padding=1,
+                            stride=2,
+                            conv_cfg=self.conv_cfg,
+                            norm_cfg=self.norm_cfg,
+                            act_cfg=self.act_cfg,
+                            inplace=False)
+                        convs_per_level.add_module('conv' + str(i), one_conv)
+                self.convs_all_levels.append(convs_per_level)
+                continue
+
+            for j in range(i):
+                if j == 0:
+                    if i == self.cat_coors_level and self.cat_coors:
+                        chn = self.in_channels + 2
+                    else:
+                        chn = self.in_channels
+                    one_conv = ConvModule(
+                        chn,
+                        self.feat_channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg,
+                        inplace=False)
+                    convs_per_level.add_module('conv' + str(j), one_conv)
+                    if j < upsample_times - (self.end_level - i):
+                        one_upsample = nn.Upsample(
+                            scale_factor=2,
+                            mode='bilinear',
+                            align_corners=False)
+                        convs_per_level.add_module('upsample' + str(j),
+                                                   one_upsample)
+                    continue
+
+                one_conv = ConvModule(
+                    self.feat_channels,
+                    self.feat_channels,
+                    3,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    act_cfg=self.act_cfg,
+                    inplace=False)
+                convs_per_level.add_module('conv' + str(j), one_conv)
+                if j < upsample_times - (self.end_level - i):
+                    one_upsample = nn.Upsample(
+                        scale_factor=2, mode='bilinear', align_corners=False)
+                    convs_per_level.add_module('upsample' + str(j),
+                                               one_upsample)
+
+            self.convs_all_levels.append(convs_per_level)
+
+        if fuse_by_cat:
+            in_channels = self.feat_channels * len(self.convs_all_levels)
+        else:
+            in_channels = self.feat_channels
+
+        if self.with_pred:
+            self.conv_pred = ConvModule(
+                in_channels,
+                self.out_channels,
+                1,
+                padding=0,
+                conv_cfg=self.conv_cfg,
+                act_cfg=out_act_cfg,
+                norm_cfg=self.norm_cfg)
+
+        self.num_aux_convs = num_aux_convs
+        self.aux_convs = nn.ModuleList()
+        for i in range(num_aux_convs):
+            self.aux_convs.append(
+                ConvModule(
+                    in_channels,
+                    self.out_channels,
+                    1,
+                    padding=0,
+                    conv_cfg=self.conv_cfg,
+                    act_cfg=out_act_cfg,
+                    norm_cfg=self.norm_cfg))
+
+    def generate_coord(self, input_feat):
+        x_range = torch.linspace(
+            -1, 1, input_feat.shape[-1], device=input_feat.device)
+        y_range = torch.linspace(
+            -1, 1, input_feat.shape[-2], device=input_feat.device)
+        y, x = torch.meshgrid(y_range, x_range)
+        y = y.expand([input_feat.shape[0], 1, -1, -1])
+        x = x.expand([input_feat.shape[0], 1, -1, -1])
+        coord_feat = torch.cat([x, y], 1)
+        return coord_feat
+
+    def forward(self, inputs):
+        mlvl_feats = []
+        for i in range(self.start_level, self.end_level + 1):
+            input_p = inputs[i]
+            if i == self.cat_coors_level:
+                if self.positional_encoding is not None:
+                    ignore_mask = input_p.new_zeros(
+                        (input_p.shape[0], input_p.shape[-2],
+                         input_p.shape[-1]),
+                        dtype=torch.bool)
+                    positional_encoding = self.positional_encoding(ignore_mask)
+                    input_p = input_p + positional_encoding
+                if self.cat_coors:
+                    coord_feat = self.generate_coord(input_p)
+                    input_p = torch.cat([input_p, coord_feat], 1)
+
+            mlvl_feats.append(self.convs_all_levels[i](input_p))
+
+        if self.fuse_by_cat:
+            feature_add_all_level = torch.cat(mlvl_feats, dim=1)
+        else:
+            feature_add_all_level = sum(mlvl_feats)
+
+        if self.with_pred:
+            out = self.conv_pred(feature_add_all_level)
+        else:
+            out = feature_add_all_level
+
+        if self.num_aux_convs > 0:
+            outs = [out]
+            for conv in self.aux_convs:
+                outs.append(conv(feature_add_all_level))
+            return outs
+
+        if self.return_list:
+            return [out]
+        else:
+            return out
diff --git a/modelscope/models/cv/video_panoptic_segmentation/head/track_heads.py b/modelscope/models/cv/video_panoptic_segmentation/head/track_heads.py
new file mode 100644
index 00000000..761ea63d
--- /dev/null
+++ b/modelscope/models/cv/video_panoptic_segmentation/head/track_heads.py
@@ -0,0 +1,154 @@
+# The implementation is adopted from Video-K-Net,
+# made publicly available at https://github.com/lxtGH/Video-K-Net
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+
+
+def cal_similarity(key_embeds,
+                   ref_embeds,
+                   method='dot_product',
+                   temperature=-1):
+    assert method in ['dot_product', 'cosine']
+
+    if key_embeds.size(0) == 0 or ref_embeds.size(0) == 0:
+        return torch.zeros((key_embeds.size(0), ref_embeds.size(0)),
+                           device=key_embeds.device)
+
+    if method == 'cosine':
+        key_embeds = F.normalize(key_embeds, p=2, dim=1)
+        ref_embeds = F.normalize(ref_embeds, p=2, dim=1)
+        return torch.mm(key_embeds, ref_embeds.t())
+    elif method == 'dot_product':
+        if temperature > 0:
+            dists = cal_similarity(key_embeds, ref_embeds, method='cosine')
+            dists /= temperature
+            return dists
+        else:
+            return torch.mm(key_embeds, ref_embeds.t())
+
+
+class QuasiDenseMaskEmbedHeadGTMask(nn.Module):
+
+    def __init__(
+        self,
+        num_convs=4,
+        num_fcs=1,
+        roi_feat_size=7,
+        in_channels=256,
+        conv_out_channels=256,
+        fc_out_channels=1024,
+        embed_channels=256,
+        conv_cfg=None,
+        norm_cfg=None,
+        softmax_temp=-1,
+    ):
+        super(QuasiDenseMaskEmbedHeadGTMask, self).__init__()
+        self.num_convs = num_convs
+        self.num_fcs = num_fcs
+        self.roi_feat_size = roi_feat_size
+        self.in_channels = in_channels
+        self.conv_out_channels = conv_out_channels
+        self.fc_out_channels = fc_out_channels
+        self.embed_channels = embed_channels
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.relu = nn.ReLU(inplace=True)
+        self.convs, self.fcs, last_layer_dim = self._add_conv_fc_branch(
+            self.num_convs, self.num_fcs, self.in_channels)
+        self.fc_embed = nn.Linear(last_layer_dim, embed_channels)
+
+        self.softmax_temp = softmax_temp
+
+    def _add_conv_fc_branch(self, num_convs, num_fcs, in_channels):
+        last_layer_dim = in_channels
+        # add branch specific conv layers
+        convs = nn.ModuleList()
+        if num_convs > 0:
+            for i in range(num_convs):
+                conv_in_channels = (
+                    last_layer_dim if i == 0 else self.conv_out_channels)
+                convs.append(
+                    ConvModule(
+                        conv_in_channels,
+                        self.conv_out_channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg))
+            last_layer_dim = self.conv_out_channels
+        # add branch specific fc layers
+        fcs = nn.ModuleList()
+        if num_fcs > 0:
+            last_layer_dim *= (self.roi_feat_size * self.roi_feat_size)
+            for i in range(num_fcs):
+                fc_in_channels = (
+                    last_layer_dim if i == 0 else self.fc_out_channels)
+                fcs.append(nn.Linear(fc_in_channels, self.fc_out_channels))
+            last_layer_dim = self.fc_out_channels
+        return convs, fcs, last_layer_dim
+
+    def init_weights(self):
+        for m in self.fcs:
+            if isinstance(m, nn.Linear):
+                nn.init.xavier_uniform_(m.weight)
+                nn.init.constant_(m.bias, 0)
+        nn.init.normal_(self.fc_embed.weight, 0, 0.01)
+        nn.init.constant_(self.fc_embed.bias, 0)
+
+    def forward(self, x):
+        if self.num_convs > 0:
+            for i, conv in enumerate(self.convs):
+                x = conv(x)
+        x = x.view(x.size(0), -1)
+
+        if self.num_fcs > 0:
+            for i, fc in enumerate(self.fcs):
+                x = self.relu(fc(x))
+        x = self.fc_embed(x)
+        return x
+
+    def get_track_targets(self, gt_match_indices, key_sampling_results,
+                          ref_sampling_results):
+        track_targets = []
+        track_weights = []
+        for _gt_match_indices, key_res, ref_res in zip(gt_match_indices,
+                                                       key_sampling_results,
+                                                       ref_sampling_results):
+            targets = _gt_match_indices.new_zeros(
+                (key_res.pos_masks.size(0), ref_res.pos_masks.size(0)),
+                dtype=torch.int)
+            _match_indices = _gt_match_indices[key_res.pos_assigned_gt_inds]
+            pos2pos = (_match_indices.view(
+                -1, 1) == ref_res.pos_assigned_gt_inds.view(1, -1)).int()
+            targets[:, :pos2pos.size(1)] = pos2pos
+            weights = (targets.sum(dim=1) > 0).float()
+            track_targets.append(targets)
+            track_weights.append(weights)
+        return track_targets, track_weights
+
+    def match(self, key_embeds, ref_embeds, key_sampling_results,
+              ref_sampling_results):
+        num_key_rois = [res.pos_masks.size(0) for res in key_sampling_results]
+        key_embeds = torch.split(key_embeds, num_key_rois)
+        num_ref_rois = [res.pos_masks.size(0) for res in ref_sampling_results]
+        ref_embeds = torch.split(ref_embeds, num_ref_rois)
+
+        dists, cos_dists = [], []
+        for key_embed, ref_embed in zip(key_embeds, ref_embeds):
+            dist = cal_similarity(
+                key_embed,
+                ref_embed,
+                method='dot_product',
+                temperature=self.softmax_temp)
+            dists.append(dist)
+            if self.loss_track_aux is not None:
+                cos_dist = cal_similarity(
+                    key_embed, ref_embed, method='cosine')
+                cos_dists.append(cos_dist)
+            else:
+                cos_dists.append(None)
+        return dists, cos_dists
diff --git a/modelscope/models/cv/video_panoptic_segmentation/neck/__init__.py b/modelscope/models/cv/video_panoptic_segmentation/neck/__init__.py
new file mode 100644
index 00000000..b937315b
--- /dev/null
+++ b/modelscope/models/cv/video_panoptic_segmentation/neck/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
diff --git a/modelscope/models/cv/video_panoptic_segmentation/neck/fpn.py b/modelscope/models/cv/video_panoptic_segmentation/neck/fpn.py
new file mode 100644
index 00000000..c0c49971
--- /dev/null
+++ b/modelscope/models/cv/video_panoptic_segmentation/neck/fpn.py
@@ -0,0 +1,153 @@
+# The implementation is adopted from Video-K-Net,
+# made publicly available at https://github.com/lxtGH/Video-K-Net
+
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule
+
+
+class FPN(BaseModule):
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs,
+                 start_level=0,
+                 end_level=-1,
+                 add_extra_convs=False,
+                 relu_before_extra_convs=False,
+                 no_norm_on_lateral=False,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 act_cfg=None,
+                 upsample_cfg=dict(mode='nearest'),
+                 init_cfg=dict(
+                     type='Xavier', layer='Conv2d', distribution='uniform')):
+        super(FPN, self).__init__(init_cfg)
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.num_outs = num_outs
+        self.relu_before_extra_convs = relu_before_extra_convs
+        self.no_norm_on_lateral = no_norm_on_lateral
+        self.fp16_enabled = False
+        self.upsample_cfg = upsample_cfg.copy()
+
+        if end_level == -1 or end_level == self.num_ins - 1:
+            self.backbone_end_level = self.num_ins
+            assert num_outs >= self.num_ins - start_level
+        else:
+            # if end_level is not the last level, no extra level is allowed
+            self.backbone_end_level = end_level + 1
+            assert end_level < self.num_ins
+            assert num_outs == end_level - start_level + 1
+        self.start_level = start_level
+        self.end_level = end_level
+        self.add_extra_convs = add_extra_convs
+        assert isinstance(add_extra_convs, (str, bool))
+        if isinstance(add_extra_convs, str):
+            # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
+            assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
+        elif add_extra_convs:  # True
+            self.add_extra_convs = 'on_input'
+
+        self.lateral_convs = nn.ModuleList()
+        self.fpn_convs = nn.ModuleList()
+
+        for i in range(self.start_level, self.backbone_end_level):
+            l_conv = ConvModule(
+                in_channels[i],
+                out_channels,
+                1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
+                act_cfg=act_cfg,
+                inplace=False)
+            fpn_conv = ConvModule(
+                out_channels,
+                out_channels,
+                3,
+                padding=1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg,
+                inplace=False)
+
+            self.lateral_convs.append(l_conv)
+            self.fpn_convs.append(fpn_conv)
+
+        # add extra conv layers (e.g., RetinaNet)
+        extra_levels = num_outs - self.backbone_end_level + self.start_level
+        if self.add_extra_convs and extra_levels >= 1:
+            for i in range(extra_levels):
+                if i == 0 and self.add_extra_convs == 'on_input':
+                    in_channels = self.in_channels[self.backbone_end_level - 1]
+                else:
+                    in_channels = out_channels
+                extra_fpn_conv = ConvModule(
+                    in_channels,
+                    out_channels,
+                    3,
+                    stride=2,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg,
+                    inplace=False)
+                self.fpn_convs.append(extra_fpn_conv)
+
+    def forward(self, inputs):
+        """Forward function."""
+        assert len(inputs) == len(self.in_channels)
+
+        # build laterals
+        laterals = [
+            lateral_conv(inputs[i + self.start_level])
+            for i, lateral_conv in enumerate(self.lateral_convs)
+        ]
+
+        # build top-down path
+        used_backbone_levels = len(laterals)
+        for i in range(used_backbone_levels - 1, 0, -1):
+            # In some cases, fixing `scale factor` (e.g. 2) is preferred, but
+            #  it cannot co-exist with `size` in `F.interpolate`.
+            if 'scale_factor' in self.upsample_cfg:
+                # fix runtime error of "+=" inplace operation in PyTorch 1.10
+                laterals[i - 1] = laterals[i - 1] + F.interpolate(
+                    laterals[i], **self.upsample_cfg)
+            else:
+                prev_shape = laterals[i - 1].shape[2:]
+                laterals[i - 1] = laterals[i - 1] + F.interpolate(
+                    laterals[i], size=prev_shape, **self.upsample_cfg)
+
+        # build outputs
+        # part 1: from original levels
+        outs = [
+            self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
+        ]
+        # part 2: add extra levels
+        if self.num_outs > len(outs):
+            # use max pool to get more levels on top of outputs
+            # (e.g., Faster R-CNN, Mask R-CNN)
+            if not self.add_extra_convs:
+                for i in range(self.num_outs - used_backbone_levels):
+                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
+            # add conv layers on top of original feature maps (RetinaNet)
+            else:
+                if self.add_extra_convs == 'on_input':
+                    extra_source = inputs[self.backbone_end_level - 1]
+                elif self.add_extra_convs == 'on_lateral':
+                    extra_source = laterals[-1]
+                elif self.add_extra_convs == 'on_output':
+                    extra_source = outs[-1]
+                else:
+                    raise NotImplementedError
+                outs.append(self.fpn_convs[used_backbone_levels](extra_source))
+                for i in range(used_backbone_levels + 1, self.num_outs):
+                    if self.relu_before_extra_convs:
+                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
+                    else:
+                        outs.append(self.fpn_convs[i](outs[-1]))
+        return tuple(outs)
diff --git a/modelscope/models/cv/video_panoptic_segmentation/track/__init__.py b/modelscope/models/cv/video_panoptic_segmentation/track/__init__.py
new file mode 100644
index 00000000..b937315b
--- /dev/null
+++ b/modelscope/models/cv/video_panoptic_segmentation/track/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
diff --git a/modelscope/models/cv/video_panoptic_segmentation/track/quasi_dense_embed_tracker.py b/modelscope/models/cv/video_panoptic_segmentation/track/quasi_dense_embed_tracker.py
new file mode 100644
index 00000000..5b0cd787
--- /dev/null
+++ b/modelscope/models/cv/video_panoptic_segmentation/track/quasi_dense_embed_tracker.py
@@ -0,0 +1,217 @@
+# The implementation is adopted from Video-K-Net,
+# made publicly available at https://github.com/lxtGH/Video-K-Net
+
+import torch
+import torch.nn.functional as F
+from mmcv.cnn import build_model_from_cfg as build
+from mmcv.utils import Registry
+from mmdet.core import bbox_overlaps
+
+TRACKERS = Registry('tracker')
+
+
+def build_tracker(cfg):
+    """Build tracker."""
+    return build(cfg, TRACKERS)
+
+
+@TRACKERS.register_module()
+class QuasiDenseEmbedTracker(object):
+
+    def __init__(self,
+                 init_score_thr=0.8,
+                 obj_score_thr=0.5,
+                 match_score_thr=0.5,
+                 memo_tracklet_frames=10,
+                 memo_backdrop_frames=1,
+                 memo_momentum=0.8,
+                 nms_conf_thr=0.5,
+                 nms_backdrop_iou_thr=0.3,
+                 nms_class_iou_thr=0.7,
+                 with_cats=True,
+                 match_metric='bisoftmax'):
+        assert 0 <= memo_momentum <= 1.0
+        assert memo_tracklet_frames >= 0
+        assert memo_backdrop_frames >= 0
+        self.init_score_thr = init_score_thr
+        self.obj_score_thr = obj_score_thr
+        self.match_score_thr = match_score_thr
+        self.memo_tracklet_frames = memo_tracklet_frames
+        self.memo_backdrop_frames = memo_backdrop_frames
+        self.memo_momentum = memo_momentum
+        self.nms_conf_thr = nms_conf_thr
+        self.nms_backdrop_iou_thr = nms_backdrop_iou_thr
+        self.nms_class_iou_thr = nms_class_iou_thr
+        self.with_cats = with_cats
+        assert match_metric in ['bisoftmax', 'softmax', 'cosine']
+        self.match_metric = match_metric
+
+        self.num_tracklets = 0
+        self.tracklets = dict()
+        self.backdrops = []
+
+    @property
+    def empty(self):
+        return False if self.tracklets else True
+
+    def update_memo(self, ids, bboxes, embeds, labels, frame_id):
+        tracklet_inds = ids > -1
+
+        # update memo
+        for id, bbox, embed, label in zip(ids[tracklet_inds],
+                                          bboxes[tracklet_inds],
+                                          embeds[tracklet_inds],
+                                          labels[tracklet_inds]):
+            id = int(id)
+            if id in self.tracklets.keys():
+                velocity = (bbox - self.tracklets[id]['bbox']) / (
+                    frame_id - self.tracklets[id]['last_frame'])
+                self.tracklets[id]['bbox'] = bbox
+                self.tracklets[id]['embed'] = (
+                    1 - self.memo_momentum
+                ) * self.tracklets[id]['embed'] + self.memo_momentum * embed
+                self.tracklets[id]['last_frame'] = frame_id
+                self.tracklets[id]['label'] = label
+                self.tracklets[id]['velocity'] = (
+                    self.tracklets[id]['velocity']
+                    * self.tracklets[id]['acc_frame'] + velocity) / (
+                        self.tracklets[id]['acc_frame'] + 1)
+                self.tracklets[id]['acc_frame'] += 1
+            else:
+                self.tracklets[id] = dict(
+                    bbox=bbox,
+                    embed=embed,
+                    label=label,
+                    last_frame=frame_id,
+                    velocity=torch.zeros_like(bbox),
+                    acc_frame=0)
+
+        backdrop_inds = torch.nonzero(ids == -1, as_tuple=False).squeeze(1)
+        ious = bbox_overlaps(bboxes[backdrop_inds, :-1], bboxes[:, :-1])
+        for i, ind in enumerate(backdrop_inds):
+            if (ious[i, :ind] > self.nms_backdrop_iou_thr).any():
+                backdrop_inds[i] = -1
+        backdrop_inds = backdrop_inds[backdrop_inds > -1]
+
+        self.backdrops.insert(
+            0,
+            dict(
+                bboxes=bboxes[backdrop_inds],
+                embeds=embeds[backdrop_inds],
+                labels=labels[backdrop_inds]))
+
+        # pop memo
+        invalid_ids = []
+        for k, v in self.tracklets.items():
+            if frame_id - v['last_frame'] >= self.memo_tracklet_frames:
+                invalid_ids.append(k)
+        for invalid_id in invalid_ids:
+            self.tracklets.pop(invalid_id)
+
+        if len(self.backdrops) > self.memo_backdrop_frames:
+            self.backdrops.pop()
+
+    @property
+    def memo(self):
+        memo_embeds = []
+        memo_ids = []
+        memo_bboxes = []
+        memo_labels = []
+        memo_vs = []
+        for k, v in self.tracklets.items():
+            memo_bboxes.append(v['bbox'][None, :])
+            memo_embeds.append(v['embed'][None, :])
+            memo_ids.append(k)
+            memo_labels.append(v['label'].view(1, 1))
+            memo_vs.append(v['velocity'][None, :])
+        memo_ids = torch.tensor(memo_ids, dtype=torch.long).view(1, -1)
+
+        for backdrop in self.backdrops:
+            backdrop_ids = torch.full((1, backdrop['embeds'].size(0)),
+                                      -1,
+                                      dtype=torch.long)
+            backdrop_vs = torch.zeros_like(backdrop['bboxes'])
+            memo_bboxes.append(backdrop['bboxes'])
+            memo_embeds.append(backdrop['embeds'])
+            memo_ids = torch.cat([memo_ids, backdrop_ids], dim=1)
+            memo_labels.append(backdrop['labels'][:, None])
+            memo_vs.append(backdrop_vs)
+
+        memo_bboxes = torch.cat(memo_bboxes, dim=0)
+        memo_embeds = torch.cat(memo_embeds, dim=0)
+        memo_labels = torch.cat(memo_labels, dim=0).squeeze(1)
+        memo_vs = torch.cat(memo_vs, dim=0)
+        return memo_bboxes, memo_labels, memo_embeds, memo_ids.squeeze(
+            0), memo_vs
+
+    def match(self, bboxes, labels, track_feats, frame_id, asso_tau=-1):
+
+        _, inds = bboxes[:, -1].sort(descending=True)
+        bboxes = bboxes[inds, :]
+        labels = labels[inds]
+        embeds = track_feats[inds, :]
+
+        # hack we do not consider the nms since we use
+        # # duplicate removal for potential backdrops and cross classes
+        valids = bboxes.new_ones((bboxes.size(0)))
+        ious = bbox_overlaps(bboxes[:, :-1], bboxes[:, :-1])
+        for i in range(1, bboxes.size(0)):
+            thr = self.nms_backdrop_iou_thr if bboxes[
+                i, -1] < self.obj_score_thr else self.nms_class_iou_thr
+            if (ious[i, :i] > thr).any():
+                valids[i] = 0
+        valids = valids == 1
+        bboxes = bboxes[valids, :]
+        labels = labels[valids]
+        embeds = embeds[valids, :]
+
+        # init ids container
+        ids = torch.full((bboxes.size(0), ), -1, dtype=torch.long)
+
+        # match if buffer is not empty
+        if bboxes.size(0) > 0 and not self.empty:
+            (memo_bboxes, memo_labels, memo_embeds, memo_ids,
+             memo_vs) = self.memo
+
+            if self.match_metric == 'bisoftmax':
+                feats = torch.mm(embeds, memo_embeds.t())
+                d2t_scores = feats.softmax(dim=1)
+                t2d_scores = feats.softmax(dim=0)
+                scores = (d2t_scores + t2d_scores) / 2
+            elif self.match_metric == 'softmax':
+                feats = torch.mm(embeds, memo_embeds.t())
+                scores = feats.softmax(dim=1)
+            elif self.match_metric == 'cosine':
+                scores = torch.mm(
+                    F.normalize(embeds, p=2, dim=1),
+                    F.normalize(memo_embeds, p=2, dim=1).t())
+            else:
+                raise NotImplementedError
+
+            if self.with_cats:
+                cat_same = labels.view(-1, 1) == memo_labels.view(1, -1)
+                scores *= cat_same.float().to(scores.device)
+
+            for i in range(bboxes.size(0)):
+                conf, memo_ind = torch.max(scores[i, :], dim=0)
+                id = memo_ids[memo_ind]
+                if conf > self.match_score_thr:
+                    if id > -1:
+                        if bboxes[i, -1] > self.obj_score_thr:
+                            ids[i] = id
+                            scores[:i, memo_ind] = 0
+                            scores[i + 1:, memo_ind] = 0
+                        else:
+                            if conf > self.nms_conf_thr:
+                                ids[i] = -2
+        new_inds = (ids == -1) & (bboxes[:, 4] > self.init_score_thr).cpu()
+        num_news = new_inds.sum()
+        ids[new_inds] = torch.arange(
+            self.num_tracklets,
+            self.num_tracklets + num_news,
+            dtype=torch.long)
+        self.num_tracklets += num_news
+
+        self.update_memo(ids, bboxes, embeds, labels, frame_id)
+
+        return bboxes, labels, ids
diff --git a/modelscope/models/cv/video_panoptic_segmentation/video_k_net.py b/modelscope/models/cv/video_panoptic_segmentation/video_k_net.py
new file mode 100644
index 00000000..8f92d4a9
--- /dev/null
+++ b/modelscope/models/cv/video_panoptic_segmentation/video_k_net.py
@@ -0,0 +1,453 @@
+# The implementation is adopted from Video-K-Net,
+# made publicly available at https://github.com/lxtGH/Video-K-Net
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import build_activation_layer, build_norm_layer
+from mmdet.models.builder import build_head
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import Tasks
+from .backbone.swin_transformer import SwinTransformerDIY
+from .head.kernel_head import ConvKernelHead
+from .head.kernel_iter_head import VideoKernelIterHead
+from .head.track_heads import QuasiDenseMaskEmbedHeadGTMask
+from .neck.fpn import FPN
+from .track.quasi_dense_embed_tracker import (QuasiDenseEmbedTracker,
+                                              build_tracker)
+from .visualizer import vip_seg_id_to_label
+
+
+def coords2bbox_all(coords):
+    left = coords[:, 0].min().item()
+    top = coords[:, 1].min().item()
+    right = coords[:, 0].max().item()
+    bottom = coords[:, 1].max().item()
+    return top, left, bottom, right
+
+
+def tensor_mask2box(masks):
+    boxes = []
+    for mask in masks:
+        m = mask.nonzero().float()
+        if m.numel() > 0:
+            box = coords2bbox_all(m)
+        else:
+            box = (-1, -1, 10, 10)
+        boxes.append(box)
+    return np.asarray(boxes)
+
+
+@MODELS.register_module(
+    Tasks.video_panoptic_segmentation,
+    module_name=Models.video_panoptic_segmentation)
+class VideoKNet(TorchModel):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        super().__init__(model_dir, *args, **kwargs)
+
+        num_proposals = 100
+        num_stages = 3
+        conv_kernel_size = 1
+        num_thing_classes = 58
+        num_stuff_classes = 66
+        num_classes = num_thing_classes + num_stuff_classes
+
+        self.num_proposals = num_proposals
+        self.num_stages = num_stages
+        self.conv_kernel_size = conv_kernel_size
+        self.num_thing_classes = num_thing_classes
+        self.num_stuff_classes = num_stuff_classes
+        self.num_classes = num_classes
+
+        self.semantic_filter = True
+        self.link_previous = True
+        self.kitti_step = False
+        self.cityscapes = False
+        self.vipseg = True
+
+        self.test_cfg = dict(
+            rpn=None,
+            rcnn=dict(
+                max_per_img=num_proposals,
+                mask_thr=0.5,
+                stuff_score_thr=0.05,
+                merge_stuff_thing=dict(
+                    overlap_thr=0.6,
+                    iou_thr=0.5,
+                    stuff_max_area=4096,
+                    instance_score_thr=0.25)))
+
+        self.backbone = SwinTransformerDIY(
+            embed_dims=128,
+            depths=[2, 2, 18, 2],
+            num_heads=[4, 8, 16, 32],
+            window_size=7,
+            mlp_ratio=4,
+            qkv_bias=True,
+            qk_scale=None,
+            drop_rate=0.,
+            attn_drop_rate=0.,
+            drop_path_rate=0.3,
+            use_abs_pos_embed=False,
+            patch_norm=True,
+            out_indices=(0, 1, 2, 3),
+            with_cp=False)
+
+        self.neck = FPN(
+            in_channels=[128, 256, 512, 1024],
+            out_channels=256,
+            start_level=0,
+            add_extra_convs='on_input',
+            num_outs=4)
+
+        self.rpn_head = ConvKernelHead(
+            conv_kernel_size=conv_kernel_size,
+            feat_downsample_stride=4,
+            feat_refine_stride=1,
+            feat_refine=False,
+            use_binary=True,
+            num_loc_convs=1,
+            num_seg_convs=1,
+            conv_normal_init=True,
+            num_proposals=num_proposals,
+            proposal_feats_with_obj=True,
+            xavier_init_kernel=False,
+            kernel_init_std=1,
+            num_cls_fcs=1,
+            in_channels=256,
+            num_thing_classes=num_thing_classes,
+            num_stuff_classes=num_stuff_classes,
+            num_classes=num_classes,
+            cat_stuff_mask=True,
+            feat_transform_cfg=None)
+
+        roi_head = dict(
+            type='VideoKernelIterHead',
+            num_stages=num_stages,
+            stage_loss_weights=[1] * num_stages,
+            proposal_feature_channel=256,
+            num_thing_classes=num_thing_classes,
+            num_stuff_classes=num_stuff_classes,
+            do_panoptic=True,
+            with_track=True,
+            merge_joint=True,
+            mask_head=[
+                dict(
+                    type='VideoKernelUpdateHead',
+                    num_classes=num_classes,
+                    previous='placeholder',
+                    previous_type='ffn',
+                    num_thing_classes=num_thing_classes,
+                    num_stuff_classes=num_stuff_classes,
+                    num_ffn_fcs=2,
+                    num_heads=8,
+                    num_cls_fcs=1,
+                    num_mask_fcs=1,
+                    feedforward_channels=2048,
+                    in_channels=256,
+                    out_channels=256,
+                    dropout=0.0,
+                    mask_thr=0.5,
+                    conv_kernel_size=conv_kernel_size,
+                    mask_upsample_stride=4,
+                    ffn_act_cfg=dict(type='ReLU', inplace=True),
+                    with_ffn=True,
+                    feat_transform_cfg=dict(
+                        conv_cfg=dict(type='Conv2d'), act_cfg=None),
+                    kernel_updator_cfg=dict(
+                        type='KernelUpdator',
+                        in_channels=256,
+                        feat_channels=256,
+                        out_channels=256,
+                        input_feat_shape=3,
+                        act_cfg=dict(type='ReLU', inplace=True),
+                        norm_cfg=dict(type='LN')),
+                    loss_mask=dict(
+                        type='CrossEntropyLoss',
+                        use_sigmoid=True,
+                        loss_weight=1.0),
+                    loss_dice=dict(type='DiceLoss', loss_weight=4.0),
+                    loss_cls=dict(
+                        type='FocalLoss',
+                        use_sigmoid=True,
+                        gamma=2.0,
+                        alpha=0.25,
+                        loss_weight=2.0),
+                ) for _ in range(num_stages)
+            ])
+        roi_head.update(test_cfg=self.test_cfg['rcnn'])
+        self.roi_head = build_head(roi_head)
+
+        self.track_head = QuasiDenseMaskEmbedHeadGTMask(
+            num_convs=0,
+            num_fcs=2,
+            roi_feat_size=1,
+            in_channels=256,
+            fc_out_channels=256,
+            embed_channels=256,
+            norm_cfg=dict(type='GN', num_groups=32))
+
+        self.tracker_cfg = dict(
+            type='QuasiDenseEmbedTracker',
+            init_score_thr=0.35,
+            obj_score_thr=0.3,
+            match_score_thr=0.5,
+            memo_tracklet_frames=5,
+            memo_backdrop_frames=1,
+            memo_momentum=0.8,
+            nms_conf_thr=0.5,
+            nms_backdrop_iou_thr=0.3,
+            nms_class_iou_thr=0.7,
+            with_cats=True,
+            match_metric='bisoftmax')
+
+        # add embedding fcs for the final stage queries
+        num_emb_fcs = 1
+        act_cfg = dict(type='ReLU', inplace=True)
+        in_channels = 256
+        out_channels = 256
+        self.embed_fcs = nn.ModuleList()
+        for _ in range(num_emb_fcs):
+            self.embed_fcs.append(
+                nn.Linear(in_channels, in_channels, bias=False))
+            self.embed_fcs.append(
+                build_norm_layer(dict(type='LN'), in_channels)[1])
+            self.embed_fcs.append(build_activation_layer(act_cfg))
+
+        self.fc_embed = nn.Linear(in_channels, out_channels)
+
+        self.link_previous = True,
+
+    def extract_feat(self, img):
+        """Directly extract features from the backbone+neck."""
+        x = self.backbone(img)
+        x = self.neck(x)
+        return x
+
+    def init_tracker(self):
+        self.tracker = build_tracker(self.tracker_cfg)
+
+    def forward(self, img, img_metas, rescale=False, ref_img=None, iid=0):
+        result = self.simple_test(img, img_metas, rescale, ref_img, iid)
+        return result
+
+    def simple_test(self, img, img_metas, rescale=False, ref_img=None, iid=0):
+        """Test function without test time augmentation.
+
+        Args:
+            imgs (list[torch.Tensor]): List of multiple images
+            img_metas (list[dict]): List of image information.
+            rescale (bool): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[list[np.ndarray]]: BBox results of each image and classes.
+                The outer list corresponds to each image. The inner list
+                corresponds to each class.
+        """
+
+        # set the dataset type
+        fid = iid % 10000
+        is_first = (fid == 0)
+
+        # for current frame
+        x = self.extract_feat(img)
+        # current frame inference
+        rpn_results = self.rpn_head.simple_test_rpn(x, img_metas)
+        (proposal_feats, x_feats, mask_preds, cls_scores,
+         seg_preds) = rpn_results
+
+        # init tracker
+        if is_first:
+            self.init_tracker()
+            self.obj_feats_memory = None
+            self.x_feats_memory = None
+            self.mask_preds_memory = None
+            print('fid', fid)
+
+        # wheter to link the previous
+        if self.link_previous:
+            simple_test_result = self.roi_head.simple_test_with_previous(
+                x_feats,
+                proposal_feats,
+                mask_preds,
+                cls_scores,
+                img_metas,
+                previous_obj_feats=self.obj_feats_memory,
+                previous_mask_preds=self.mask_preds_memory,
+                previous_x_feats=self.x_feats_memory,
+                is_first=is_first)
+            cur_segm_results, obj_feats, cls_scores, mask_preds, scaled_mask_preds = simple_test_result
+            self.obj_feats_memory = obj_feats
+            self.x_feats_memory = x_feats
+            self.mask_preds_memory = scaled_mask_preds
+        else:
+            cur_segm_results, query_output, cls_scores, mask_preds, scaled_mask_preds = self.roi_head.simple_test(
+                x_feats, proposal_feats, mask_preds, cls_scores, img_metas)
+
+        # for tracking part
+        _, segm_result, mask_preds, panoptic_result, query_output = cur_segm_results[
+            0]
+        panoptic_seg, segments_info = panoptic_result
+
+        # get sorted tracking thing ids, labels, masks, score for tracking
+        things_index_for_tracking, things_labels_for_tracking, thing_masks_for_tracking, things_score_for_tracking = \
+            self.get_things_id_for_tracking(panoptic_seg, segments_info)
+        things_labels_for_tracking = torch.Tensor(
+            things_labels_for_tracking).to(cls_scores.device).long()
+
+        # get the semantic filter
+        if self.semantic_filter:
+            seg_preds = torch.nn.functional.interpolate(
+                seg_preds,
+                panoptic_seg.shape,
+                mode='bilinear',
+                align_corners=False)
+            seg_preds = seg_preds.sigmoid()
+            seg_out = seg_preds.argmax(1)
+            semantic_thing = (seg_out < self.num_thing_classes).to(
+                dtype=torch.float32)
+        else:
+            semantic_thing = 1.
+
+        if len(things_labels_for_tracking) > 0:
+            things_bbox_for_tracking = torch.zeros(
+                (len(things_score_for_tracking), 5),
+                dtype=torch.float,
+                device=x_feats.device)
+            things_bbox_for_tracking[:, 4] = torch.tensor(
+                things_score_for_tracking,
+                device=things_bbox_for_tracking.device)
+
+            thing_masks_for_tracking_final = []
+            for mask in thing_masks_for_tracking:
+                thing_masks_for_tracking_final.append(
+                    torch.Tensor(mask).unsqueeze(0).to(x_feats.device).float())
+            thing_masks_for_tracking_final = torch.cat(
+                thing_masks_for_tracking_final, 0)
+            thing_masks_for_tracking = thing_masks_for_tracking_final
+            thing_masks_for_tracking_with_semantic_filter = thing_masks_for_tracking_final * semantic_thing
+        else:
+            things_bbox_for_tracking = []
+
+        if len(things_labels_for_tracking) == 0:
+            track_feats = None
+        else:
+            # tracking embeddings
+            N, _, _, _ = query_output.shape
+            emb_feat = query_output.squeeze(-2).squeeze(-1).unsqueeze(
+                0)  # (n,d,1,1) -> (1,n,d)
+
+            for emb_layer in self.embed_fcs:
+                emb_feat = emb_layer(emb_feat)
+            object_feats_embed = self.fc_embed(emb_feat).view(1, N, -1)
+            object_feats_embed_for_tracking = object_feats_embed.squeeze(0)
+            track_feats = self._track_forward(
+                [object_feats_embed_for_tracking])
+
+        if track_feats is not None:
+            things_bbox_for_tracking[:, :4] = torch.tensor(
+                tensor_mask2box(thing_masks_for_tracking_with_semantic_filter),
+                device=things_bbox_for_tracking.device)
+            bboxes, labels, ids = self.tracker.match(
+                bboxes=things_bbox_for_tracking,
+                labels=things_labels_for_tracking,
+                track_feats=track_feats,
+                frame_id=fid)
+
+            ids = ids + 1
+            ids[ids == -1] = 0
+        else:
+            ids = []
+
+        track_maps = self.generate_track_id_maps(ids, thing_masks_for_tracking,
+                                                 panoptic_seg)
+
+        semantic_map, binary_masks, labels = self.get_semantic_seg(
+            panoptic_seg, segments_info)
+
+        vis_tracker = None
+        vis_sem = None
+        from .visualizer import trackmap2rgb, cityscapes_cat2rgb, draw_bbox_on_img
+        if len(things_labels_for_tracking):
+            vis_tracker = trackmap2rgb(track_maps)
+            vis_sem = cityscapes_cat2rgb(semantic_map)
+            vis_tracker = draw_bbox_on_img(
+                vis_tracker,
+                things_bbox_for_tracking.cpu().numpy())
+
+        return semantic_map, track_maps, None, vis_sem, vis_tracker, labels, binary_masks, ids, things_bbox_for_tracking
+
+    def _track_forward(self, track_feats, x=None, mask_pred=None):
+        track_feats = torch.cat(track_feats, 0)
+        track_feats = self.track_head(track_feats)
+        return track_feats
+
+    def get_things_id_for_tracking(self, panoptic_seg, seg_infos):
+        idxs = []
+        labels = []
+        masks = []
+        score = []
+        for segment in seg_infos:
+            if segment['isthing'] is True:
+                thing_mask = panoptic_seg == segment['id']
+                masks.append(thing_mask)
+                idxs.append(segment['instance_id'])
+                labels.append(segment['category_id'])
+                score.append(segment['score'])
+        return idxs, labels, masks, score
+
+    def get_semantic_seg(self, panoptic_seg, segments_info):
+        kitti_step2cityscpaes = [11, 13]
+        semantic_seg = np.zeros(panoptic_seg.shape)
+        binary_masks = []
+        labels = []
+        for segment in segments_info:
+            binary_mask = np.zeros(panoptic_seg.shape)
+            if segment['isthing'] is True:
+                # for things
+                if self.kitti_step:
+                    cat_cur = kitti_step2cityscpaes[segment['category_id']]
+                    semantic_seg[panoptic_seg == segment['id']] = cat_cur
+                    label = cat_cur
+                else:  # city and vip_seg
+                    semantic_seg[panoptic_seg == segment['id']] = segment[
+                        'category_id'] + self.num_stuff_classes
+                    label = segment['category_id'] + self.num_stuff_classes
+            else:
+                # for stuff (0 - n-1)
+                if self.kitti_step:
+                    cat_cur = segment['category_id']
+                    cat_cur -= 1
+                    offset = 0
+                    for thing_id in kitti_step2cityscpaes:
+                        if cat_cur + offset >= thing_id:
+                            offset += 1
+                    cat_cur += offset
+                    semantic_seg[panoptic_seg == segment['id']] = cat_cur
+                    label = cat_cur
+                else:  # city and vip_seg
+                    mask_idx = panoptic_seg == segment['id']
+                    semantic_seg[mask_idx] = segment['category_id'] - 1
+                    label = segment['category_id'] - 1
+            binary_mask[panoptic_seg == segment['id']] = 1
+            binary_masks.append(binary_mask)
+            labels.append(vip_seg_id_to_label[label])
+        return semantic_seg, binary_masks, labels
+
+    def generate_track_id_maps(self, ids, masks, panopitc_seg_maps):
+        final_id_maps = np.zeros(panopitc_seg_maps.shape)
+
+        if len(ids) == 0:
+            return final_id_maps
+        masks = masks.bool()
+
+        for i, id in enumerate(ids):
+            mask = masks[i].cpu().numpy()
+            final_id_maps[mask] = id
+
+        return final_id_maps
diff --git a/modelscope/models/cv/video_panoptic_segmentation/visualizer.py b/modelscope/models/cv/video_panoptic_segmentation/visualizer.py
new file mode 100644
index 00000000..e6b0f95b
--- /dev/null
+++ b/modelscope/models/cv/video_panoptic_segmentation/visualizer.py
@@ -0,0 +1,202 @@
+# The implementation is adopted from Video-K-Net,
+# made publicly available at https://github.com/lxtGH/Video-K-Net
+
+import hashlib
+
+import cv2
+import numpy as np
+
+vip_seg_label = {
+    'wall': '1',
+    'ceiling': '2',
+    'door': '3',
+    'stair': '4',
+    'ladder': '5',
+    'escalator': '6',
+    'Playground_slide': '7',
+    'handrail_or_fence': '8',
+    'window': '9',
+    'others': '0',
+    'rail': '10',
+    'goal': '11',
+    'pillar': '12',
+    'pole': '13',
+    'floor': '14',
+    'ground': '15',
+    'grass': '16',
+    'sand': '17',
+    'athletic_field': '18',
+    'road': '19',
+    'path': '20',
+    'crosswalk': '21',
+    'building': '22',
+    'house': '23',
+    'bridge': '24',
+    'tower': '25',
+    'windmill': '26',
+    'well_or_well_lid': '27',
+    'other_construction': '28',
+    'sky': '29',
+    'mountain': '30',
+    'stone': '31',
+    'wood': '32',
+    'ice': '33',
+    'snowfield': '34',
+    'grandstand': '35',
+    'sea': '36',
+    'river': '37',
+    'lake': '38',
+    'waterfall': '39',
+    'water': '40',
+    'billboard_or_Bulletin_Board': '41',
+    'sculpture': '42',
+    'pipeline': '43',
+    'flag': '44',
+    'parasol_or_umbrella': '45',
+    'cushion_or_carpet': '46',
+    'tent': '47',
+    'roadblock': '48',
+    'car': '49',
+    'bus': '50',
+    'truck': '51',
+    'bicycle': '52',
+    'motorcycle': '53',
+    'wheeled_machine': '54',
+    'ship_or_boat': '55',
+    'raft': '56',
+    'airplane': '57',
+    'tyre': '58',
+    'traffic_light': '59',
+    'lamp': '60',
+    'person': '61',
+    'cat': '62',
+    'dog': '63',
+    'horse': '64',
+    'cattle': '65',
+    'other_animal': '66',
+    'tree': '67',
+    'flower': '68',
+    'other_plant': '69',
+    'toy': '70',
+    'ball_net': '71',
+    'backboard': '72',
+    'skateboard': '73',
+    'bat': '74',
+    'ball': '75',
+    'cupboard_or_showcase_or_storage_rack': '76',
+    'box': '77',
+    'traveling_case_or_trolley_case': '78',
+    'basket': '79',
+    'bag_or_package': '80',
+    'trash_can': '81',
+    'cage': '82',
+    'plate': '83',
+    'tub_or_bowl_or_pot': '84',
+    'bottle_or_cup': '85',
+    'barrel': '86',
+    'fishbowl': '87',
+    'bed': '88',
+    'pillow': '89',
+    'table_or_desk': '90',
+    'chair_or_seat': '91',
+    'bench': '92',
+    'sofa': '93',
+    'shelf': '94',
+    'bathtub': '95',
+    'gun': '96',
+    'commode': '97',
+    'roaster': '98',
+    'other_machine': '99',
+    'refrigerator': '100',
+    'washing_machine': '101',
+    'Microwave_oven': '102',
+    'fan': '103',
+    'curtain': '104',
+    'textiles': '105',
+    'clothes': '106',
+    'painting_or_poster': '107',
+    'mirror': '108',
+    'flower_pot_or_vase': '109',
+    'clock': '110',
+    'book': '111',
+    'tool': '112',
+    'blackboard': '113',
+    'tissue': '114',
+    'screen_or_television': '115',
+    'computer': '116',
+    'printer': '117',
+    'Mobile_phone': '118',
+    'keyboard': '119',
+    'other_electronic_product': '120',
+    'fruit': '121',
+    'food': '122',
+    'instrument': '123',
+    'train': '124'
+}
+
+vip_seg_label_to_id = {k: int(v) for k, v in vip_seg_label.items()}
+vip_seg_id_to_label = {int(v): k for k, v in vip_seg_label.items()}
+
+city_labels = [('road', 0, (128, 64, 128)), ('sidewalk', 1, (244, 35, 232)),
+               ('building', 2, (70, 70, 70)), ('wall', 3, (102, 102, 156)),
+               ('fence', 4, (190, 153, 153)), ('pole', 5, (153, 153, 153)),
+               ('traffic light', 6, (250, 170, 30)),
+               ('traffic sign', 7, (220, 220, 0)),
+               ('vegetation', 8, (107, 142, 35)),
+               ('terrain', 9, (152, 251, 152)), ('sky', 10, (70, 130, 180)),
+               ('person', 11, (220, 20, 60)), ('rider', 12, (255, 0, 0)),
+               ('car', 13, (0, 0, 142)), ('truck', 14, (0, 0, 70)),
+               ('bus', 15, (0, 60, 100)), ('train', 16, (0, 80, 100)),
+               ('motorcycle', 17, (0, 0, 230)), ('bicycle', 18, (119, 11, 32)),
+               ('void', 19, (0, 0, 0)), ('void', 255, (0, 0, 0))]
+
+
+def sha256num(num):
+    hex = hashlib.sha256(str(num).encode('utf-8')).hexdigest()
+    hex = hex[-6:]
+    return int(hex, 16)
+
+
+def id2rgb(id_map):
+    if isinstance(id_map, np.ndarray):
+        id_map_copy = id_map.copy()
+        rgb_shape = tuple(list(id_map.shape) + [3])
+        rgb_map = np.zeros(rgb_shape, dtype=np.uint8)
+        for i in range(3):
+            rgb_map[..., i] = id_map_copy % 256
+            id_map_copy //= 256
+        return rgb_map
+    color = []
+    for _ in range(3):
+        color.append(id_map % 256)
+        id_map //= 256
+    return color
+
+
+def cityscapes_cat2rgb(cat_map):
+    color_map = np.zeros_like(cat_map).astype(np.uint8)
+    color_map = color_map[..., None].repeat(3, axis=-1)
+    for each_class in city_labels:
+        index = cat_map == each_class[1]
+        if index.any():
+            color_map[index] = each_class[2]
+    return color_map
+
+
+def trackmap2rgb(track_map):
+    color_map = np.zeros_like(track_map).astype(np.uint8)
+    color_map = color_map[..., None].repeat(3, axis=-1)
+    for id_cur in np.unique(track_map):
+        if id_cur == 0:
+            continue
+        color_map[track_map == id_cur] = id2rgb(sha256num(id_cur))
+    return color_map
+
+
+def draw_bbox_on_img(vis_img, bboxes):
+    for index in range(bboxes.shape[0]):
+        cv2.rectangle(
+            vis_img, (int(bboxes[index][0]), int(bboxes[index][1])),
+            (int(bboxes[index][2]), int(bboxes[index][3])), (0, 0, 255),
+            thickness=1)
+    return vis_img
diff --git a/modelscope/models/cv/video_super_resolution/__init__.py b/modelscope/models/cv/video_super_resolution/__init__.py
index 0a2cc193..07ab414d 100644
--- a/modelscope/models/cv/video_super_resolution/__init__.py
+++ b/modelscope/models/cv/video_super_resolution/__init__.py
@@ -5,11 +5,13 @@ from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
     from .real_basicvsr_for_video_super_resolution import RealBasicVSRNetForVideoSR
+    from .msrresnet_lite_model import MSRResNetLiteModel
 
 else:
     _import_structure = {
         'real_basicvsr_for_video_super_resolution':
-        ['RealBasicVSRNetForVideoSR']
+        ['RealBasicVSRNetForVideoSR'],
+        'msrresnet_lite_model': ['MSRResNetLiteModel']
     }
 
     import sys
diff --git a/modelscope/models/cv/video_super_resolution/msrresnet_lite_model.py b/modelscope/models/cv/video_super_resolution/msrresnet_lite_model.py
new file mode 100644
index 00000000..e34744bd
--- /dev/null
+++ b/modelscope/models/cv/video_super_resolution/msrresnet_lite_model.py
@@ -0,0 +1,131 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import functools
+import os
+from typing import Any, Dict, Union
+
+import torch
+import torch.cuda
+import torch.nn as nn
+import torch.nn.functional as F
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .common import ResidualBlockNoBN, make_layer
+
+logger = get_logger()
+__all__ = ['MSRResNetLiteModel']
+
+
+@MODELS.register_module(
+    Tasks.video_super_resolution, module_name=Models.msrresnet_lite)
+class MSRResNetLiteModel(TorchModel):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the video super-resolution model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        self.model_dir = model_dir
+        self.config = Config.from_file(
+            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
+
+        self.max_seq_len = 1
+
+        # network architecture
+        in_nc = self.config.model.model_args.in_nc
+        out_nc = self.config.model.model_args.out_nc
+        nf = self.config.model.model_args.nf
+        nb = self.config.model.model_args.nb
+        self.upscale = self.config.model.model_args.upscale
+
+        self.conv_first = nn.Conv2d(in_nc, nf // 2, 3, 1, 1, bias=True)
+        # use stride=2 conv to downsample
+        self.conv_down = nn.Conv2d(nf // 2, nf, 3, 2, 1, bias=True)
+        self.recon_trunk = make_layer(ResidualBlockNoBN, nb, mid_channels=nf)
+
+        # upsampling
+        if self.upscale == 2:
+            self.pixel_shuffle = nn.PixelShuffle(2)
+            self.upconv2 = nn.Conv2d(nf // 4, nf, 3, 1, 1, bias=True)
+            self.conv_last = nn.Conv2d(nf // 4, out_nc, 3, 1, 1, bias=True)
+        elif self.upscale == 1:
+            self.pixel_shuffle = nn.PixelShuffle(2)
+            self.conv_last = nn.Conv2d(nf // 4, out_nc, 3, 1, 1, bias=True)
+        elif self.upscale == 4:
+            self.pixel_shuffle = nn.PixelShuffle(2)
+            self.upconv1 = nn.Conv2d(nf // 4, nf, 3, 1, 1, bias=True)
+            self.upconv2 = nn.Conv2d(nf // 4, nf, 3, 1, 1, bias=True)
+            self.conv_last = nn.Conv2d(nf // 4, out_nc, 3, 1, 1, bias=True)
+
+        # activation function
+        self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)
+
+    def _inference_forward(self, input: Tensor) -> Dict[str, Tensor]:
+        if input.ndim == 5:
+            input = input.squeeze(1)
+
+        fea = self.lrelu(self.conv_first(input))
+        fea = self.lrelu(self.conv_down(fea))
+        out = self.recon_trunk(fea)
+
+        out = self.lrelu(self.pixel_shuffle(out))
+
+        if self.upscale == 2:
+            out = self.lrelu(self.pixel_shuffle(self.upconv2(out)))
+            out = self.conv_last(out)
+            base = F.interpolate(
+                input,
+                scale_factor=self.upscale,
+                mode='bilinear',
+                align_corners=False)
+            out += base
+        elif self.upscale == 1:
+            out = self.conv_last(out) + input
+        elif self.upscale == 4:
+            out = self.lrelu(self.pixel_shuffle(self.upconv1(out)))
+            out = self.lrelu(self.pixel_shuffle(self.upconv2(out)))
+            out = self.conv_last(out)
+            base = F.interpolate(
+                input,
+                scale_factor=self.upscale,
+                mode='bilinear',
+                align_corners=False)
+            out += base
+
+        output = torch.clamp(out, 0.0, 1.0)
+
+        if output.ndim == 4:
+            output = output.unsqueeze(1)
+        return {'output': output}
+
+    def forward(self, inputs: Dict[str,
+                                   Tensor]) -> Dict[str, Union[list, Tensor]]:
+        """return the result by the model
+
+        Args:
+            inputs (Tensor): the preprocessed data
+
+        Returns:
+            Dict[str, Tensor]: results
+        """
+        return self._inference_forward(**inputs)
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        model_file = kwargs.get('am_model_name', ModelFile.TORCH_MODEL_FILE)
+        model_dir = kwargs['model_dir']
+        ckpt_path = os.path.join(model_dir, model_file)
+        logger.info(f'loading model from {ckpt_path}')
+        model_dir = kwargs.pop('model_dir')
+        model = cls(model_dir=model_dir, **kwargs)
+        ckpt_path = os.path.join(model_dir, model_file)
+        model.load_state_dict(torch.load(ckpt_path, map_location='cpu'))
+        return model
diff --git a/modelscope/models/cv/vision_efficient_tuning/__init__.py b/modelscope/models/cv/vision_efficient_tuning/__init__.py
new file mode 100644
index 00000000..05243554
--- /dev/null
+++ b/modelscope/models/cv/vision_efficient_tuning/__init__.py
@@ -0,0 +1,30 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+
+    from .vision_efficient_tuning_adapter import VisionEfficientTuningAdapterModel
+    from .vision_efficient_tuning_prompt import VisionEfficientTuningPromptModel
+    from .vision_efficient_tuning_prefix import VisionEfficientTuningPrefixModel
+    from .vision_efficient_tuning_lora import VisionEfficientTuningLoRAModel
+
+else:
+    _import_structure = {
+        'vision_efficient_tuning_adapter':
+        ['VisionEfficientTuningAdapterModel'],
+        'vision_efficient_tuning_prompt': ['VisionEfficientTuningPromptModel'],
+        'vision_efficient_tuning_prefix': ['VisionEfficientTuningPrefixModel'],
+        'vision_efficient_tuning_lora': ['VisionEfficientTuningLoRAModel'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/vision_efficient_tuning/backbone.py b/modelscope/models/cv/vision_efficient_tuning/backbone.py
new file mode 100644
index 00000000..e7556ea1
--- /dev/null
+++ b/modelscope/models/cv/vision_efficient_tuning/backbone.py
@@ -0,0 +1,351 @@
+# The implementation here is modified based on timm,
+# originally Apache 2.0 License and publicly available at
+# https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/vision_transformer.py
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .petl import Adapter, LoRA, Prefix, Prompt
+from .timm_vision_transformer import (Attention, Block, DropPath, LayerScale,
+                                      Mlp, PatchEmbed, VisionTransformer)
+
+
+class AttentionPETL(nn.Module):
+    """Extend the parameter-efficient transfer learning (PETL) method to the original Attention.
+
+    Prefix tuning optimizes the task-specific vector in the multi-head attention layer.
+    'Prefix-tuning: Optimizing continuous prompts for generation' by Li & Liang(2021)
+    See https://arxiv.org/abs/2101.00190
+
+    LoRA constructs an additional layer with low-rank decomposition matrices of the weights in the network.
+    'LoRA: Low-Rank Adaptation of Large Language Models' by Hu et al.(2021)
+    See https://arxiv.org/abs/2106.09685
+
+    Attributes:
+        prefix_length: An integer indicating the length of prefix tuning.
+        prefix_type: A string indicating the type of prefix tuning.
+        lora_length: An integer indicating the length of LoRA tuning.
+        lora_type: A string indicating the type of LoRA tuning.
+    """
+
+    def __init__(
+        self,
+        dim,
+        num_heads=8,
+        qkv_bias=False,
+        attn_drop=0.,
+        proj_drop=0.,
+        prefix_length=None,
+        prefix_type=None,
+        lora_length=None,
+        lora_type=None,
+    ):
+        super().__init__()
+        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        if lora_length and lora_length > 0:
+            self.lora = LoRA(
+                dim=dim,
+                num_heads=num_heads,
+                lora_length=lora_length,
+                lora_type=lora_type)
+        else:
+            self.lora = None
+
+        if prefix_length and prefix_length > 0:
+            self.prefix = Prefix(
+                dim=dim,
+                num_heads=num_heads,
+                prefix_length=prefix_length,
+                prefix_type=prefix_type)
+        else:
+            self.prefix = None
+
+    def forward(self, x):
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
+                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.unbind(0)
+
+        if self.lora is not None:
+            q, k, v = self.lora(x, q, k, v)
+
+        if self.prefix is not None:
+            q, k, v = self.prefix(x, q, k, v)
+
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class BlockPETL(nn.Module):
+    """Extend the parameter-efficient transfer learning (PETL) method to the original Block.
+
+    Visual prompt tuning (VPT) is proposed to initialize tunable prompt tokens
+    and prepend to the original tokens in the first layer or multiple layers.
+    'Visual Prompt Tuning' by Jia et al.(2022)
+    See https://arxiv.org/abs/2203.12119
+
+    Adapters project input tokens by an MLP layer.
+    'Parameter-Efficient Transfer Learning for NLP' by Houlsby et al.(2019)
+    See http://arxiv.org/abs/1902.00751
+
+    Attributes:
+        adapter_length: An integer indicating the length of adapter tuning.
+        adapter_type: A string indicating the type of adapter tuning.
+        prompt_length: An integer indicating the length of prompt tuning.
+        prompt_type: A string indicating the type of prompt tuning.
+    """
+
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        mlp_ratio=4.,
+        qkv_bias=False,
+        drop=0.,
+        attn_drop=0.,
+        init_values=None,
+        drop_path=0.,
+        act_layer=nn.GELU,
+        norm_layer=nn.LayerNorm,
+        attn_layer=Attention,
+        layer_num=-1,
+        prompt_length=None,
+        prompt_type=None,
+        prefix_length=None,
+        prefix_type=None,
+        adapter_length=None,
+        adapter_type=None,
+        lora_length=None,
+        lora_type=None,
+    ):
+        super().__init__()
+        self.layer_num = layer_num
+        self.norm1 = norm_layer(dim)
+        self.attn = attn_layer(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            prefix_length=prefix_length,
+            prefix_type=prefix_type,
+            lora_length=lora_length,
+            lora_type=lora_type,
+        )
+        self.ls1 = LayerScale(
+            dim, init_values=init_values) if init_values else nn.Identity()
+
+        self.drop_path1 = DropPath(
+            drop_path) if drop_path > 0. else nn.Identity()
+
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=int(dim * mlp_ratio),
+            act_layer=act_layer,
+            drop=drop)
+        self.ls2 = LayerScale(
+            dim, init_values=init_values) if init_values else nn.Identity()
+        self.drop_path2 = DropPath(
+            drop_path) if drop_path > 0. else nn.Identity()
+
+        self.adapter_length = adapter_length
+        self.adapter_type = adapter_type
+        if adapter_length and adapter_length > 0:
+            self.adapter = Adapter(
+                dim=dim,
+                adapter_length=adapter_length,
+                adapter_type=adapter_type,
+                act_layer=act_layer)
+        else:
+            self.adapter = None
+
+        self.prompt_length = prompt_length
+        self.prompt_type = prompt_type
+        if prompt_length and prompt_length > 0:
+            self.prompt = Prompt(
+                dim=dim,
+                layer_num=layer_num,
+                prompt_length=prompt_length,
+                prompt_type=prompt_type)
+        else:
+            self.prompt = None
+
+    def forward(self, x):
+        if self.prompt is not None:
+            x = self.prompt(x)
+
+        x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x))))
+
+        if self.adapter is not None:
+            x = x + self.adapter(
+                self.drop_path2(self.ls2(self.mlp(self.norm2(x)))))
+        else:
+            x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
+        return x
+
+
+class VisionTransformerPETL(VisionTransformer):
+    """ Extend the parameter-efficient transfer learning (PETL) method to the original Vision Transformer.
+
+    A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
+        - https://arxiv.org/abs/2010.11929
+
+    The implementation of several tuning methods (prompt, prefix, adapter, and LoRA) based on ViT.
+    """
+
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        num_classes=1000,
+        global_pool='token',
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.,
+        qkv_bias=True,
+        init_values=None,
+        class_token=True,
+        no_embed_class=False,
+        pre_norm=False,
+        fc_norm=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.,
+        weight_init='',
+        embed_layer=PatchEmbed,
+        norm_layer=None,
+        act_layer=None,
+        block_fn=Block,
+        prompt_length=None,
+        prompt_type=None,
+        prefix_length=None,
+        prefix_type=None,
+        adapter_length=None,
+        adapter_type=None,
+        lora_length=None,
+        lora_type=None,
+    ):
+
+        super().__init__()
+        assert global_pool in ('', 'avg', 'token')
+        assert class_token or global_pool != 'token'
+        use_fc_norm = global_pool == 'avg' if fc_norm is None else fc_norm
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+        act_layer = act_layer or nn.GELU
+
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.num_features = self.embed_dim = embed_dim
+        self.num_prefix_tokens = 1 if class_token else 0
+        self.no_embed_class = no_embed_class
+        self.grad_checkpointing = False
+
+        self.depth = depth
+        self.img_size = img_size
+        self.class_token = class_token
+
+        self.prompt_length = prompt_length
+        self.prompt_type = prompt_type
+
+        self.prefix_length = prefix_length
+        self.prefix_type = prefix_type
+
+        self.adapter_length = adapter_length
+        self.adapter_type = adapter_type
+
+        self.lora_length = lora_length
+        self.lora_type = lora_type
+
+        self.patch_embed = embed_layer(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            bias=not pre_norm,
+        )
+        num_patches = self.patch_embed.num_patches
+
+        self.cls_token = nn.Parameter(torch.zeros(
+            1, 1, embed_dim)) if class_token else None
+        embed_len = num_patches if no_embed_class else num_patches + self.num_prefix_tokens
+        self.pos_embed = nn.Parameter(
+            torch.randn(1, embed_len, embed_dim) * .02)
+        self.pos_drop = nn.Dropout(p=drop_rate)
+        self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity()
+
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
+        if prompt_length is not None or prefix_length is not None \
+           or adapter_length is not None or lora_length is not None:
+            attn_layer = AttentionPETL
+            block_fn = BlockPETL
+            self.blocks = nn.Sequential(*[
+                block_fn(
+                    dim=embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    init_values=init_values,
+                    drop=drop_rate,
+                    attn_drop=attn_drop_rate,
+                    drop_path=dpr[i],
+                    norm_layer=norm_layer,
+                    act_layer=act_layer,
+                    attn_layer=attn_layer,
+                    layer_num=i,
+                    prompt_length=prompt_length[i] if isinstance(
+                        prompt_length, list) else prompt_length,
+                    prompt_type=prompt_type,
+                    prefix_length=prefix_length[i] if isinstance(
+                        prefix_length, list) else prefix_length,
+                    prefix_type=prefix_type,
+                    adapter_length=adapter_length[i] if isinstance(
+                        adapter_length, list) else adapter_length,
+                    adapter_type=adapter_type,
+                    lora_length=lora_length[i] if isinstance(
+                        lora_length, list) else lora_length,
+                    lora_type=lora_type) for i in range(depth)
+            ])
+        else:
+            self.blocks = nn.Sequential(*[
+                block_fn(
+                    dim=embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    init_values=init_values,
+                    drop=drop_rate,
+                    attn_drop=attn_drop_rate,
+                    drop_path=dpr[i],
+                    norm_layer=norm_layer,
+                    act_layer=act_layer) for i in range(depth)
+            ])
+
+        self.norm = norm_layer(embed_dim) if not use_fc_norm else nn.Identity()
+
+        self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity()
+        self.head = nn.Linear(
+            self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+        if weight_init != 'skip':
+            self.init_weights(weight_init)
diff --git a/modelscope/models/cv/vision_efficient_tuning/head.py b/modelscope/models/cv/vision_efficient_tuning/head.py
new file mode 100644
index 00000000..a82ef997
--- /dev/null
+++ b/modelscope/models/cv/vision_efficient_tuning/head.py
@@ -0,0 +1,25 @@
+# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import torch.nn as nn
+
+
+class ClassifierHead(nn.Module):
+    """The implementation of classification head.
+
+    Attributes:
+        dim: An integer indicating the hidden dimension.
+        num_classes: A string indicating the number of class.
+        dropout_rate: A float indicating the dropout rate.
+    """
+
+    def __init__(self, dim, num_classes, dropout_rate=0):
+        super().__init__()
+        self.dim = dim
+        self.num_classes = num_classes
+        if dropout_rate > 0.0:
+            self.dropout = nn.Dropout(dropout_rate)
+        self.fc = nn.Linear(dim, num_classes)
+
+    def forward(self, x):
+        if hasattr(self, 'dropout'):
+            x = self.dropout(x)
+        return self.fc(x)
diff --git a/modelscope/models/cv/vision_efficient_tuning/petl.py b/modelscope/models/cv/vision_efficient_tuning/petl.py
new file mode 100644
index 00000000..f43ba10b
--- /dev/null
+++ b/modelscope/models/cv/vision_efficient_tuning/petl.py
@@ -0,0 +1,174 @@
+# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import math
+
+import torch
+import torch.nn as nn
+
+
+class Prompt(nn.Module):
+    """The implementation of vision prompt tuning method.
+
+    Visual prompt tuning (VPT) is proposed to initialize tunable prompt tokens
+    and prepend to the original tokens in the first layer or multiple layers.
+    'Visual Prompt Tuning' by Jia et al.(2022)
+    See https://arxiv.org/abs/2203.12119
+
+    Attributes:
+        dim: An integer indicating the embedding dimension.
+        layer_num: An integer indicating number of layers.
+        prompt_length: An integer indicating the length of vision prompt tuning.
+        prompt_type: A string indicating the type of vision prompt tuning.
+    """
+
+    def __init__(self, dim, layer_num, prompt_length=None, prompt_type=None):
+        super(Prompt, self).__init__()
+        self.dim = dim
+        self.layer_num = layer_num
+        self.prompt_length = prompt_length
+        self.prompt_type = prompt_type
+
+        self.prompt_token = nn.Parameter(torch.zeros(1, prompt_length, dim))
+        nn.init.xavier_uniform_(self.prompt_token)
+
+    def forward(self, x):
+        B, N, C = x.shape
+        prompt_token = self.prompt_token.expand(B, -1, -1)
+
+        if self.layer_num == 0:
+            x = torch.cat((x, prompt_token), dim=1)
+        else:
+            x = torch.cat((x[:, :-self.prompt_length, :], prompt_token), dim=1)
+        return x
+
+
+class Adapter(nn.Module):
+    """The implementation of adapter tuning method.
+
+    Adapters project input tokens by an MLP layer.
+    'Parameter-Efficient Transfer Learning for NLP' by Houlsby et al.(2019)
+    See http://arxiv.org/abs/1902.00751
+
+    Attributes:
+        dim: An integer indicating the embedding dimension.
+        adapter_length: An integer indicating the length of adapter tuning.
+        adapter_type: A string indicating the type of adapter tuning.
+    """
+
+    def __init__(
+        self,
+        dim,
+        adapter_length=None,
+        adapter_type=None,
+        act_layer=nn.GELU,
+    ):
+        super(Adapter, self).__init__()
+        self.dim = dim
+        self.adapter_length = adapter_length
+        self.adapter_type = adapter_type
+        self.ln1 = nn.Linear(dim, adapter_length)
+        self.activate = act_layer()
+        self.ln2 = nn.Linear(adapter_length, dim)
+        self.init_weights()
+
+    def init_weights(self):
+
+        def _init_weights(m):
+            if isinstance(m, nn.Linear):
+                nn.init.xavier_uniform_(m.weight)
+                nn.init.normal_(m.bias, std=1e-6)
+
+        self.apply(_init_weights)
+
+    def forward(self, x, identity=None):
+        out = self.ln2(self.activate(self.ln1(x)))
+        if identity is None:
+            identity = x
+        out = identity + out
+        return out
+
+
+class LoRA(nn.Module):
+    """The implementation of LoRA tuning method.
+
+    LoRA constructs an additional layer with low-rank decomposition matrices of the weights in the network.
+    'LoRA: Low-Rank Adaptation of Large Language Models' by Hu et al.(2021)
+    See https://arxiv.org/abs/2106.09685
+
+    Attributes:
+        dim: An integer indicating the embedding dimension.
+        num_heads: An integer indicating number of attention heads.
+        lora_length: An integer indicating the length of LoRA tuning.
+        lora_type: A string indicating the type of LoRA tuning.
+    """
+
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        lora_length=None,
+        lora_type=None,
+    ):
+        super(LoRA, self).__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.lora_a = nn.Linear(dim, lora_length, bias=False)
+        nn.init.kaiming_uniform_(self.lora_a.weight, a=math.sqrt(5))
+        self.lora_b = nn.Linear(lora_length, dim * 3, bias=False)
+        nn.init.zeros_(self.lora_b.weight)
+
+        self.lora_length = lora_length
+        self.lora_type = lora_type
+
+    def forward(self, x, q, k, v):
+        B, N, C = x.shape
+        qkv_delta = self.lora_b(self.lora_a(x))
+        qkv_delta = qkv_delta.reshape(B, N, 3, self.num_heads,
+                                      C // self.num_heads).permute(
+                                          2, 0, 3, 1, 4)
+        q_delta, k_delta, v_delta = qkv_delta.unbind(0)
+        q, k, v = q + q_delta, k + k_delta, v + v_delta
+        return q, k, v
+
+
+class Prefix(nn.Module):
+    """The implementation of prefix tuning method.
+
+    Prefix tuning optimizes the task-specific vector in the multi-head attention layer.
+    'Prefix-tuning: Optimizing continuous prompts for generation' by Li & Liang(2021)
+    See https://arxiv.org/abs/2101.00190
+
+    Attributes:
+        dim: An integer indicating the embedding dimension.
+        num_heads: An integer indicating number of attention heads.
+        prefix_length: An integer indicating the length of prefix tuning.
+        prefix_type: A string indicating the type of prefix tuning.
+    """
+
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        prefix_length=None,
+        prefix_type=None,
+    ):
+        super(Prefix, self).__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.prefix_length = prefix_length
+        self.prefix_type = prefix_type
+        self.prefix_key = nn.Parameter(torch.zeros(1, prefix_length, dim))
+        self.prefix_value = nn.Parameter(torch.zeros(1, prefix_length, dim))
+        nn.init.xavier_uniform_(self.prefix_key)
+        nn.init.xavier_uniform_(self.prefix_value)
+
+    def forward(self, x, q, k, v):
+        B, N, C = x.shape
+        prefix_key = self.prefix_key.expand(B, -1, -1).reshape(
+            B, self.prefix_length, self.num_heads,
+            self.dim // self.num_heads).permute(0, 2, 1, 3)
+        prefix_value = self.prefix_value.expand(B, -1, -1).reshape(
+            B, self.prefix_length, self.num_heads,
+            self.dim // self.num_heads).permute(0, 2, 1, 3)
+        k, v = torch.cat((k, prefix_key), dim=2), torch.cat((v, prefix_value),
+                                                            dim=2)
+        return q, k, v
diff --git a/modelscope/models/cv/vision_efficient_tuning/timm_helpers.py b/modelscope/models/cv/vision_efficient_tuning/timm_helpers.py
new file mode 100644
index 00000000..fee47464
--- /dev/null
+++ b/modelscope/models/cv/vision_efficient_tuning/timm_helpers.py
@@ -0,0 +1,132 @@
+# The implementation is adopted from timm (version: 0.6.11),
+# made publicly available under the Apache 2.0 License at
+# https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/helpers.py
+import math
+from itertools import chain
+from typing import Callable
+
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+
+
+def named_apply(fn: Callable,
+                module: nn.Module,
+                name='',
+                depth_first=True,
+                include_root=False) -> nn.Module:
+    if not depth_first and include_root:
+        fn(module=module, name=name)
+    for child_name, child_module in module.named_children():
+        child_name = '.'.join((name, child_name)) if name else child_name
+        named_apply(
+            fn=fn,
+            module=child_module,
+            name=child_name,
+            depth_first=depth_first,
+            include_root=True)
+    if depth_first and include_root:
+        fn(module=module, name=name)
+    return module
+
+
+def adapt_input_conv(in_chans, conv_weight):
+    conv_type = conv_weight.dtype
+    conv_weight = conv_weight.float(
+    )  # Some weights are in torch.half, ensure it's float for sum on CPU
+    O, I, J, K = conv_weight.shape
+    if in_chans == 1:
+        if I > 3:
+            assert conv_weight.shape[1] % 3 == 0
+            # For models with space2depth stems
+            conv_weight = conv_weight.reshape(O, I // 3, 3, J, K)
+            conv_weight = conv_weight.sum(dim=2, keepdim=False)
+        else:
+            conv_weight = conv_weight.sum(dim=1, keepdim=True)
+    elif in_chans != 3:
+        if I != 3:  # noqa
+            raise NotImplementedError(
+                'Weight format not supported by conversion.')
+        else:
+            # NOTE this strategy should be better than random init, but there could be other combinations of
+            # the original RGB input layer weights that'd work better for specific cases.
+            repeat = int(math.ceil(in_chans / 3))
+            conv_weight = conv_weight.repeat(1, repeat, 1,
+                                             1)[:, :in_chans, :, :]
+            conv_weight *= (3 / float(in_chans))
+    conv_weight = conv_weight.to(conv_type)
+    return conv_weight
+
+
+def checkpoint_seq(functions,
+                   x,
+                   every=1,
+                   flatten=False,
+                   skip_last=False,
+                   preserve_rng_state=True):
+    r"""A helper function for checkpointing sequential models.
+
+    Sequential models execute a list of modules/functions in order
+    (sequentially). Therefore, we can divide such a sequence into segments
+    and checkpoint each segment. All segments except run in :func:`torch.no_grad`
+    manner, i.e., not storing the intermediate activations. The inputs of each
+    checkpointed segment will be saved for re-running the segment in the backward pass.
+
+    See :func:`~torch.utils.checkpoint.checkpoint` on how checkpointing works.
+
+    .. warning::
+        Checkpointing currently only supports :func:`torch.autograd.backward`
+        and only if its `inputs` argument is not passed. :func:`torch.autograd.grad`
+        is not supported.
+
+    .. warning:
+        At least one of the inputs needs to have :code:`requires_grad=True` if
+        grads are needed for model inputs, otherwise the checkpointed part of the
+        model won't have gradients.
+
+    Args:
+        functions: A :class:`torch.nn.Sequential` or the list of modules or functions to run sequentially.
+        x: A Tensor that is input to :attr:`functions`
+        every: checkpoint every-n functions (default: 1)
+        flatten (bool): flatten nn.Sequential of nn.Sequentials
+        skip_last (bool): skip checkpointing the last function in the sequence if True
+        preserve_rng_state (bool, optional, default=True):  Omit stashing and restoring
+            the RNG state during each checkpoint.
+
+    Returns:
+        Output of running :attr:`functions` sequentially on :attr:`*inputs`
+
+    Example:
+        >>> model = nn.Sequential(...)
+        >>> input_var = checkpoint_seq(model, input_var, every=2)
+    """
+
+    def run_function(start, end, functions):
+
+        def forward(_x):
+            for j in range(start, end + 1):
+                _x = functions[j](_x)
+            return _x
+
+        return forward
+
+    if isinstance(functions, torch.nn.Sequential):
+        functions = functions.children()
+    if flatten:
+        functions = chain.from_iterable(functions)
+    if not isinstance(functions, (tuple, list)):
+        functions = tuple(functions)
+
+    num_checkpointed = len(functions)
+    if skip_last:
+        num_checkpointed -= 1
+    end = -1
+    for start in range(0, num_checkpointed, every):
+        end = min(start + every - 1, num_checkpointed - 1)
+        x = checkpoint(
+            run_function(start, end, functions),
+            x,
+            preserve_rng_state=preserve_rng_state)
+    if skip_last:
+        return run_function(end + 1, len(functions) - 1, functions)(x)
+    return x
diff --git a/modelscope/models/cv/vision_efficient_tuning/timm_vision_transformer.py b/modelscope/models/cv/vision_efficient_tuning/timm_vision_transformer.py
new file mode 100644
index 00000000..65e7090f
--- /dev/null
+++ b/modelscope/models/cv/vision_efficient_tuning/timm_vision_transformer.py
@@ -0,0 +1,755 @@
+# The implementation is adopted from timm (version: 0.6.11),
+# made publicly available under the Apache 2.0 License at
+# https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/vision_transformer.py,
+# https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/layers/mlp.py,
+# https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/layers/mlp.py,
+# https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/layers/patch_embed.py,
+# https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/layers/drop.py,
+# https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/helpers.py
+import collections.abc
+import logging
+import math
+from collections import OrderedDict
+from functools import partial
+from itertools import repeat
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import _assert
+
+from .timm_helpers import adapt_input_conv, checkpoint_seq, named_apply
+from .timm_weight_init import lecun_normal_, trunc_normal_
+
+_logger = logging.getLogger(__name__)
+
+
+def _ntuple(n):
+
+    def parse(x):
+        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
+            return x
+        return tuple(repeat(x, n))
+
+    return parse
+
+
+to_1tuple = _ntuple(1)
+to_2tuple = _ntuple(2)
+to_3tuple = _ntuple(3)
+to_4tuple = _ntuple(4)
+to_ntuple = _ntuple
+
+
+class PatchEmbed(nn.Module):
+    """ 2D Image to Patch Embedding
+    """
+
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        embed_dim=768,
+        norm_layer=None,
+        flatten=True,
+        bias=True,
+    ):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.grid_size = (img_size[0] // patch_size[0],
+                          img_size[1] // patch_size[1])
+        self.num_patches = self.grid_size[0] * self.grid_size[1]
+        self.flatten = flatten
+
+        self.proj = nn.Conv2d(
+            in_chans,
+            embed_dim,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias=bias)
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+
+    def forward(self, x):
+        B, C, H, W = x.shape
+        _assert(
+            H == self.img_size[0],
+            f"Input image height ({H}) doesn't match model ({self.img_size[0]})."
+        )
+        _assert(
+            W == self.img_size[1],
+            f"Input image width ({W}) doesn't match model ({self.img_size[1]})."
+        )
+        x = self.proj(x)
+        if self.flatten:
+            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
+        x = self.norm(x)
+        return x
+
+
+class Mlp(nn.Module):
+    """ MLP as used in Vision Transformer, MLP-Mixer and related networks
+    """
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 bias=True,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        bias = to_2tuple(bias)
+        drop_probs = to_2tuple(drop)
+
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0])
+        self.act = act_layer()
+        self.drop1 = nn.Dropout(drop_probs[0])
+        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1])
+        self.drop2 = nn.Dropout(drop_probs[1])
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop1(x)
+        x = self.fc2(x)
+        x = self.drop2(x)
+        return x
+
+
+def drop_path(x,
+              drop_prob: float = 0.,
+              training: bool = False,
+              scale_by_keep: bool = True):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
+    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
+    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
+    'survival rate' as the argument.
+
+    """
+    if drop_prob == 0. or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0], ) + (1, ) * (
+        x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0 and scale_by_keep:
+        random_tensor.div_(keep_prob)
+    return x * random_tensor
+
+
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
+    """
+
+    def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
+
+    def extra_repr(self):
+        return f'drop_prob={round(self.drop_prob,3):0.3f}'
+
+
+class Attention(nn.Module):
+
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 attn_drop=0.,
+                 proj_drop=0.):
+        super().__init__()
+        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x):
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
+                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.unbind(
+            0)  # make torchscript happy (cannot use tensor as tuple)
+
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class LayerScale(nn.Module):
+
+    def __init__(self, dim, init_values=1e-5, inplace=False):
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+    def forward(self, x):
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma
+
+
+class Block(nn.Module):
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 drop=0.,
+                 attn_drop=0.,
+                 init_values=None,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop)
+        self.ls1 = LayerScale(
+            dim, init_values=init_values) if init_values else nn.Identity()
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path1 = DropPath(
+            drop_path) if drop_path > 0. else nn.Identity()
+
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=int(dim * mlp_ratio),
+            act_layer=act_layer,
+            drop=drop)
+        self.ls2 = LayerScale(
+            dim, init_values=init_values) if init_values else nn.Identity()
+        self.drop_path2 = DropPath(
+            drop_path) if drop_path > 0. else nn.Identity()
+
+    def forward(self, x):
+        x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x))))
+        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
+        return x
+
+
+class ResPostBlock(nn.Module):
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 drop=0.,
+                 attn_drop=0.,
+                 init_values=None,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.init_values = init_values
+
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop)
+        self.norm1 = norm_layer(dim)
+        self.drop_path1 = DropPath(
+            drop_path) if drop_path > 0. else nn.Identity()
+
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=int(dim * mlp_ratio),
+            act_layer=act_layer,
+            drop=drop)
+        self.norm2 = norm_layer(dim)
+        self.drop_path2 = DropPath(
+            drop_path) if drop_path > 0. else nn.Identity()
+
+        self.init_weights()
+
+    def init_weights(self):
+        # NOTE this init overrides that base model init with specific changes for the block type
+        if self.init_values is not None:
+            nn.init.constant_(self.norm1.weight, self.init_values)
+            nn.init.constant_(self.norm2.weight, self.init_values)
+
+    def forward(self, x):
+        x = x + self.drop_path1(self.norm1(self.attn(x)))
+        x = x + self.drop_path2(self.norm2(self.mlp(x)))
+        return x
+
+
+class ParallelBlock(nn.Module):
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 num_parallel=2,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 init_values=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.num_parallel = num_parallel
+        self.attns = nn.ModuleList()
+        self.ffns = nn.ModuleList()
+        for _ in range(num_parallel):
+            self.attns.append(
+                nn.Sequential(
+                    OrderedDict([('norm', norm_layer(dim)),
+                                 ('attn',
+                                  Attention(
+                                      dim,
+                                      num_heads=num_heads,
+                                      qkv_bias=qkv_bias,
+                                      attn_drop=attn_drop,
+                                      proj_drop=drop)),
+                                 ('ls',
+                                  LayerScale(dim, init_values=init_values)
+                                  if init_values else nn.Identity()),
+                                 ('drop_path', DropPath(drop_path)
+                                  if drop_path > 0. else nn.Identity())])))
+            self.ffns.append(
+                nn.Sequential(
+                    OrderedDict([('norm', norm_layer(dim)),
+                                 ('mlp',
+                                  Mlp(dim,
+                                      hidden_features=int(dim * mlp_ratio),
+                                      act_layer=act_layer,
+                                      drop=drop)),
+                                 ('ls',
+                                  LayerScale(dim, init_values=init_values)
+                                  if init_values else nn.Identity()),
+                                 ('drop_path', DropPath(drop_path)
+                                  if drop_path > 0. else nn.Identity())])))
+
+    def _forward_jit(self, x):
+        x = x + torch.stack([attn(x) for attn in self.attns]).sum(dim=0)
+        x = x + torch.stack([ffn(x) for ffn in self.ffns]).sum(dim=0)
+        return x
+
+    @torch.jit.ignore
+    def _forward(self, x):
+        x = x + sum(attn(x) for attn in self.attns)
+        x = x + sum(ffn(x) for ffn in self.ffns)
+        return x
+
+    def forward(self, x):
+        if torch.jit.is_scripting() or torch.jit.is_tracing():
+            return self._forward_jit(x)
+        else:
+            return self._forward(x)
+
+
+class VisionTransformer(nn.Module):
+    """ Vision Transformer
+
+    A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
+        - https://arxiv.org/abs/2010.11929
+    """
+
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        num_classes=1000,
+        global_pool='token',
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.,
+        qkv_bias=True,
+        init_values=None,
+        class_token=True,
+        no_embed_class=False,
+        pre_norm=False,
+        fc_norm=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.,
+        weight_init='',
+        embed_layer=PatchEmbed,
+        norm_layer=None,
+        act_layer=None,
+        block_fn=Block,
+    ):
+        """
+        Args:
+            img_size (int, tuple): input image size
+            patch_size (int, tuple): patch size
+            in_chans (int): number of input channels
+            num_classes (int): number of classes for classification head
+            global_pool (str): type of global pooling for final sequence (default: 'token')
+            embed_dim (int): embedding dimension
+            depth (int): depth of transformer
+            num_heads (int): number of attention heads
+            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
+            qkv_bias (bool): enable bias for qkv if True
+            init_values: (float): layer-scale init values
+            class_token (bool): use class token
+            fc_norm (Optional[bool]): pre-fc norm after pool, set if global_pool == 'avg' if None (default: None)
+            drop_rate (float): dropout rate
+            attn_drop_rate (float): attention dropout rate
+            drop_path_rate (float): stochastic depth rate
+            weight_init (str): weight init scheme
+            embed_layer (nn.Module): patch embedding layer
+            norm_layer: (nn.Module): normalization layer
+            act_layer: (nn.Module): MLP activation layer
+        """
+        super().__init__()
+        assert global_pool in ('', 'avg', 'token')
+        assert class_token or global_pool != 'token'
+        use_fc_norm = global_pool == 'avg' if fc_norm is None else fc_norm
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+        act_layer = act_layer or nn.GELU
+
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.num_prefix_tokens = 1 if class_token else 0
+        self.no_embed_class = no_embed_class
+        self.grad_checkpointing = False
+
+        self.patch_embed = embed_layer(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            bias=not pre_norm,  # disable bias if pre-norm is used (e.g. CLIP)
+        )
+        num_patches = self.patch_embed.num_patches
+
+        self.cls_token = nn.Parameter(torch.zeros(
+            1, 1, embed_dim)) if class_token else None
+        embed_len = num_patches if no_embed_class else num_patches + self.num_prefix_tokens
+        self.pos_embed = nn.Parameter(
+            torch.randn(1, embed_len, embed_dim) * .02)
+        self.pos_drop = nn.Dropout(p=drop_rate)
+        self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity()
+
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
+               ]  # stochastic depth decay rule
+        self.blocks = nn.Sequential(*[
+            block_fn(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                init_values=init_values,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                act_layer=act_layer) for i in range(depth)
+        ])
+        self.norm = norm_layer(embed_dim) if not use_fc_norm else nn.Identity()
+
+        # Classifier Head
+        self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity()
+        self.head = nn.Linear(
+            self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+        if weight_init != 'skip':
+            self.init_weights(weight_init)
+
+    def init_weights(self, mode=''):
+        assert mode in ('jax', 'jax_nlhb', 'moco', '')
+        head_bias = -math.log(self.num_classes) if 'nlhb' in mode else 0.
+        trunc_normal_(self.pos_embed, std=.02)
+        if self.cls_token is not None:
+            nn.init.normal_(self.cls_token, std=1e-6)
+        named_apply(get_init_weights_vit(mode, head_bias), self)
+
+    def _init_weights(self, m):
+        # this fn left here for compat with downstream users
+        init_weights_vit_timm(m)
+
+    @torch.jit.ignore()
+    def load_pretrained(self, checkpoint_path, prefix=''):
+        _load_weights(self, checkpoint_path, prefix)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {'pos_embed', 'cls_token', 'dist_token'}
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        return dict(
+            stem=r'^cls_token|pos_embed|patch_embed',  # stem and embed
+            blocks=[(r'^blocks\.(\d+)', None), (r'^norm', (99999, ))])
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        self.grad_checkpointing = enable
+
+    @torch.jit.ignore
+    def get_classifier(self):
+        return self.head
+
+    def reset_classifier(self, num_classes: int, global_pool=None):
+        self.num_classes = num_classes
+        if global_pool is not None:
+            assert global_pool in ('', 'avg', 'token')
+            self.global_pool = global_pool
+        self.head = nn.Linear(
+            self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+    def _pos_embed(self, x):
+        if self.no_embed_class:
+            # deit-3, updated JAX (big vision)
+            # position embedding does not overlap with class token, add then concat
+            x = x + self.pos_embed
+            if self.cls_token is not None:
+                x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x),
+                              dim=1)
+        else:
+            # original timm, JAX, and deit vit impl
+            # pos_embed has entry for class token, concat then add
+            if self.cls_token is not None:
+                x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x),
+                              dim=1)
+            x = x + self.pos_embed
+        return self.pos_drop(x)
+
+    def forward_features(self, x):
+        x = self.patch_embed(x)
+        x = self._pos_embed(x)
+        x = self.norm_pre(x)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.blocks, x)
+        else:
+            x = self.blocks(x)
+        x = self.norm(x)
+        return x
+
+    def forward_head(self, x, pre_logits: bool = False):
+        if self.global_pool:
+            x = x[:, self.num_prefix_tokens:].mean(
+                dim=1) if self.global_pool == 'avg' else x[:, 0]
+        x = self.fc_norm(x)
+        return x if pre_logits else self.head(x)
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+
+
+def init_weights_vit_timm(module: nn.Module, name: str = ''):
+    """ ViT weight initialization, original timm impl (for reproducibility) """
+    if isinstance(module, nn.Linear):
+        trunc_normal_(module.weight, std=.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif hasattr(module, 'init_weights'):
+        module.init_weights()
+
+
+def init_weights_vit_jax(module: nn.Module,
+                         name: str = '',
+                         head_bias: float = 0.):
+    """ ViT weight initialization, matching JAX (Flax) impl """
+    if isinstance(module, nn.Linear):
+        if name.startswith('head'):
+            nn.init.zeros_(module.weight)
+            nn.init.constant_(module.bias, head_bias)
+        else:
+            nn.init.xavier_uniform_(module.weight)
+            if module.bias is not None:
+                nn.init.normal_(
+                    module.bias,
+                    std=1e-6) if 'mlp' in name else nn.init.zeros_(module.bias)
+    elif isinstance(module, nn.Conv2d):
+        lecun_normal_(module.weight)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif hasattr(module, 'init_weights'):
+        module.init_weights()
+
+
+def init_weights_vit_moco(module: nn.Module, name: str = ''):
+    """ ViT weight initialization, matching moco-v3 impl minus fixed PatchEmbed """
+    if isinstance(module, nn.Linear):
+        if 'qkv' in name:
+            # treat the weights of Q, K, V separately
+            val = math.sqrt(
+                6.
+                / float(module.weight.shape[0] // 3 + module.weight.shape[1]))
+            nn.init.uniform_(module.weight, -val, val)
+        else:
+            nn.init.xavier_uniform_(module.weight)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif hasattr(module, 'init_weights'):
+        module.init_weights()
+
+
+def get_init_weights_vit(mode='jax', head_bias: float = 0.):
+    if 'jax' in mode:
+        return partial(init_weights_vit_jax, head_bias=head_bias)
+    elif 'moco' in mode:
+        return init_weights_vit_moco
+    else:
+        return init_weights_vit_timm
+
+
+@torch.no_grad()
+def _load_weights(model: VisionTransformer,
+                  checkpoint_path: str,
+                  prefix: str = ''):
+    """ Load weights from .npz checkpoints for official Google Brain Flax implementation
+    """
+    import numpy as np
+
+    def _n2p(w, t=True):
+        if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
+            w = w.flatten()
+        if t:
+            if w.ndim == 4:
+                w = w.transpose([3, 2, 0, 1])
+            elif w.ndim == 3:
+                w = w.transpose([2, 0, 1])
+            elif w.ndim == 2:
+                w = w.transpose([1, 0])
+        return torch.from_numpy(w)
+
+    w = np.load(checkpoint_path)
+    if not prefix and 'opt/target/embedding/kernel' in w:
+        prefix = 'opt/target/'
+
+    if hasattr(model.patch_embed, 'backbone'):
+        # hybrid
+        backbone = model.patch_embed.backbone
+        stem_only = not hasattr(backbone, 'stem')
+        stem = backbone if stem_only else backbone.stem
+        stem.conv.weight.copy_(
+            adapt_input_conv(stem.conv.weight.shape[1],
+                             _n2p(w[f'{prefix}conv_root/kernel'])))
+        stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale']))
+        stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias']))
+        if not stem_only:
+            for i, stage in enumerate(backbone.stages):
+                for j, block in enumerate(stage.blocks):
+                    bp = f'{prefix}block{i + 1}/unit{j + 1}/'
+                    for r in range(3):
+                        getattr(block, f'conv{r + 1}').weight.copy_(
+                            _n2p(w[f'{bp}conv{r + 1}/kernel']))
+                        getattr(block, f'norm{r + 1}').weight.copy_(
+                            _n2p(w[f'{bp}gn{r + 1}/scale']))
+                        getattr(block, f'norm{r + 1}').bias.copy_(
+                            _n2p(w[f'{bp}gn{r + 1}/bias']))
+                    if block.downsample is not None:
+                        block.downsample.conv.weight.copy_(
+                            _n2p(w[f'{bp}conv_proj/kernel']))
+                        block.downsample.norm.weight.copy_(
+                            _n2p(w[f'{bp}gn_proj/scale']))
+                        block.downsample.norm.bias.copy_(
+                            _n2p(w[f'{bp}gn_proj/bias']))
+        embed_conv_w = _n2p(w[f'{prefix}embedding/kernel'])
+    else:
+        embed_conv_w = adapt_input_conv(model.patch_embed.proj.weight.shape[1],
+                                        _n2p(w[f'{prefix}embedding/kernel']))
+    model.patch_embed.proj.weight.copy_(embed_conv_w)
+    model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias']))
+    model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False))
+    pos_embed_w = _n2p(
+        w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False)
+    if pos_embed_w.shape != model.pos_embed.shape:
+        pos_embed_w = resize_pos_embed(  # resize pos embedding when different size from pretrained weights
+            pos_embed_w, model.pos_embed, getattr(model, 'num_prefix_tokens',
+                                                  1),
+            model.patch_embed.grid_size)
+    model.pos_embed.copy_(pos_embed_w)
+    model.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale']))
+    model.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias']))
+    if isinstance(
+            model.head, nn.Linear
+    ) and model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]:
+        model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
+        model.head.bias.copy_(_n2p(w[f'{prefix}head/bias']))
+    # NOTE representation layer has been removed, not used in latest 21k/1k pretrained weights
+    # if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w:
+    #     model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel']))
+    #     model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias']))
+    for i, block in enumerate(model.blocks.children()):
+        block_prefix = f'{prefix}Transformer/encoderblock_{i}/'
+        mha_prefix = block_prefix + 'MultiHeadDotProductAttention_1/'
+        block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
+        block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
+        block.attn.qkv.weight.copy_(
+            torch.cat([
+                _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T
+                for n in ('query', 'key', 'value')
+            ]))
+        block.attn.qkv.bias.copy_(
+            torch.cat([
+                _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1)
+                for n in ('query', 'key', 'value')
+            ]))
+        block.attn.proj.weight.copy_(
+            _n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
+        block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
+        for r in range(2):
+            getattr(block.mlp, f'fc{r + 1}').weight.copy_(
+                _n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/kernel']))
+            getattr(block.mlp, f'fc{r + 1}').bias.copy_(
+                _n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/bias']))
+        block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/scale']))
+        block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/bias']))
+
+
+def resize_pos_embed(posemb, posemb_new, num_prefix_tokens=1, gs_new=()):
+    # Rescale the grid of position embeddings when loading from state_dict. Adapted from
+    # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224
+    _logger.info('Resized position embedding: %s to %s', posemb.shape,
+                 posemb_new.shape)
+    ntok_new = posemb_new.shape[1]
+    if num_prefix_tokens:
+        posemb_prefix, posemb_grid = posemb[:, :num_prefix_tokens], posemb[
+            0, num_prefix_tokens:]
+        ntok_new -= num_prefix_tokens
+    else:
+        posemb_prefix, posemb_grid = posemb[:, :0], posemb[0]
+    gs_old = int(math.sqrt(len(posemb_grid)))
+    if not len(gs_new):  # backwards compatibility
+        gs_new = [int(math.sqrt(ntok_new))] * 2
+    assert len(gs_new) >= 2
+    _logger.info('Position embedding grid-size from %s to %s',
+                 [gs_old, gs_old], gs_new)
+    posemb_grid = posemb_grid.reshape(1, gs_old, gs_old,
+                                      -1).permute(0, 3, 1, 2)
+    posemb_grid = F.interpolate(
+        posemb_grid, size=gs_new, mode='bicubic', align_corners=False)
+    posemb_grid = posemb_grid.permute(0, 2, 3,
+                                      1).reshape(1, gs_new[0] * gs_new[1], -1)
+    posemb = torch.cat([posemb_prefix, posemb_grid], dim=1)
+    return posemb
diff --git a/modelscope/models/cv/vision_efficient_tuning/timm_weight_init.py b/modelscope/models/cv/vision_efficient_tuning/timm_weight_init.py
new file mode 100644
index 00000000..fb5ff63e
--- /dev/null
+++ b/modelscope/models/cv/vision_efficient_tuning/timm_weight_init.py
@@ -0,0 +1,131 @@
+# The implementation is adopted from timm (version: 0.6.11),
+# made publicly available under the Apache 2.0 License at
+# https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/layers/weight_init.py
+import math
+import warnings
+
+import torch
+from torch.nn.init import _calculate_fan_in_and_fan_out
+
+
+def _trunc_normal_(tensor, mean, std, a, b):
+    # Cut & paste from PyTorch official master until it's in a few official releases - RW
+    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    def norm_cdf(x):
+        # Computes standard normal cumulative distribution function
+        return (1. + math.erf(x / math.sqrt(2.))) / 2.
+
+    if (mean < a - 2 * std) or (mean > b + 2 * std):
+        warnings.warn(
+            'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. '
+            'The distribution of values may be incorrect.',
+            stacklevel=2)
+
+    # Values are generated by using a truncated uniform distribution and
+    # then using the inverse CDF for the normal distribution.
+    # Get upper and lower cdf values
+    l = norm_cdf((a - mean) / std)  # noqa
+    u = norm_cdf((b - mean) / std)
+
+    # Uniformly fill tensor with values from [l, u], then translate to
+    # [2l-1, 2u-1].
+    tensor.uniform_(2 * l - 1, 2 * u - 1)
+
+    # Use inverse cdf transform for normal distribution to get truncated
+    # standard normal
+    tensor.erfinv_()
+
+    # Transform to proper mean, std
+    tensor.mul_(std * math.sqrt(2.))
+    tensor.add_(mean)
+
+    # Clamp to ensure it's in the proper range
+    tensor.clamp_(min=a, max=b)
+    return tensor
+
+
+def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
+    # type: (Tensor, float, float, float, float) -> Tensor
+    r"""Fills the input Tensor with values drawn from a truncated
+    normal distribution. The values are effectively drawn from the
+    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
+    with values outside :math:`[a, b]` redrawn until they are within
+    the bounds. The method used for generating the random values works
+    best when :math:`a \leq \text{mean} \leq b`.
+
+    NOTE: this impl is similar to the PyTorch trunc_normal_, the bounds [a, b] are
+    applied while sampling the normal with mean/std applied, therefore a, b args
+    should be adjusted to match the range of mean, std args.
+
+    Args:
+        tensor: an n-dimensional `torch.Tensor`
+        mean: the mean of the normal distribution
+        std: the standard deviation of the normal distribution
+        a: the minimum cutoff value
+        b: the maximum cutoff value
+    Examples:
+        >>> w = torch.empty(3, 5)
+        >>> nn.init.trunc_normal_(w)
+    """
+    with torch.no_grad():
+        return _trunc_normal_(tensor, mean, std, a, b)
+
+
+def trunc_normal_tf_(tensor, mean=0., std=1., a=-2., b=2.):
+    # type: (Tensor, float, float, float, float) -> Tensor
+    r"""Fills the input Tensor with values drawn from a truncated
+    normal distribution. The values are effectively drawn from the
+    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
+    with values outside :math:`[a, b]` redrawn until they are within
+    the bounds. The method used for generating the random values works
+    best when :math:`a \leq \text{mean} \leq b`.
+
+    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
+    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
+    and the result is subsquently scaled and shifted by the mean and std args.
+
+    Args:
+        tensor: an n-dimensional `torch.Tensor`
+        mean: the mean of the normal distribution
+        std: the standard deviation of the normal distribution
+        a: the minimum cutoff value
+        b: the maximum cutoff value
+    Examples:
+        >>> w = torch.empty(3, 5)
+        >>> nn.init.trunc_normal_(w)
+    """
+    with torch.no_grad():
+        _trunc_normal_(tensor, 0, 1.0, a, b)
+        tensor.mul_(std).add_(mean)
+    return tensor
+
+
+def variance_scaling_(tensor, scale=1.0, mode='fan_in', distribution='normal'):
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
+    if mode == 'fan_in':
+        denom = fan_in
+    elif mode == 'fan_out':
+        denom = fan_out
+    elif mode == 'fan_avg':
+        denom = (fan_in + fan_out) / 2
+    else:
+        raise ValueError(f'invalid mode {mode}')
+
+    variance = scale / denom
+
+    if distribution == 'truncated_normal':
+        # constant is stddev of standard normal truncated to (-2, 2)
+        trunc_normal_tf_(tensor, std=math.sqrt(variance) / .87962566103423978)
+    elif distribution == 'normal':
+        with torch.no_grad():
+            tensor.normal_(std=math.sqrt(variance))
+    elif distribution == 'uniform':
+        bound = math.sqrt(3 * variance)
+        with torch.no_grad():
+            tensor.uniform_(-bound, bound)
+    else:
+        raise ValueError(f'invalid distribution {distribution}')
+
+
+def lecun_normal_(tensor):
+    variance_scaling_(tensor, mode='fan_in', distribution='truncated_normal')
diff --git a/modelscope/models/cv/vision_efficient_tuning/vision_efficient_tuning.py b/modelscope/models/cv/vision_efficient_tuning/vision_efficient_tuning.py
new file mode 100644
index 00000000..629e7fac
--- /dev/null
+++ b/modelscope/models/cv/vision_efficient_tuning/vision_efficient_tuning.py
@@ -0,0 +1,65 @@
+# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import os
+
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+
+
+@MODELS.register_module(
+    Tasks.vision_efficient_tuning, module_name=Models.vision_efficient_tuning)
+class VisionEfficientTuningModel(TorchModel):
+    """ The implementation of vision efficient tuning.
+
+    This model is constructed with the following parts:
+        - 'backbone': pre-trained backbone model with parameters.
+        - 'head': classification head with fine-tuning.
+    """
+
+    def __init__(self, model_dir: str, **kwargs):
+        """ Initialize a vision efficient tuning model.
+
+        Args:
+          model_dir: model id or path, where model_dir/pytorch_model.pt contains:
+                    - 'backbone_cfg': config of backbone.
+                    - 'backbone_weight': parameters of backbone.
+                    - 'head_cfg': config of head.
+                    - 'head_weight': parameters of head.
+                    - 'CLASSES': list of label name.
+        """
+
+        from .backbone import VisionTransformerPETL
+        from .head import ClassifierHead
+        super().__init__(model_dir)
+
+        model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+        model_dict = torch.load(model_path)
+
+        backbone_cfg = model_dict['backbone_cfg']
+        if 'type' in backbone_cfg:
+            backbone_cfg.pop('type')
+        self.backbone_model = VisionTransformerPETL(**backbone_cfg)
+        self.backbone_model.load_state_dict(
+            model_dict['backbone_weight'], strict=True)
+
+        head_cfg = model_dict['head_cfg']
+        if 'type' in head_cfg:
+            head_cfg.pop('type')
+        self.head_model = ClassifierHead(**head_cfg)
+        self.head_model.load_state_dict(model_dict['head_weight'], strict=True)
+
+        self.CLASSES = model_dict['CLASSES']
+
+    def forward(self, inputs):
+        """ Dynamic forward function of vision efficient tuning.
+
+        Args:
+          inputs: the input images (B, 3, H, W).
+        """
+
+        backbone_output = self.backbone_model(inputs)
+        head_output = self.head_model(backbone_output)
+        return head_output
diff --git a/modelscope/models/cv/vision_middleware/model.py b/modelscope/models/cv/vision_middleware/model.py
index 91f4a7b7..cb7bde00 100644
--- a/modelscope/models/cv/vision_middleware/model.py
+++ b/modelscope/models/cv/vision_middleware/model.py
@@ -21,23 +21,26 @@ from .head import FPNSegmentor, LinearClassifier
     Tasks.image_segmentation, module_name=Models.vision_middleware)
 class VisionMiddlewareModel(TorchModel):
     """
-        The implementation of 'ViM: Vision Middleware for Unified Downstream Transferring'.
+    The implementation of 'ViM: Vision Middleware for Unified Downstream Transferring'.
         This model is dynamically initialized with the following parts:
-            - backbone: the upstream pre-trained backbone model (CLIP in this code)
-            - ViM: the zoo of middlestream trained ViM modules
-            - ViM-aggregation: the specific aggregation weights for downstream tasks
+
+        - backbone: the upstream pre-trained backbone model (CLIP in this code)
+        - ViM: the zoo of middlestream trained ViM modules
+        - ViM-aggregation: the specific aggregation weights for downstream tasks
     """
 
     def __init__(self, model_dir: str, *args, **kwargs):
         """
-            Initialize a ViM-based Model
-            Args:
-                model_dir: model id or path,
-                where model_dir/pytorch_model.pt contains:
-                    'meta_info': basic information of ViM, e.g. task_list
-                    'backbone_weights': parameters of backbone [upstream]
-                    'ViM_weights': parameters of ViM [midstream]
-                    'ViM_agg_weights': parameters of ViM-aggregation [downstream]
+        Initialize a ViM-based Model.
+
+        Args:
+            model_dir: model id or path, where model_dir/pytorch_model.pt contains:
+
+                - 'meta_info': basic information of ViM, e.g. task_list
+                - 'backbone_weights': parameters of backbone [upstream]
+                - 'ViM_weights': parameters of ViM [midstream]
+                - 'ViM_agg_weights': parameters of ViM-aggregation [downstream]
+
         """
         super(VisionMiddlewareModel, self).__init__()
 
@@ -106,10 +109,11 @@ class VisionMiddlewareModel(TorchModel):
 
     def forward(self, inputs, task_name):
         """
-            Dynamic Forward Function of ViM
-            Args:
-                x: the input images (B, 3, H, W)
-                task_name: specified task for forwarding
+        Dynamic Forward Function of ViM.
+
+        Args:
+            x: the input images (B, 3, H, W)
+            task_name: specified task for forwarding
         """
         if task_name not in self.task_list:
             raise NotImplementedError(
@@ -122,11 +126,12 @@ class VisionMiddlewareModel(TorchModel):
 
     def postprocess(self, outputs, inputs, task_name):
         """
-            Post-process of ViM, based on task_name
-            Args:
-                inputs: batched input image (B, 3, H, W)
-                outputs: batched output (format based on task_name)
-                task_name: str, task name
+        Post-process of ViM, based on task_name.
+
+        Args:
+            inputs: batched input image (B, 3, H, W)
+            outputs: batched output (format based on task_name)
+            task_name (str): task name
         """
 
         _, in_channels, img_height, img_width = inputs.size()
@@ -163,6 +168,6 @@ class VisionMiddlewareModel(TorchModel):
 
     def get_tasks(self):
         """
-            Get the supported tasks of current ViM model
+        Get the supported tasks of current ViM model.
         """
         return self.task_list
diff --git a/modelscope/models/multi_modal/clip/modeling_bert.py b/modelscope/models/multi_modal/clip/modeling_bert.py
index b5f104ce..11c5c833 100644
--- a/modelscope/models/multi_modal/clip/modeling_bert.py
+++ b/modelscope/models/multi_modal/clip/modeling_bert.py
@@ -425,13 +425,12 @@ class BertModel(BertPreTrainedModel):
             Attentions weights after the attention softmax,
             used to compute the weighted average in the self-attention heads.
 
-    Examples::
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertModel.from_pretrained('bert-base-uncased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+    Examples:
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = BertModel.from_pretrained('bert-base-uncased')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
 
diff --git a/modelscope/models/multi_modal/diffusion/structbert.py b/modelscope/models/multi_modal/diffusion/structbert.py
index 16c1407f..0ca57fc4 100644
--- a/modelscope/models/multi_modal/diffusion/structbert.py
+++ b/modelscope/models/multi_modal/diffusion/structbert.py
@@ -751,19 +751,17 @@ class BERTPooler(nn.Module):
 class BertModel(nn.Module):
     """BERT model ("Bidirectional Embedding Representations from a Transformer").
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])
+    Example:
+        >>> # Already been converted into WordPiece token ids
+        >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+        >>> input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+        >>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])
 
-    config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
-        num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
+        >>> config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
+        >>>     num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
 
-    model = modeling.BertModel(config=config)
-    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-    ```
+        >>> model = modeling.BertModel(config=config)
+        >>> all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     """
 
     def __init__(self, config: BertConfig):
@@ -846,21 +844,19 @@ class BertForSequenceClassificationMultiTask(nn.Module):
     This module is composed of the BERT model with a linear layer on top of
     the pooled output.
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])
+    Example:
+        >>> # Already been converted into WordPiece token ids
+        >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+        >>> input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+        >>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])
 
-    config = BertConfig(vocab_size=32000, hidden_size=512,
-        num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
+        >>> config = BertConfig(vocab_size=32000, hidden_size=512,
+        >>>     num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
 
-    num_labels = 2
+        >>> num_labels = 2
 
-    model = BertForSequenceClassification(config, num_labels)
-    logits = model(input_ids, token_type_ids, input_mask)
-    ```
+        >>> model = BertForSequenceClassification(config, num_labels)
+        >>> logits = model(input_ids, token_type_ids, input_mask)
     """
 
     def __init__(self, config, label_list, core_encoder):
diff --git a/modelscope/models/multi_modal/diffusion/tokenizer.py b/modelscope/models/multi_modal/diffusion/tokenizer.py
index e2c951b1..918498cd 100644
--- a/modelscope/models/multi_modal/diffusion/tokenizer.py
+++ b/modelscope/models/multi_modal/diffusion/tokenizer.py
@@ -246,8 +246,8 @@ class WordpieceTokenizer(object):
         using the given vocabulary.
 
         For example:
-          input = "unaffable"
-          output = ["un", "##aff", "##able"]
+          >>> input = "unaffable"
+          >>> output = ["un", "##aff", "##able"]
 
         Args:
           text: A single token or whitespace separated tokens. This should have
diff --git a/modelscope/models/multi_modal/mgeo/backbone.py b/modelscope/models/multi_modal/mgeo/backbone.py
index 863a1163..bed46af0 100644
--- a/modelscope/models/multi_modal/mgeo/backbone.py
+++ b/modelscope/models/multi_modal/mgeo/backbone.py
@@ -1350,7 +1350,7 @@ class BertForPreTraining(BertPreTrainedModel):
         kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
             Used to hide legacy arguments that have been deprecated.
         Returns:
-        Example::
+        Example:
             >>> from transformers import BertTokenizer, BertForPreTraining
             >>> import torch
             >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
@@ -1483,7 +1483,10 @@ class BertLMHeadModel(BertPreTrainedModel):
             If set to :obj:`True`, :obj:`past_key_values` key value states are
             returned and can be used to speed up decoding (see
             :obj:`past_key_values`).
-        Returns: Example::
+
+        Returns:
+
+        Example:
             >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
             >>> import torch
             >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
@@ -1943,7 +1946,8 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
             - 0 indicates sequence B is a continuation of sequence A,
             - 1 indicates sequence B is a random sequence.
         Returns:
-        Example::
+
+        Example:
             >>> from transformers import BertTokenizer, BertForNextSentencePrediction
             >>> import torch
             >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
diff --git a/modelscope/models/multi_modal/mplug/modeling_mplug.py b/modelscope/models/multi_modal/mplug/modeling_mplug.py
index 98edd898..5ebf4704 100755
--- a/modelscope/models/multi_modal/mplug/modeling_mplug.py
+++ b/modelscope/models/multi_modal/mplug/modeling_mplug.py
@@ -1620,7 +1620,8 @@ class BertLMHeadModel(BertPreTrainedModel):
             If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
             decoding (see :obj:`past_key_values`).
         Returns:
-        Example::
+
+        Example:
             >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
             >>> import torch
             >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py
index 3135b2b2..6bdd5154 100644
--- a/modelscope/models/multi_modal/ofa_for_all_tasks.py
+++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py
@@ -43,9 +43,9 @@ __all__ = ['OfaForAllTasks']
 class OfaForAllTasks(TorchModel):
     r"""
     All ofa tasks using uniform ofa model structure. So far, we support three types of tasks:
-    1. text generation tasks: ocr_recognition, image_captioning and text_summarization
-    2. visual grounding tasks: visual grounding
-    3. classification tasks: text classification and image classification.
+        1. text generation tasks: ocr_recognition, image_captioning and text_summarization
+        2. visual grounding tasks: visual grounding
+        3. classification tasks: text classification and image classification.
 
     Attributes:
         cfg: Task configs exclude model configs, such as generator's config.
@@ -235,12 +235,13 @@ class OfaForAllTasks(TorchModel):
     def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]:
         r"""
         Do post processing after task's forward function is executed. So far, we have three strategies while do post
-        processing.
-        1. If the task is image captioning and using English language, some special words will be removed, such as
-           `!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~`
-        2. If the task is not visual grounding, but a generation task using Chinese language, we will remove the blank
-            after/before the words except ` a-zA-Z0-9.,:!?`
-        3. Other cases will return the input as result.
+            processing.
+
+            1. If the task is image captioning and using English language, some special words will be removed, such as
+               `!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~`
+            2. If the task is not visual grounding, but a generation task using Chinese language, we will remove the
+                blank after/before the words except ` a-zA-Z0-9.,:!?`
+            3. Other cases will return the input as result.
 
         Args:
             input (`Dict[Str, Any]`):
diff --git a/modelscope/models/multi_modal/vldoc/transformer_local.py b/modelscope/models/multi_modal/vldoc/transformer_local.py
index 4c0dd55d..e819d4a4 100644
--- a/modelscope/models/multi_modal/vldoc/transformer_local.py
+++ b/modelscope/models/multi_modal/vldoc/transformer_local.py
@@ -18,7 +18,7 @@ class TransformerDecoder(Module):
         num_layers: the number of sub-decoder-layers in the decoder (required).
         norm: the layer normalization component (optional).
 
-    Examples::
+    Examples:
         >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8)
         >>> transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
         >>> memory = torch.rand(10, 32, 512)
diff --git a/modelscope/models/nlp/T5/backbone.py b/modelscope/models/nlp/T5/backbone.py
index 9b405449..7fa97308 100644
--- a/modelscope/models/nlp/T5/backbone.py
+++ b/modelscope/models/nlp/T5/backbone.py
@@ -870,38 +870,40 @@ class T5Stack(T5PreTrainedModel):
 
     def parallelize(self, device_map=None):
         r"""
-            This is an experimental feature and is a subject to change at a
-            moment's notice.
+        This is an experimental feature and is a subject to change at a
+        moment's notice.
 
-            Uses a device map to distribute attention modules of the model
-            across several devices. If no device map is given, it will evenly
-            distribute blocks across all devices.
+        Uses a device map to distribute attention modules of the model
+        across several devices. If no device map is given, it will evenly
+        distribute blocks across all devices.
 
-            Args:
-                device_map (`Dict[int, list]`, optional, defaults to None):
-                    A dictionary that maps attention modules to devices. Note
-                    that the embedding module and LMHead are always
-                    automatically mapped to the first device (for esoteric
-                    reasons). That means that the first device should have fewer
-                    attention modules mapped to it than other devices. For
-                    reference, the t5 models have the following number of
-                    attention modules:
+        Args:
+            device_map (`Dict[int, list]`, optional, defaults to None):
+                A dictionary that maps attention modules to devices. Note
+                that the embedding module and LMHead are always
+                automatically mapped to the first device (for esoteric
+                reasons). That means that the first device should have fewer
+                attention modules mapped to it than other devices. For
+                reference, the t5 models have the following number of
+                attention modules:
 
-                        - t5-small: 6
-                        - t5-base: 12
-                        - t5-large: 24
-                        - t5-3b: 24
-                        - t5-11b: 24
+                    - t5-small: 6
+                    - t5-base: 12
+                    - t5-large: 24
+                    - t5-3b: 24
+                    - t5-11b: 24
 
-            Example:
+        Example:
 
-            ```python # Here is an example of a device map on a machine with 4
-            GPUs # using t5-3b, which has a total of 24 attention modules: model
-            = T5ForConditionalGeneration.from_pretrained("t5-3b") device_map = {
-                0: [0, 1, 2], 1: [3, 4, 5, 6, 7, 8, 9], 2: [10, 11, 12, 13, 14,
-                15, 16], 3: [17, 18, 19, 20, 21, 22, 23],
-            } model.parallelize(device_map) ``` all of the parallelize methods
-            in this file are the same
+        >>> # Here is an example of a device map on a machine with 4 GPUs
+        >>> # using t5-3b, which has a total of 24 attention modules:
+        >>> model = T5ForConditionalGeneration.from_pretrained("t5-3b")
+        >>> device_map = {
+        >>>     0: [0, 1, 2], 1: [3, 4, 5, 6, 7, 8, 9], 2: [10, 11, 12, 13, 14,
+        >>>     15, 16], 3: [17, 18, 19, 20, 21, 22, 23],
+        >>> }
+        >>> model.parallelize(device_map)
+        >>> # all of the parallelize methods in this file are the same
 
         """
         # Check validity of device_map
@@ -926,19 +928,21 @@ class T5Stack(T5PreTrainedModel):
 
     def deparallelize(self):
         r"""
-            Moves the model to cpu from a model parallel state.
+        Moves the model to cpu from a model parallel state.
 
-            Example:
+        Example:
 
-            ```python # On a 4 GPU machine with t5-3b: model =
-            T5ForConditionalGeneration.from_pretrained("t5-3b") device_map = {
-                0: [0, 1, 2], 1: [3, 4, 5, 6, 7, 8, 9], 2: [10, 11, 12, 13, 14,
-                15, 16], 3: [17, 18, 19, 20, 21, 22, 23],
-            } model.parallelize(device_map)  # Splits the model across several
-            devices model.deparallelize()  # Put the model back on cpu and
-            cleans memory by calling torch.cuda.empty_cache() ```
-
-            all of the deparallelize methods in this file are the same
+        >>> # On a 4 GPU machine with t5-3b:
+        >>> model = T5ForConditionalGeneration.from_pretrained("t5-3b")
+        >>> device_map = {
+        >>>     0: [0, 1, 2], 1: [3, 4, 5, 6, 7, 8, 9], 2: [10, 11, 12, 13, 14,
+        >>>     15, 16], 3: [17, 18, 19, 20, 21, 22, 23],
+        >>> }
+        >>> model.parallelize(device_map)
+        >>> # Splits the model across several devices model.deparallelize()
+        >>> # Put the model back on cpu and
+        >>> # cleans memory by calling torch.cuda.empty_cache()
+        >>> # all of the deparallelize methods in this file are the same
         """
         self.model_parallel = False
         self.device_map = None
@@ -1439,7 +1443,7 @@ class T5Model(T5PreTrainedModel):
 
         Example:
 
-        ```python >>> from transformers import T5Tokenizer, T5Model
+        >>> from transformers import T5Tokenizer, T5Model
 
         >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
         >>> model = T5Model.from_pretrained("t5-small")
@@ -1452,7 +1456,7 @@ class T5Model(T5PreTrainedModel):
         >>> # forward pass
         >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
         >>> last_hidden_states = outputs.last_hidden_state
-        ```"""
+        """
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
diff --git a/modelscope/models/nlp/T5/text2text_generation.py b/modelscope/models/nlp/T5/text2text_generation.py
index bead9e25..4c6fd295 100644
--- a/modelscope/models/nlp/T5/text2text_generation.py
+++ b/modelscope/models/nlp/T5/text2text_generation.py
@@ -145,133 +145,127 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
                 **kwargs) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
         r"""
         Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary. T5 is a model
-            with relative position embeddings so you should be able to pad the
-            inputs on both the right and the left.
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. T5 is a model
+                with relative position embeddings so you should be able to pad the
+                inputs on both the right and the left.
 
-            Indices can be obtained using [`T5Tokenizer`]. See
-            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
-            for detail.
+                Indices can be obtained using [`T5Tokenizer`]. See
+                [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
+                for detail.
 
-            [What are input IDs?](../glossary#input-ids)
+                [What are input IDs?](../glossary#input-ids)
 
-            To know more on how to prepare `input_ids` for pretraining take a
-            look a [T5 Training](./t5#training).
-        attention_mask (`torch.FloatTensor` of shape `(batch_size,
-        sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask
-            values selected in `[0, 1]`:
+                To know more on how to prepare `input_ids` for pretraining take a
+                look a [T5 Training](./t5#training).
+            attention_mask (`torch.FloatTensor` of shape `(batch_size,sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask
+                values selected in `[0, 1]`:
 
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
 
-            [What are attention masks?](../glossary#attention-mask)
-        decoder_input_ids (`torch.LongTensor` of shape `(batch_size,
-        target_sequence_length)`, *optional*):
-            Indices of decoder input sequence tokens in the vocabulary.
+                [What are attention masks?](../glossary#attention-mask)
+            decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+                Indices of decoder input sequence tokens in the vocabulary.
 
-            Indices can be obtained using [`T5Tokenizer`]. See
-            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
-            for details.
+                Indices can be obtained using [`T5Tokenizer`]. See
+                [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
+                for details.
 
-            [What are decoder input IDs?](../glossary#decoder-input-ids)
+                [What are decoder input IDs?](../glossary#decoder-input-ids)
 
-            T5 uses the `pad_token_id` as the starting token for
-            `decoder_input_ids` generation. If `past_key_values` is used,
-            optionally only the last `decoder_input_ids` have to be input (see
-            `past_key_values`).
+                T5 uses the `pad_token_id` as the starting token for
+                `decoder_input_ids` generation. If `past_key_values` is used,
+                optionally only the last `decoder_input_ids` have to be input (see
+                `past_key_values`).
 
-            To know more on how to prepare `decoder_input_ids` for pretraining
-            take a look at [T5 Training](./t5#training).
-        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,
-        target_sequence_length)`, *optional*):
-            Default behavior: generate a tensor that ignores pad tokens in
-            `decoder_input_ids`. Causal mask will also be used by default.
-        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers,
-        num_heads)`, *optional*):
-            Mask to nullify selected heads of the self-attention modules in the
-            encoder. Mask values selected in `[0, 1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or
-        `(num_layers, num_heads)`, *optional*):
-            Mask to nullify selected heads of the self-attention modules in the
-            decoder. Mask values selected in `[0, 1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or
-        `(num_layers, num_heads)`, *optional*):
-                Mask to nullify selected heads of the cross-attention modules in
-                the decoder. Mask values selected in `[0, 1]`:
+                To know more on how to prepare `decoder_input_ids` for pretraining
+                take a look at [T5 Training](./t5#training).
+            decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+                Default behavior: generate a tensor that ignores pad tokens in
+                `decoder_input_ids`. Causal mask will also be used by default.
+            head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+                Mask to nullify selected heads of the self-attention modules in the
+                encoder. Mask values selected in `[0, 1]`:
 
                 - 1 indicates the head is **not masked**,
                 - 0 indicates the head is **masked**.
 
-        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
-            Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*,
-            `optional`: *attentions*) `last_hidden_state` of shape `(batch_size,
-            sequence_length, hidden_size)` is a sequence of hidden states at the
-            output of the last layer of the encoder. Used in the cross-attention
-            of the decoder.
-        past_key_values (`tuple(tuple(torch.FloatTensor))` of length
-        `config.n_layers` with each tuple having 4 tensors of shape
-        `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
-            Contains precomputed key and value hidden states of the attention
-            blocks. Can be used to speed up decoding.
+            decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or
+                `(num_layers, num_heads)`, *optional*):
+                Mask to nullify selected heads of the self-attention modules in the
+                decoder. Mask values selected in `[0, 1]`:
 
-            If `past_key_values` are used, the user can optionally input only
-            the last `decoder_input_ids` (those that don't have their past key
-            value states given to this model) of shape `(batch_size, 1)` instead
-            of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size,
-        sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to
-            directly pass an embedded representation. This is useful if you want
-            more control over how to convert `input_ids` indices into associated
-            vectors than the model's internal embedding lookup matrix.
-        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size,
-        target_sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `decoder_input_ids` you can choose to
-            directly pass an embedded representation. If `past_key_values` is
-            used, optionally only the last `decoder_inputs_embeds` have to be
-            input (see `past_key_values`). This is useful if you want more
-            control over how to convert `decoder_input_ids` indices into
-            associated vectors than the model's internal embedding lookup
-            matrix.
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
 
-            If `decoder_input_ids` and `decoder_inputs_embeds` are both unset,
-            `decoder_inputs_embeds` takes the value of `inputs_embeds`.
+            cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+                    Mask to nullify selected heads of the cross-attention modules in
+                    the decoder. Mask values selected in `[0, 1]`:
 
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned
-            and can be used to speed up decoding (see `past_key_values`).
+                    - 1 indicates the head is **not masked**,
+                    - 0 indicates the head is **masked**.
 
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention
-            layers. See `attentions` under returned tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See
-            `hidden_states` under returned tensors for more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain
-            tuple.
-        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. All
-            labels set to `-100` are ignored (masked), the loss is only computed
-            for labels in `[0, ..., config.vocab_size]`
+            encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
+                Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*,
+                `optional`: *attentions*) `last_hidden_state` of shape `(batch_size,
+                sequence_length, hidden_size)` is a sequence of hidden states at the
+                output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            past_key_values (`tuple(tuple(torch.FloatTensor))` of length
+                `config.n_layers` with each tuple having 4 tensors of shape
+                `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+
+                Contains precomputed key and value hidden states of the attention
+                blocks. Can be used to speed up decoding.
+
+                If `past_key_values` are used, the user can optionally input only
+                the last `decoder_input_ids` (those that don't have their past key
+                value states given to this model) of shape `(batch_size, 1)` instead
+                of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to
+                directly pass an embedded representation. This is useful if you want
+                more control over how to convert `input_ids` indices into associated
+                vectors than the model's internal embedding lookup matrix.
+            decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`,
+                *optional*):
+                Optionally, instead of passing `decoder_input_ids` you can choose to
+                directly pass an embedded representation. If `past_key_values` is
+                used, optionally only the last `decoder_inputs_embeds` have to be
+                input (see `past_key_values`). This is useful if you want more
+                control over how to convert `decoder_input_ids` indices into
+                associated vectors than the model's internal embedding lookup
+                matrix.
+
+                If `decoder_input_ids` and `decoder_inputs_embeds` are both unset,
+                `decoder_inputs_embeds` takes the value of `inputs_embeds`.
+
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned
+                and can be used to speed up decoding (see `past_key_values`).
+
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention
+                layers. See `attentions` under returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See
+                `hidden_states` under returned tensors for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain
+                tuple.
+            labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                Labels for computing the sequence classification/regression loss.
+                Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. All
+                labels set to `-100` are ignored (masked), the loss is only computed
+                for labels in `[0, ..., config.vocab_size]`
 
         Returns:
 
         Examples:
 
-        ```python >>> from transformers import T5Tokenizer,
-        T5ForConditionalGeneration
+        >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
 
         >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
         >>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
@@ -290,7 +284,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
         >>> outputs = model.generate(input_ids)
         >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
         >>> # studies have shown that owning a dog is good for you.
-        ```"""
+        """
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index 0c72a4a0..66a53a00 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -14,20 +14,31 @@ if TYPE_CHECKING:
         BertForDocumentSegmentation,
         BertModel,
         BertConfig,
+        SiameseUieModel,
     )
+    from .bloom import BloomModel
+    from .codegeex import CodeGeeXForCodeTranslation, CodeGeeXForCodeGeneration
     from .csanmt import CsanmtForTranslation
     from .deberta_v2 import DebertaV2ForMaskedLM, DebertaV2Model
     from .gpt_neo import GPTNeoModel
     from .gpt2 import GPT2Model
     from .gpt3 import GPT3ForTextGeneration, DistributedGPT3
     from .gpt_moe import GPTMoEForTextGeneration, DistributedGPTMoE
-    from .heads import SequenceClassificationHead
+    from .heads import TextClassificationHead
+    from .hf_transformers import TransformersModel
+    from .lstm import (
+        LSTMModel,
+        LSTMForTokenClassificationWithCRF,
+    )
     from .megatron_bert import (
         MegatronBertConfig,
         MegatronBertForMaskedLM,
         MegatronBertModel,
     )
+    from .mglm import MGLMForTextSummarization
     from .palm_v2 import PalmForTextGeneration
+    from .plug_mental import (PlugMentalConfig, PlugMentalModel,
+                              PlugMentalForSequenceClassification)
     from .ponet import PoNetForMaskedLM, PoNetModel, PoNetConfig
     from .space import SpaceForDialogIntent, SpaceForDialogModeling, SpaceForDST
     from .space_T_cn import TableQuestionAnswering
@@ -40,36 +51,51 @@ if TYPE_CHECKING:
         SbertModel,
     )
     from .T5 import T5ForConditionalGeneration
-    from .mglm import MGLMForTextSummarization
-    from .codegeex import CodeGeeXForCodeTranslation, CodeGeeXForCodeGeneration
     from .task_models import (
-        FeatureExtractionModel,
-        InformationExtractionModel,
-        LSTMCRFForNamedEntityRecognition,
-        LSTMCRFForWordSegmentation,
-        LSTMCRFForPartOfSpeech,
-        SequenceClassificationModel,
+        ModelForFeatureExtraction,
+        ModelForInformationExtraction,
+        ModelForTextClassification,
         SingleBackboneTaskModelBase,
-        TaskModelForTextGeneration,
-        TokenClassificationModel,
-        TransformerCRFForNamedEntityRecognition,
-        TransformerCRFForWordSegmentation,
+        ModelForTextGeneration,
+        ModelForTextRanking,
+        ModelForTokenClassification,
+        ModelForTokenClassificationWithCRF,
     )
+    from .unite import UniTEForTranslationEvaluation
+    from .use import UserSatisfactionEstimation
     from .veco import (VecoConfig, VecoForMaskedLM,
                        VecoForSequenceClassification,
                        VecoForTokenClassification, VecoModel)
-    from .bloom import BloomModel
-    from .unite import UniTEModel
-    from .use import UserSatisfactionEstimation
+    from .dgds import (DocumentGroundedDialogGenerateModel,
+                       DocumentGroundedDialogRetrievalModel,
+                       DocumentGroundedDialogRerankModel)
+    from .xlm_roberta import XLMRobertaConfig, XLMRobertaModel
+
 else:
     _import_structure = {
-        'backbones': ['SbertModel'],
         'bart': ['BartForTextErrorCorrection'],
+        'bert': [
+            'BertForMaskedLM',
+            'BertForTextRanking',
+            'BertForSentenceEmbedding',
+            'BertForSequenceClassification',
+            'BertForTokenClassification',
+            'BertForDocumentSegmentation',
+            'BertModel',
+            'BertConfig',
+            'SiameseUieModel',
+        ],
+        'bloom': ['BloomModel'],
         'csanmt': ['CsanmtForTranslation'],
-        'heads': ['SequenceClassificationHead'],
+        'codegeex':
+        ['CodeGeeXForCodeTranslation', 'CodeGeeXForCodeGeneration'],
+        'deberta_v2': ['DebertaV2ForMaskedLM', 'DebertaV2Model'],
+        'heads': ['TextClassificationHead'],
+        'hf_transformers': ['TransformersModel'],
         'gpt2': ['GPT2Model'],
         'gpt3': ['GPT3ForTextGeneration', 'DistributedGPT3'],
         'gpt_moe': ['GPTMoEForTextGeneration', 'DistributedGPTMoE'],
+        'gpt_neo': ['GPTNeoModel'],
         'structbert': [
             'SbertForFaqQuestionAnswering',
             'SbertForMaskedLM',
@@ -84,50 +110,47 @@ else:
             'VecoForTokenClassification',
             'VecoModel',
         ],
-        'bert': [
-            'BertForMaskedLM',
-            'BertForTextRanking',
-            'BertForSentenceEmbedding',
-            'BertForSequenceClassification',
-            'BertForTokenClassification',
-            'BertForDocumentSegmentation',
-            'BertModel',
-            'BertConfig',
+        'lstm': [
+            'LSTM',
+            'LSTMForTokenClassificationWithCRF',
         ],
         'megatron_bert': [
             'MegatronBertConfig',
             'MegatronBertForMaskedLM',
             'MegatronBertModel',
         ],
-        'ponet': ['PoNetForMaskedLM', 'PoNetModel', 'PoNetConfig'],
+        'mglm': ['MGLMForTextSummarization'],
         'palm_v2': ['PalmForTextGeneration'],
-        'deberta_v2': ['DebertaV2ForMaskedLM', 'DebertaV2Model'],
+        'plug_mental': [
+            'PlugMentalConfig',
+            'PlugMentalModel',
+            'PlugMentalForSequenceClassification',
+        ],
+        'ponet': ['PoNetForMaskedLM', 'PoNetModel', 'PoNetConfig'],
         'space_T_en': ['StarForTextToSql'],
         'space_T_cn': ['TableQuestionAnswering'],
         'space':
         ['SpaceForDialogIntent', 'SpaceForDialogModeling', 'SpaceForDST'],
         'task_models': [
-            'FeatureExtractionModel',
-            'InformationExtractionModel',
-            'LSTMCRFForNamedEntityRecognition',
-            'LSTMCRFForWordSegmentation',
-            'LSTMCRFForPartOfSpeech',
-            'SequenceClassificationModel',
+            'ModelForFeatureExtraction',
+            'ModelForInformationExtraction',
+            'ModelForTextClassification',
             'SingleBackboneTaskModelBase',
-            'TaskModelForTextGeneration',
-            'TokenClassificationModel',
-            'TransformerCRFForNamedEntityRecognition',
-            'TransformerCRFForWordSegmentation',
+            'ModelForTextGeneration',
+            'ModelForTextRanking',
+            'ModelForTokenClassification',
+            'ModelForTokenClassificationWithCRF',
         ],
         'sentence_embedding': ['SentenceEmbedding'],
         'T5': ['T5ForConditionalGeneration'],
-        'mglm': ['MGLMForTextSummarization'],
-        'codegeex':
-        ['CodeGeeXForCodeTranslation', 'CodeGeeXForCodeGeneration'],
-        'gpt_neo': ['GPTNeoModel'],
-        'bloom': ['BloomModel'],
-        'unite': ['UniTEModel'],
-        'use': ['UserSatisfactionEstimation']
+        'unite': ['UniTEForTranslationEvaluation'],
+        'use': ['UserSatisfactionEstimation'],
+        'dgds': [
+            'DocumentGroundedDialogGenerateModel',
+            'DocumentGroundedDialogRetrievalModel',
+            'DocumentGroundedDialogRerankModel'
+        ],
+        'xlm_roberta': ['XLMRobertaConfig', 'XLMRobertaModel'],
     }
 
     import sys
diff --git a/modelscope/models/nlp/bart/text_error_correction.py b/modelscope/models/nlp/bart/text_error_correction.py
index 9ff619f1..97c3a7a9 100644
--- a/modelscope/models/nlp/bart/text_error_correction.py
+++ b/modelscope/models/nlp/bart/text_error_correction.py
@@ -60,21 +60,14 @@ class BartForTextErrorCorrection(TorchModel):
         """return the result by the model
 
         Args:
-            input (Dict[str, Tensor]): the preprocessed data
-            Example:
-                1 sent:
-                {'net_input':
-                    {'src_tokens':tensor([2478,242,24,4]),
-                    'src_lengths': tensor([4])}
-                }
+            input (Dict[str, Tensor]): the preprocessed data which contains following:
+                - src_tokens: tensor with shape (2478,242,24,4),
+                - src_lengths: tensor with shape (4)
 
 
         Returns:
-            Dict[str, Tensor]: results
-                Example:
-                    {
-                        'predictions': Tensor([1377, 4959, 2785, 6392...]), # tokens need to be decode by tokenizer
-                    }
+            Dict[str, Tensor]: results which contains following:
+                - predictions: tokens need to be decode by tokenizer with shape [1377, 4959, 2785, 6392...]
         """
         import fairseq.utils
 
diff --git a/modelscope/models/nlp/bert/__init__.py b/modelscope/models/nlp/bert/__init__.py
index 28a10f57..6578a0d7 100644
--- a/modelscope/models/nlp/bert/__init__.py
+++ b/modelscope/models/nlp/bert/__init__.py
@@ -16,6 +16,7 @@ if TYPE_CHECKING:
     from .text_classification import BertForSequenceClassification
     from .token_classification import BertForTokenClassification
     from .document_segmentation import BertForDocumentSegmentation
+    from .siamese_uie import SiameseUieModel
 else:
     _import_structure = {
         'backbone': [
@@ -29,6 +30,7 @@ else:
         'text_classification': ['BertForSequenceClassification'],
         'token_classification': ['BertForTokenClassification'],
         'document_segmentation': ['BertForDocumentSegmentation'],
+        'siamese_uie': ['SiameseUieModel'],
     }
 
     import sys
diff --git a/modelscope/models/nlp/bert/backbone.py b/modelscope/models/nlp/bert/backbone.py
index 82c576d0..827a00d8 100755
--- a/modelscope/models/nlp/bert/backbone.py
+++ b/modelscope/models/nlp/bert/backbone.py
@@ -734,88 +734,88 @@ class BertModel(BertPreTrainedModel):
                 **kwargs) -> AttentionBackboneModelOutput:
         r"""
         Args:
-        input_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
 
-            Indices can be obtained using [`BertTokenizer`]. See
-            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
-            for details.
+                Indices can be obtained using [`BertTokenizer`]. See
+                [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
+                for details.
 
-            [What are input IDs?](../glossary#input-ids)
-        attention_mask (`torch.FloatTensor` of shape `((batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask
-            values selected in `[0, 1]`:
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask
+                values selected in `[0, 1]`:
 
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
 
-            [What are attention masks?](../glossary#attention-mask)
-        token_type_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`, *optional*):
-            Segment token indices to indicate first and second portions of the
-            inputs. Indices are selected in `[0, 1]`:
+                [What are attention masks?](../glossary#attention-mask)
+            token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Segment token indices to indicate first and second portions of the
+                inputs. Indices are selected in `[0, 1]`:
 
-            - 0 corresponds to a *sentence A* token,
-            - 1 corresponds to a *sentence B* token.
+                - 0 corresponds to a *sentence A* token,
+                - 1 corresponds to a *sentence B* token.
 
-            [What are token type IDs?](../glossary#token-type-ids)
-        position_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`, *optional*):
-            Indices of positions of each input sequence tokens in the position
-            embeddings. Selected in the range `[0,
-            config.max_position_embeddings - 1]`.
+                [What are token type IDs?](../glossary#token-type-ids)
+            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Indices of positions of each input sequence tokens in the position
+                embeddings. Selected in the range `[0,
+                config.max_position_embeddings - 1]`.
 
-            [What are position IDs?](../glossary#position-ids)
-        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers,
-        num_heads)`, *optional*):
-            Mask to nullify selected heads of the self-attention modules. Mask
-            values selected in `[0, 1]`:
+                [What are position IDs?](../glossary#position-ids)
+            head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers,
+                num_heads)`, *optional*):
+                Mask to nullify selected heads of the self-attention modules. Mask
+                values selected in `[0, 1]`:
 
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
 
-        inputs_embeds (`torch.FloatTensor` of shape `((batch_size, sequence_length, hidden_size)`,
-        *optional*):
-            Optionally, instead of passing `input_ids` you can choose to
-            directly pass an embedded representation. This is useful if you want
-            more control over how to convert `input_ids` indices into associated
-            vectors than the model's internal embedding lookup matrix.
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention
-            layers. See `attentions` under returned tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See
-            `hidden_states` under returned tensors for more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~file_utils.ModelOutput`] instead of a
-            plain tuple.
-        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size,
-        sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the
-            encoder. Used in the cross-attention if the model is configured as a
-            decoder.
-        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size,
-        sequence_length)`, *optional*):
-            Mask to avoid performing attention on the padding token indices of
-            the encoder input. This mask is used in the cross-attention if the
-            model is configured as a decoder. Mask values selected in `[0, 1]`:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`,
+                *optional*):
+                Optionally, instead of passing `input_ids` you can choose to
+                directly pass an embedded representation. This is useful if you want
+                more control over how to convert `input_ids` indices into associated
+                vectors than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention
+                layers. See `attentions` under returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See
+                `hidden_states` under returned tensors for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a
+                plain tuple.
+            encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size,
+                sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the
+                encoder. Used in the cross-attention if the model is configured as a
+                decoder.
+            encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size,
+                sequence_length)`, *optional*):
+                Mask to avoid performing attention on the padding token indices of
+                the encoder input. This mask is used in the cross-attention if the
+                model is configured as a decoder. Mask values selected in `[0, 1]`:
 
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-        past_key_values (`tuple(tuple(torch.FloatTensor))` of length
-        `config.n_layers` with each tuple having 4 tensors of shape
-        `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
-            Contains precomputed key and value hidden states of the attention
-            blocks. Can be used to speed up decoding.
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+            past_key_values (`tuple(tuple(torch.FloatTensor))` of length
+                `config.n_layers` with each tuple having 4 tensors of shape
+                `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+                Contains precomputed key and value hidden states of the attention
+                blocks. Can be used to speed up decoding.
 
-            If `past_key_values` are used, the user can optionally input only
-            the last `decoder_input_ids` (those that don't have their past key
-            value states given to this model) of shape `(batch_size, 1)` instead
-            of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned
-            and can be used to speed up decoding (see `past_key_values`).
-        Others (**kwargs)
-            some additional parameters might passed in from upstream pipeline,
-            which not influence the results.
+                If `past_key_values` are used, the user can optionally input only
+                the last `decoder_input_ids` (those that don't have their past key
+                value states given to this model) of shape `(batch_size, 1)` instead
+                of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned
+                and can be used to speed up decoding (see `past_key_values`).
+            Others (**kwargs)
+                some additional parameters might passed in from upstream pipeline,
+                which not influence the results.
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
diff --git a/modelscope/models/nlp/bert/configuration.py b/modelscope/models/nlp/bert/configuration.py
index 6a8441c4..b1f837d4 100644
--- a/modelscope/models/nlp/bert/configuration.py
+++ b/modelscope/models/nlp/bert/configuration.py
@@ -93,7 +93,7 @@ class BertConfig(PretrainedConfig):
 
     Examples:
 
-    ```python >>> from transformers import BertModel, BertConfig
+    >>> from transformers import BertModel, BertConfig
 
     >>> # Initializing a BERT bert-base-uncased style configuration
     >>> configuration = BertConfig()
@@ -103,7 +103,7 @@ class BertConfig(PretrainedConfig):
 
     >>> # Accessing the model configuration
     >>> configuration = model.config
-    ```"""
+    """
     model_type = 'bert'
 
     def __init__(self,
diff --git a/modelscope/models/nlp/bert/fill_mask.py b/modelscope/models/nlp/bert/fill_mask.py
index 8ce6f9b9..5c625eca 100644
--- a/modelscope/models/nlp/bert/fill_mask.py
+++ b/modelscope/models/nlp/bert/fill_mask.py
@@ -1,299 +1,16 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-# All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-import torch.utils.checkpoint
-from torch.nn import CrossEntropyLoss
-from transformers.activations import ACT2FN
 
-from modelscope.metainfo import Models
+from modelscope.metainfo import Heads, Models
 from modelscope.models.builder import MODELS
-from modelscope.outputs import AttentionFillMaskModelOutput
+from modelscope.models.nlp.task_models.fill_mask import ModelForFillMask
 from modelscope.utils import logger as logging
 from modelscope.utils.constant import Tasks
-from .backbone import BertModel, BertPreTrainedModel
-from .configuration import BertConfig
 
 logger = logging.get_logger()
 
 
-class BertPredictionHeadTransform(nn.Module):
-
-    def __init__(self, config):
-        super().__init__()
-        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        if isinstance(config.hidden_act, str):
-            self.transform_act_fn = ACT2FN[config.hidden_act]
-        else:
-            self.transform_act_fn = config.hidden_act
-        self.LayerNorm = nn.LayerNorm(
-            config.hidden_size, eps=config.layer_norm_eps)
-
-    def forward(self, hidden_states):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.transform_act_fn(hidden_states)
-        hidden_states = self.LayerNorm(hidden_states)
-        return hidden_states
-
-
-class BertLMPredictionHead(nn.Module):
-
-    def __init__(self, config):
-        super().__init__()
-        self.transform = BertPredictionHeadTransform(config)
-
-        # The output weights are the same as the input embeddings, but there is
-        # an output-only bias for each token.
-        self.decoder = nn.Linear(
-            config.hidden_size, config.vocab_size, bias=False)
-
-        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
-
-        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
-        self.decoder.bias = self.bias
-
-    def forward(self, hidden_states):
-        hidden_states = self.transform(hidden_states)
-        hidden_states = self.decoder(hidden_states)
-        return hidden_states
-
-
-class BertOnlyMLMHead(nn.Module):
-
-    def __init__(self, config):
-        super().__init__()
-        self.predictions = BertLMPredictionHead(config)
-
-    def forward(self, sequence_output):
-        prediction_scores = self.predictions(sequence_output)
-        return prediction_scores
-
-
-class BertOnlyNSPHead(nn.Module):
-
-    def __init__(self, config):
-        super().__init__()
-        self.seq_relationship = nn.Linear(config.hidden_size, 2)
-
-    def forward(self, pooled_output):
-        seq_relationship_score = self.seq_relationship(pooled_output)
-        return seq_relationship_score
-
-
-class BertPreTrainingHeads(nn.Module):
-
-    def __init__(self, config):
-        super().__init__()
-        self.predictions = BertLMPredictionHead(config)
-        self.seq_relationship = nn.Linear(config.hidden_size, 2)
-
-    def forward(self, sequence_output, pooled_output):
-        prediction_scores = self.predictions(sequence_output)
-        seq_relationship_score = self.seq_relationship(pooled_output)
-        return prediction_scores, seq_relationship_score
-
-
 @MODELS.register_module(Tasks.fill_mask, module_name=Models.bert)
-class BertForMaskedLM(BertPreTrainedModel):
-    r"""Bert Model with a `language modeling` head on top.
+class BertForMaskedLM(ModelForFillMask):
 
-    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
-    methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
-    pruning heads etc.)
-
-    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
-    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
-    general usage and behavior.
-
-    Preprocessor:
-        This is the fill_mask model of Structbert, the preprocessor of this model
-        is `modelscope.preprocessors.FillMaskTransformersPreprocessor`.
-
-    Parameters:
-        config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with
-            all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
-            weights.
-    """
-
-    _keys_to_ignore_on_load_unexpected = [r'pooler']
-    _keys_to_ignore_on_load_missing = [
-        r'position_ids', r'predictions.decoder.bias'
-    ]
-
-    def __init__(self, config: BertConfig, **kwargs):
-        super().__init__(config)
-
-        if config.is_decoder:
-            logger.warning(
-                'If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for '
-                'bi-directional self-attention.')
-
-        self.bert = BertModel(config, add_pooling_layer=False)
-        self.cls = BertOnlyMLMHead(config)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_output_embeddings(self):
-        return self.cls.predictions.decoder
-
-    def set_output_embeddings(self, new_embeddings):
-        self.cls.predictions.decoder = new_embeddings
-
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        labels=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None,
-    ):
-        r"""
-        Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
-            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
-            details.
-
-            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
-            1]``:
-
-            - 0 corresponds to a `sentence A` token,
-            - 1 corresponds to a `sentence B` token.
-
-            `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
-            config.max_position_embeddings - 1]``.
-
-            `What are position IDs? <../glossary.html#position-ids>`_
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
-            vectors than the model's internal embedding lookup matrix.
-        output_attentions (:obj:`bool`, `optional`):
-            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
-            tensors for more detail.
-        output_hidden_states (:obj:`bool`, `optional`):
-            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
-            more detail.
-        return_dict (:obj:`bool`, `optional`):
-            Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`,
-        *optional*):
-            Labels for computing the masked language modeling loss. Indices
-            should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids`
-            docstring) Tokens with indices set to `-100` are ignored (masked),
-            the loss is only computed for the tokens with labels in `[0, ...,
-            config.vocab_size]`
-
-        Returns:
-            Returns `modelscope.outputs.AttentionFillMaskModelOutput`
-
-        Examples:
-            >>> from modelscope.models import Model
-            >>> from modelscope.preprocessors import Preprocessor
-            >>> model = Model.from_pretrained('damo/nlp_bert_backbone_base_std')
-            >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_bert_backbone_base_std')
-            >>> print(model(**preprocessor(('This is a test', 'This is also a test'))))
-        """
-
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        outputs = self.bert(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        sequence_output = outputs[0]
-        prediction_scores = self.cls(sequence_output)
-
-        masked_lm_loss = None
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()  # -100 index = padding token
-            masked_lm_loss = loss_fct(
-                prediction_scores.view(-1, self.config.vocab_size),
-                labels.view(-1))
-
-        if not return_dict:
-            output = (prediction_scores, ) + outputs[2:]
-            return ((masked_lm_loss, )
-                    + output) if masked_lm_loss is not None else output
-
-        return AttentionFillMaskModelOutput(
-            loss=masked_lm_loss,
-            logits=prediction_scores,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-            input_ids=input_ids,
-        )
-
-    def prepare_inputs_for_generation(self,
-                                      input_ids,
-                                      attention_mask=None,
-                                      **model_kwargs):
-        input_shape = input_ids.shape
-        effective_batch_size = input_shape[0]
-
-        #  add a dummy token
-        if self.config.pad_token_id is None:
-            raise ValueError('The PAD token should be defined for generation')
-
-        padding_mask = attention_mask.new_zeros((attention_mask.shape[0], 1))
-        attention_mask = torch.cat([attention_mask, padding_mask], dim=-1)
-        dummy_token = torch.full((effective_batch_size, 1),
-                                 self.config.pad_token_id,
-                                 dtype=torch.long,
-                                 device=input_ids.device)
-        input_ids = torch.cat([input_ids, dummy_token], dim=1)
-
-        return {'input_ids': input_ids, 'attention_mask': attention_mask}
+    base_model_type = Models.bert
+    head_type = Heads.bert_mlm
diff --git a/modelscope/models/nlp/bert/sentence_embedding.py b/modelscope/models/nlp/bert/sentence_embedding.py
index 18cecd3c..92a9da50 100644
--- a/modelscope/models/nlp/bert/sentence_embedding.py
+++ b/modelscope/models/nlp/bert/sentence_embedding.py
@@ -1,23 +1,111 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+import torch
+from torch import nn
+
 from modelscope.metainfo import Models
 from modelscope.models import Model
 from modelscope.models.builder import MODELS
-from modelscope.outputs import BackboneModelOutput
+from modelscope.outputs import SentencEmbeddingModelOutput
 from modelscope.utils.constant import Tasks
 from .backbone import BertModel, BertPreTrainedModel
 
 
+class Pooler(nn.Module):
+    """
+    Parameter-free poolers to get the sentence embedding
+    'cls': [CLS] representation with BERT/RoBERTa's MLP pooler.
+    'cls_before_pooler': [CLS] representation without the original MLP pooler.
+    'avg': average of the last layers' hidden states at each token.
+    'avg_top2': average of the last two layers.
+    'avg_first_last': average of the first and the last layers.
+    """
+
+    def __init__(self, pooler_type):
+        super().__init__()
+        self.pooler_type = pooler_type
+        assert self.pooler_type in [
+            'cls', 'avg', 'avg_top2', 'avg_first_last'
+        ], 'unrecognized pooling type %s' % self.pooler_type
+
+    def forward(self, outputs, attention_mask):
+        last_hidden = outputs.last_hidden_state
+        hidden_states = outputs.hidden_states
+
+        if self.pooler_type in ['cls']:
+            return last_hidden[:, 0]
+        elif self.pooler_type == 'avg':
+            return ((last_hidden * attention_mask.unsqueeze(-1)).sum(1)
+                    / attention_mask.sum(-1).unsqueeze(-1))
+        elif self.pooler_type == 'avg_first_last':
+            first_hidden = hidden_states[1]
+            last_hidden = hidden_states[-1]
+            pooled_result = ((first_hidden + last_hidden) / 2.0
+                             * attention_mask.unsqueeze(-1)
+                             ).sum(1) / attention_mask.sum(-1).unsqueeze(-1)
+            return pooled_result
+        elif self.pooler_type == 'avg_top2':
+            second_last_hidden = hidden_states[-2]
+            last_hidden = hidden_states[-1]
+            pooled_result = ((last_hidden + second_last_hidden) / 2.0
+                             * attention_mask.unsqueeze(-1)
+                             ).sum(1) / attention_mask.sum(-1).unsqueeze(-1)
+            return pooled_result
+        else:
+            raise NotImplementedError
+
+
 @MODELS.register_module(Tasks.sentence_embedding, module_name=Models.bert)
 class BertForSentenceEmbedding(BertPreTrainedModel):
 
     def __init__(self, config, **kwargs):
         super().__init__(config)
         self.config = config
+        self.pooler_type = kwargs.get('pooler_type', 'cls')
+        self.pooler = Pooler(self.pooler_type)
         setattr(self, self.base_model_prefix,
                 BertModel(config, add_pooling_layer=False))
 
-    def forward(
+    def forward(self, query=None, docs=None, labels=None):
+        r"""
+        Args:
+            query (:obj: `dict`): Dict of pretrained models's input for the query sequence. See
+                :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+                for details.
+            docs (:obj: `dict`): Dict of pretrained models's input for the query sequence. See
+                :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+                for details.
+        Returns:
+            Returns `modelscope.outputs.SentencEmbeddingModelOutput
+        Examples:
+            >>> from modelscope.models import Model
+            >>> from modelscope.preprocessors import Preprocessor
+            >>> model = Model.from_pretrained('damo/nlp_corom_sentence-embedding_chinese-base')
+            >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_corom_sentence-embedding_chinese-base')
+            >>> print(model(**preprocessor('source_sentence':['This is a test'])))
+        """
+        query_embeddings, doc_embeddings = None, None
+        if query is not None:
+            query_embeddings = self.encode(**query)
+        if docs is not None:
+            doc_embeddings = self.encode(**docs)
+        outputs = SentencEmbeddingModelOutput(
+            query_embeddings=query_embeddings, doc_embeddings=doc_embeddings)
+        if query_embeddings is None or doc_embeddings is None:
+            return outputs
+        if self.base_model.training:
+            loss_fct = nn.CrossEntropyLoss()
+            scores = torch.matmul(query_embeddings, doc_embeddings.T)
+            if labels is None:
+                labels = torch.arange(
+                    scores.size(0), device=scores.device, dtype=torch.long)
+                labels = labels * (
+                    doc_embeddings.size(0) // query_embeddings.size(0))
+            loss = loss_fct(scores, labels)
+            outputs.loss = loss
+        return outputs
+
+    def encode(
         self,
         input_ids=None,
         attention_mask=None,
@@ -28,62 +116,8 @@ class BertForSentenceEmbedding(BertPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-    ) -> BackboneModelOutput:
-        r"""
-        Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
-            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
-            details.
-
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
-            1]``:
-
-            - 0 corresponds to a `sentence A` token,
-            - 1 corresponds to a `sentence B` token.
-
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
-            config.max_position_embeddings - 1]``.
-
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
-            vectors than the model's internal embedding lookup matrix.
-        output_attentions (:obj:`bool`, `optional`):
-            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
-            tensors for more detail.
-        output_hidden_states (:obj:`bool`, `optional`):
-            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
-            more detail.
-        return_dict (:obj:`bool`, `optional`):
-            Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
-        Returns:
-            Returns `modelscope.outputs.AttentionTextClassificationModelOutput`
-
-        Examples:
-            >>> from modelscope.models import Model
-            >>> from modelscope.preprocessors import Preprocessor
-            >>> model = Model.from_pretrained('damo/nlp_corom_sentence-embedding_chinese-base')
-            >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_corom_sentence-embedding_chinese-base')
-            >>> print(model(**preprocessor('This is a test')))
-        """
-        return self.base_model.forward(
+    ):
+        outputs = self.base_model.forward(
             input_ids,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
@@ -93,6 +127,8 @@ class BertForSentenceEmbedding(BertPreTrainedModel):
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict)
+        outputs = self.pooler(outputs, attention_mask)
+        return outputs
 
     @classmethod
     def _instantiate(cls, **kwargs):
diff --git a/modelscope/models/nlp/bert/siamese_uie.py b/modelscope/models/nlp/bert/siamese_uie.py
new file mode 100644
index 00000000..10b4b478
--- /dev/null
+++ b/modelscope/models/nlp/bert/siamese_uie.py
@@ -0,0 +1,101 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from copy import deepcopy
+
+import torch
+from torch import nn
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+from .backbone import BertEncoder, BertModel, BertPreTrainedModel
+
+__all__ = ['SiameseUieModel']
+
+
+@MODELS.register_module(Tasks.siamese_uie, module_name=Models.bert)
+class SiameseUieModel(BertPreTrainedModel):
+    r"""SiameseUIE general information extraction model,
+        based on the construction idea of prompt (Prompt) + text (Text),
+        uses pointer network (Pointer Network) to
+        realize segment extraction (Span Extraction), so as to
+        realize named entity recognition (NER), relation extraction (RE),
+        Extraction of various tasks such as event extraction (EE),
+        attribute sentiment extraction (ABSA), etc. Different from
+        the existing general information extraction tasks on the market:
+    """
+
+    def __init__(self, config, **kwargs):
+        super().__init__(config)
+        self.config = config
+        self.plm = BertModel(self.config, add_pooling_layer=True)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.head_clsf = nn.Linear(config.hidden_size, 1)
+        self.tail_clsf = nn.Linear(config.hidden_size, 1)
+        self.set_crossattention_layer()
+
+    def set_crossattention_layer(self, num_hidden_layers=6):
+        crossattention_config = deepcopy(self.config)
+        crossattention_config.num_hidden_layers = num_hidden_layers
+        self.config.num_hidden_layers -= num_hidden_layers
+        self.crossattention = BertEncoder(crossattention_config)
+        self.crossattention.layer = self.plm.encoder.layer[self.config.
+                                                           num_hidden_layers:]
+        self.plm.encoder.layer = self.plm.encoder.layer[:self.config.
+                                                        num_hidden_layers]
+
+    def get_cross_attention_output(self, hidden_states, attention_mask,
+                                   encoder_hidden_states,
+                                   encoder_attention_mask):
+        cat_hidden_states = torch.cat([hidden_states, encoder_hidden_states],
+                                      dim=1)
+        cat_attention_mask = torch.cat(
+            [attention_mask, encoder_attention_mask], dim=1)
+        cat_attention_mask = self.plm.get_extended_attention_mask(
+            cat_attention_mask,
+            cat_hidden_states.size()[:2])
+        hidden_states = self.crossattention(
+            hidden_states=cat_hidden_states, attention_mask=cat_attention_mask
+        )[0][:, :hidden_states.size()[1], :]
+        return hidden_states
+
+    def get_plm_sequence_output(self,
+                                input_ids,
+                                attention_mask,
+                                position_ids=None,
+                                is_hint=False):
+        token_type_ids = torch.ones_like(attention_mask) if is_hint else None
+        sequence_output = self.plm(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids)[0]
+        return sequence_output
+
+    def forward(self, sequence_output, attention_masks, hint_ids,
+                cross_attention_masks):
+        """
+
+        Args:
+            sequence_output(tensor): 3-dimension tensor (batch size, sequence length, hidden size)
+            attention_masks(tensor): attention mask, 2-dimension tensor (batch size, sequence length)
+            hint_ids(tensor): token ids of prompt 2-dimension tensor (batch size, sequence length)
+            cross_attention_masks(tensor): cross attention mask, 2-dimension tensor (batch size, sequence length)
+        Default Returns:
+            head_probs(tensor): 2-dimension tensor(batch size, sequence length)
+            tail_probs(tensor): 2-dimension tensor(batch size, sequence length)
+        """
+        position_ids = torch.arange(hint_ids.size(1)).expand(
+            (1, -1)) + sequence_output.size(1)
+        position_ids = position_ids.to(sequence_output.device)
+        hint_sequence_output = self.get_plm_sequence_output(
+            hint_ids, cross_attention_masks, position_ids, is_hint=True)
+        sequence_output = self.get_cross_attention_output(
+            sequence_output, attention_masks, hint_sequence_output,
+            cross_attention_masks)
+        # (b, l, n)
+        head_logits = self.head_clsf(sequence_output).squeeze(-1)
+        tail_logits = self.tail_clsf(sequence_output).squeeze(-1)
+        head_probs = head_logits + (1 - attention_masks) * -10000
+        tail_probs = tail_logits + (1 - attention_masks) * -10000
+        return head_probs, tail_probs
diff --git a/modelscope/models/nlp/bert/text_classification.py b/modelscope/models/nlp/bert/text_classification.py
index df227064..0a38202f 100644
--- a/modelscope/models/nlp/bert/text_classification.py
+++ b/modelscope/models/nlp/bert/text_classification.py
@@ -1,31 +1,10 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-# All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn as nn
-import torch.utils.checkpoint
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from modelscope.metainfo import Models
 from modelscope.models.builder import MODELS
-from modelscope.outputs import AttentionTextClassificationModelOutput
+from modelscope.models.nlp import ModelForTextClassification
 from modelscope.utils import logger as logging
 from modelscope.utils.constant import Tasks
-from .backbone import BertModel, BertPreTrainedModel
 
 logger = logging.get_logger()
 
@@ -37,172 +16,16 @@ logger = logging.get_logger()
 @MODELS.register_module(Tasks.sentence_similarity, module_name=Models.bert)
 @MODELS.register_module(
     Tasks.zero_shot_classification, module_name=Models.bert)
-class BertForSequenceClassification(BertPreTrainedModel):
+class BertForSequenceClassification(ModelForTextClassification):
     r"""Bert Model transformer with a sequence classification/regression head on top
     (a linear layer on top of the pooled output) e.g. for GLUE tasks.
 
-    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+    This model inherits from :class:`SequenceClassificationModel`. Check the superclass documentation for the generic
     methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
     pruning heads etc.)
 
     This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
     subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
     general usage and behavior.
-
-    Preprocessor:
-        This is the fill_mask model of Bert, the preprocessor of this model
-        is `modelscope.preprocessors.TextClassificationTransformersPreprocessor`.
-
-    Trainer:
-        This model is a normal PyTorch model, and can be trained by variable trainers, like EpochBasedTrainer,
-        NlpEpochBasedTrainer, or trainers from other frameworks.
-        The preferred trainer in ModelScope is NlpEpochBasedTrainer.
-
-    Parameters:
-        config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with
-            all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
-            weights.
     """
-
-    def __init__(self, config, **kwargs):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-        self.config = config
-
-        setattr(self, self.base_model_prefix, BertModel(config))
-        classifier_dropout = (
-            config.classifier_dropout if config.classifier_dropout is not None
-            else config.hidden_dropout_prob)
-        self.dropout = nn.Dropout(classifier_dropout)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None,
-    ):
-        r"""
-        Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
-            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
-            details.
-
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
-            1]``:
-
-            - 0 corresponds to a `sentence A` token,
-            - 1 corresponds to a `sentence B` token.
-
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
-            config.max_position_embeddings - 1]``.
-
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
-            vectors than the model's internal embedding lookup matrix.
-        output_attentions (:obj:`bool`, `optional`):
-            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
-            tensors for more detail.
-        output_hidden_states (:obj:`bool`, `optional`):
-            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
-            more detail.
-        return_dict (:obj:`bool`, `optional`):
-            Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
-            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
-            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
-            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-
-        Returns:
-            Returns `modelscope.outputs.AttentionTextClassificationModelOutput`
-
-        Examples:
-            >>> from modelscope.models import Model
-            >>> from modelscope.preprocessors import Preprocessor
-            >>> model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base')
-            >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base')
-            >>> print(model(**preprocessor(('This is a test', 'This is also a test'))))
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        outputs = self.base_model.forward(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        pooled_output = outputs[1]
-
-        pooled_output = self.dropout(pooled_output)
-        logits = self.classifier(pooled_output)
-
-        loss = None
-        if labels is not None:
-            if self.config.problem_type is None:
-                if self.num_labels == 1:
-                    self.config.problem_type = 'regression'
-                elif self.num_labels > 1 and (labels.dtype == torch.long
-                                              or labels.dtype == torch.int):
-                    self.config.problem_type = 'single_label_classification'
-                else:
-                    self.config.problem_type = 'multi_label_classification'
-
-            if self.config.problem_type == 'regression':
-                loss_fct = MSELoss()
-                if self.num_labels == 1:
-                    loss = loss_fct(logits.squeeze(), labels.squeeze())
-                else:
-                    loss = loss_fct(logits, labels)
-            elif self.config.problem_type == 'single_label_classification':
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(
-                    logits.view(-1, self.num_labels), labels.view(-1))
-            elif self.config.problem_type == 'multi_label_classification':
-                loss_fct = BCEWithLogitsLoss()
-                loss = loss_fct(logits, labels)
-        if not return_dict:
-            output = (logits, ) + outputs[2:]
-            return ((loss, ) + output) if loss is not None else output
-
-        return AttentionTextClassificationModelOutput(
-            loss=loss,
-            logits=logits,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
+    base_model_type = 'bert'
diff --git a/modelscope/models/nlp/bert/text_ranking.py b/modelscope/models/nlp/bert/text_ranking.py
index 0d1ca1fd..c366b95a 100644
--- a/modelscope/models/nlp/bert/text_ranking.py
+++ b/modelscope/models/nlp/bert/text_ranking.py
@@ -1,93 +1,25 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import torch
-import torch.utils.checkpoint
-
 from modelscope.metainfo import Models
-from modelscope.models import Model
 from modelscope.models.builder import MODELS
-from modelscope.outputs import AttentionTextClassificationModelOutput
+from modelscope.models.nlp import ModelForTextRanking
 from modelscope.utils import logger as logging
 from modelscope.utils.constant import Tasks
-from .backbone import BertModel
-from .text_classification import BertForSequenceClassification
 
 logger = logging.get_logger()
 
 
 @MODELS.register_module(Tasks.text_ranking, module_name=Models.bert)
-class BertForTextRanking(BertForSequenceClassification):
+class BertForTextRanking(ModelForTextRanking):
+    r"""Bert Model transformer with a sequence classification/regression head on top
+    (a linear layer on top of the pooled output) e.g. for GLUE tasks.
 
-    def __init__(self, config, *args, **kwargs):
-        super().__init__(config)
-        neg_sample = kwargs.get('neg_sample', 8)
-        self.neg_sample = neg_sample
-        setattr(self, self.base_model_prefix,
-                BertModel(self.config, add_pooling_layer=True))
+    This model inherits from :class:`SequenceClassificationModel`. Check the superclass documentation for the generic
+    methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
+    pruning heads etc.)
 
-    def forward(self,
-                input_ids=None,
-                attention_mask=None,
-                token_type_ids=None,
-                position_ids=None,
-                head_mask=None,
-                inputs_embeds=None,
-                labels=None,
-                output_attentions=None,
-                output_hidden_states=None,
-                return_dict=None,
-                *args,
-                **kwargs) -> AttentionTextClassificationModelOutput:
-        outputs = self.base_model.forward(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict)
-
-        # backbone model should return pooled_output as its second output
-        pooled_output = outputs[1]
-        pooled_output = self.dropout(pooled_output)
-        logits = self.classifier(pooled_output)
-        if self.base_model.training:
-            scores = logits.view(-1, self.neg_sample + 1)
-            batch_size = scores.size(0)
-            loss_fct = torch.nn.CrossEntropyLoss()
-            target_label = torch.zeros(
-                batch_size, dtype=torch.long, device=scores.device)
-            loss = loss_fct(scores, target_label)
-            return AttentionTextClassificationModelOutput(
-                loss=loss,
-                logits=logits,
-            )
-        return AttentionTextClassificationModelOutput(logits=logits, )
-
-    @classmethod
-    def _instantiate(cls, **kwargs):
-        """Instantiate the model.
-
-        Args:
-            kwargs: Input args.
-                    model_dir: The model dir used to load the checkpoint and the label information.
-                    num_labels: An optional arg to tell the model how many classes to initialize.
-                                    Method will call utils.parse_label_mapping if num_labels not supplied.
-                                    If num_labels is not found, the model will use the default setting (1 classes).
-
-        Returns:
-            The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained
-        """
-        num_labels = kwargs.get('num_labels', 1)
-        neg_sample = kwargs.get('neg_sample', 4)
-        model_args = {} if num_labels is None else {'num_labels': num_labels}
-        if neg_sample is not None:
-            model_args['neg_sample'] = neg_sample
-
-        model_dir = kwargs.get('model_dir')
-        model = super(Model, cls).from_pretrained(
-            pretrained_model_name_or_path=model_dir, **model_args)
-        model.model_dir = model_dir
-        return model
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+    general usage and behavior.
+    """
+    base_model_type = 'bert'
diff --git a/modelscope/models/nlp/bert/token_classification.py b/modelscope/models/nlp/bert/token_classification.py
index 5fb92302..fb21a657 100644
--- a/modelscope/models/nlp/bert/token_classification.py
+++ b/modelscope/models/nlp/bert/token_classification.py
@@ -15,17 +15,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import torch
-import torch.nn as nn
-import torch.utils.checkpoint
-from torch.nn import CrossEntropyLoss
-
-from modelscope.metainfo import Models
+from modelscope.metainfo import Heads, Models
 from modelscope.models.builder import MODELS
-from modelscope.outputs import AttentionTokenClassificationModelOutput
+from modelscope.models.nlp.task_models.token_classification import (
+    ModelForTokenClassification, ModelForTokenClassificationWithCRF)
 from modelscope.utils import logger as logging
 from modelscope.utils.constant import Tasks
-from .backbone import BertModel, BertPreTrainedModel
 
 logger = logging.get_logger()
 
@@ -33,11 +28,11 @@ logger = logging.get_logger()
 @MODELS.register_module(Tasks.token_classification, module_name=Models.bert)
 @MODELS.register_module(Tasks.part_of_speech, module_name=Models.bert)
 @MODELS.register_module(Tasks.word_segmentation, module_name=Models.bert)
-class BertForTokenClassification(BertPreTrainedModel):
+class BertForTokenClassification(ModelForTokenClassification):
     r"""Bert Model with a token classification head on top (a linear layer on top of
     the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks, word-segmentation.
 
-    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+    This model inherits from :class:`TokenClassificationModel`. Check the superclass documentation for the generic
     methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
     pruning heads etc.)
 
@@ -45,182 +40,6 @@ class BertForTokenClassification(BertPreTrainedModel):
     subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
     general usage and behavior.
 
-    Preprocessor:
-        This is the fill_mask model of Bert, the preprocessor of this model
-        is `modelscope.preprocessors.TokenClassificationTransformersPreprocessor`.
-
-    Trainer:
-        This model is a normal PyTorch model, and can be trained by variable trainers, like EpochBasedTrainer,
-        NlpEpochBasedTrainer, or trainers from other frameworks.
-        The preferred trainer in ModelScope is NlpEpochBasedTrainer.
-
-    Parameters:
-        config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with
-            all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
-            weights.
     """
-    _keys_to_ignore_on_load_unexpected = [r'pooler']
 
-    def __init__(self, config, **kwargs):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        setattr(self, self.base_model_prefix,
-                BertModel(config, add_pooling_layer=False))
-        classifier_dropout = (
-            config.classifier_dropout if config.classifier_dropout is not None
-            else config.hidden_dropout_prob)
-        self.dropout = nn.Dropout(classifier_dropout)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None,
-        offset_mapping=None,
-        label_mask=None,
-    ):
-        r"""
-        Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size,
-        sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using
-            :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
-            :meth:`transformers.PreTrainedTokenizer.encode` and
-            :meth:`transformers.PreTrainedTokenizer.__call__` for details.
-
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size,
-        sequence_length)`, `optional`):
-            Mask to avoid performing attention on padding token indices. Mask
-            values selected in ``[0, 1]``:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size,
-        sequence_length)`, `optional`):
-            Segment token indices to indicate first and second portions of the
-            inputs. Indices are selected in ``[0, 1]``:
-
-            - 0 corresponds to a `sentence A` token,
-            - 1 corresponds to a `sentence B` token.
-
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size,
-        sequence_length)`, `optional`):
-            Indices of positions of each input sequence tokens in the position
-            embeddings. Selected in the range ``[0,
-            config.max_position_embeddings - 1]``.
-
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or
-        :obj:`(num_layers, num_heads)`, `optional`):
-            Mask to nullify selected heads of the self-attention modules. Mask
-            values selected in ``[0, 1]``:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size,
-        sequence_length, hidden_size)`, `optional`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to
-            directly pass an embedded representation. This is useful if you want
-            more control over how to convert :obj:`input_ids` indices into
-            associated vectors than the model's internal embedding lookup
-            matrix.
-        output_attentions (:obj:`bool`, `optional`):
-            Whether or not to return the attentions tensors of all attention
-            layers. See ``attentions`` under returned tensors for more detail.
-        output_hidden_states (:obj:`bool`, `optional`):
-            Whether or not to return the hidden states of all layers. See
-            ``hidden_states`` under returned tensors for more detail.
-        return_dict (:obj:`bool`, `optional`):
-            Whether or not to return a :class:`~transformers.ModelOutput`
-            instead of a plain tuple.
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`,
-        `optional`):
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If
-            :obj:`config.num_labels == 1` a regression loss is computed
-            (Mean-Square loss), If :obj:`config.num_labels > 1` a classification
-            loss is computed (Cross-Entropy).
-        offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size,
-        sequence_length)`, `optional`):
-            Indices of positions of each input sequence tokens in the sentence.
-            Selected in the range ``[0, sequence_length - 1]``.
-        label_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size,
-        sequence_length)`, `optional`):
-            Mask to avoid performing attention on padding token indices. Mask
-            values selected in ``[0, 1]``:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-        Returns:
-            Returns `modelscope.outputs.AttentionTokenClassificationModelOutput`
-
-        Examples:
-            >>> from modelscope.models import Model
-            >>> from modelscope.preprocessors import Preprocessor
-            >>> model = Model.from_pretrained('damo/nlp_bert_word-segmentation_chinese-base')
-            >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_bert_word-segmentation_chinese-base')
-            >>> print(model(**preprocessor(('This is a test', 'This is also a test'))))
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        outputs = self.bert(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        sequence_output = outputs[0]
-
-        sequence_output = self.dropout(sequence_output)
-        logits = self.classifier(sequence_output)
-
-        loss = None
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)
-                active_labels = torch.where(
-                    active_loss, labels.view(-1),
-                    torch.tensor(loss_fct.ignore_index).type_as(labels))
-                loss = loss_fct(active_logits, active_labels)
-            else:
-                loss = loss_fct(
-                    logits.view(-1, self.num_labels), labels.view(-1))
-
-        if not return_dict:
-            output = (logits, ) + outputs[2:]
-            return ((loss, ) + output) if loss is not None else output
-
-        return AttentionTokenClassificationModelOutput(
-            loss=loss,
-            logits=logits,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-            offset_mapping=offset_mapping,
-            label_mask=label_mask,
-        )
+    base_model_type = 'bert'
diff --git a/modelscope/models/nlp/csanmt/translation.py b/modelscope/models/nlp/csanmt/translation.py
index 657c26f4..6ece4255 100644
--- a/modelscope/models/nlp/csanmt/translation.py
+++ b/modelscope/models/nlp/csanmt/translation.py
@@ -25,14 +25,20 @@ class CsanmtForTranslation(Model):
         """
         super().__init__(model_dir, *args, **kwargs)
         self.params = kwargs
+        print(self.params)
 
     def __call__(self,
                  input: Dict[str, Tensor],
-                 label: Dict[str, Tensor] = None) -> Dict[str, Tensor]:
+                 label: Dict[str, Tensor] = None,
+                 prefix: Dict[str, Tensor] = None,
+                 prefix_hit: Dict[bool, Tensor] = None) -> Dict[str, Tensor]:
         """return the result by the model
 
         Args:
-            input: the preprocessed data
+            input: the preprocessed input source sequence
+            label: the ground truth target data for model training
+            prefix: the preprocessed input target prefix sequence for interactive translation
+            prefix_hit: the preprocessed target prefix subword vector for interactive translation
 
         Returns:
             output_seqs: output sequence of target ids
@@ -40,7 +46,11 @@ class CsanmtForTranslation(Model):
         if label is None:
             with tf.compat.v1.variable_scope('NmtModel'):
                 output_seqs, output_scores = self.beam_search(
-                    input, self.params)
+                    {
+                        'input_wids': input,
+                        'prefix_wids': prefix,
+                        'prefix_hit': prefix_hit
+                    }, self.params)
             return {
                 'output_seqs': output_seqs,
                 'output_scores': output_scores,
@@ -85,7 +95,7 @@ class CsanmtForTranslation(Model):
         src_bias = tf.compat.v1.get_variable('encoder_input_bias',
                                              [hidden_size])
 
-        eos_padding = tf.zeros([tf.shape(input=features)[0], 1], tf.int64)
+        eos_padding = tf.zeros_like(features, dtype=tf.int64)[:, :1]
         src_seq = tf.concat([features, eos_padding], 1)
         src_mask = tf.cast(tf.not_equal(src_seq, 0), dtype=tf.float32)
         shift_src_mask = src_mask[:, :-1]
@@ -135,7 +145,7 @@ class CsanmtForTranslation(Model):
             embedding_mat = tf.compat.v1.get_variable(
                 'Weights', [vocab_size, hidden_size], initializer=initializer)
 
-        eos_padding = tf.zeros([tf.shape(input=features)[0], 1], tf.int64)
+        eos_padding = tf.zeros_like(features, dtype=tf.int64)[:, :1]
         input_seq = tf.concat([features, eos_padding], 1)
         input_mask = tf.cast(tf.not_equal(input_seq, 0), dtype=tf.float32)
         shift_input_mask = input_mask[:, :-1]
@@ -233,7 +243,7 @@ class CsanmtForTranslation(Model):
                     'Weights', [trg_vocab_size, hidden_size],
                     initializer=initializer)
 
-        eos_padding = tf.zeros([tf.shape(input=labels)[0], 1], tf.int64)
+        eos_padding = tf.zeros_like(labels, dtype=tf.int64)[:, :1]
         trg_seq = tf.concat([labels, eos_padding], 1)
         trg_mask = tf.cast(tf.not_equal(trg_seq, 0), dtype=tf.float32)
         shift_trg_mask = trg_mask[:, :-1]
@@ -441,7 +451,8 @@ class CsanmtForTranslation(Model):
                        trg_seq,
                        states_key,
                        states_val,
-                       params={}):
+                       params={},
+                       is_prefix=False):
         trg_vocab_size = params['trg_vocab_size']
         hidden_size = params['hidden_size']
 
@@ -468,9 +479,10 @@ class CsanmtForTranslation(Model):
             tensor=decoder_input, paddings=[[0, 0], [1, 0], [0, 0]])[:, :-1, :]
         if params['position_info_type'] == 'absolute':
             decoder_input = add_timing_signal(decoder_input)
-
-        decoder_input = decoder_input[:, -1:, :]
-        decoder_self_attention_bias = decoder_self_attention_bias[:, :, -1:, :]
+        if not is_prefix:
+            decoder_input = decoder_input[:, -1:, :]
+            decoder_self_attention_bias = decoder_self_attention_bias[:, :,
+                                                                      -1:, :]
         decoder_output, attention_weights = transformer_decoder(
             decoder_input,
             encoder_output,
@@ -480,8 +492,12 @@ class CsanmtForTranslation(Model):
             states_val=states_val,
             embedding_augmentation=feature_output,
             params=params)
-        decoder_output_last = decoder_output[:, -1, :]
-        attention_weights_last = attention_weights[:, -1, :]
+        if not is_prefix:
+            decoder_output_last = decoder_output[:, -1, :]
+            attention_weights_last = attention_weights[:, -1, :]
+        else:
+            decoder_output_last = decoder_output
+            attention_weights_last = attention_weights
 
         if params['shared_embedding_and_softmax_weights']:
             embedding_scope = \
@@ -502,34 +518,35 @@ class CsanmtForTranslation(Model):
         num_decoder_layers = params['num_decoder_layers']
         lp_rate = params['lp_rate']
         max_decoded_trg_len = params['max_decoded_trg_len']
-        batch_size = tf.shape(input=features)[0]
+        src_input = features['input_wids']
+        if 'prefix_wids' in features:
+            prefix = features['prefix_wids']
+            prefix_hit = features['prefix_hit']
+        else:
+            prefix = None
+            prefix_hit = None
+        batch_size = tf.shape(src_input)[0]
 
-        features = tile_to_beam_size(features, beam_size)
-        features = merge_first_two_dims(features)
+        src_input = tile_to_beam_size(src_input, beam_size)
+        src_input = merge_first_two_dims(src_input)
+        if prefix is not None:
+            prefix = tf.cast(tile_to_beam_size(prefix, beam_size), tf.int32)
+            prefix_hit = tile_to_beam_size(prefix_hit, beam_size)
 
         encoder_output, encoder_self_attention_bias = self.encoding_graph(
-            features, params)
+            src_input, params)
         source_name = 'source'
         if params['shared_source_target_embedding']:
             source_name = None
         feature_output = self.semantic_encoding_graph(
-            features, params, name=source_name)
-
-        init_seqs = tf.fill([batch_size, beam_size, 1], 0)
-        init_log_probs = \
-            tf.constant([[0.] + [tf.float32.min] * (beam_size - 1)])
-        init_log_probs = tf.tile(init_log_probs, [batch_size, 1])
-        init_scores = tf.zeros_like(init_log_probs)
-        fin_seqs = tf.zeros([batch_size, beam_size, 1], tf.int32)
-        fin_scores = tf.fill([batch_size, beam_size], tf.float32.min)
-        fin_flags = tf.zeros([batch_size, beam_size], tf.bool)
+            src_input, params, name=source_name)
 
         states_key = [
-            tf.zeros([batch_size, 0, hidden_size])
+            tf.fill([batch_size, 0, hidden_size], 0.0)
             for layer in range(num_decoder_layers)
         ]
         states_val = [
-            tf.zeros([batch_size, 0, hidden_size])
+            tf.fill([batch_size, 0, hidden_size], 0.0)
             for layer in range(num_decoder_layers)
         ]
         for layer in range(num_decoder_layers):
@@ -545,6 +562,66 @@ class CsanmtForTranslation(Model):
             tile_to_beam_size(states_val[layer], beam_size)
             for layer in range(num_decoder_layers)
         ]
+        fixed_length = 1
+        if prefix is not None:
+            init_seqs = tf.concat(
+                [prefix, tf.fill([batch_size, beam_size, 1], 0)], axis=2)
+            fixed_length = tf.shape(init_seqs)[-1]
+            flat_seqs = merge_first_two_dims(init_seqs)
+            flat_states_key = [
+                merge_first_two_dims(states_key[layer])
+                for layer in range(num_decoder_layers)
+            ]
+            flat_states_val = [
+                merge_first_two_dims(states_val[layer])
+                for layer in range(num_decoder_layers)
+            ]
+
+            step_log_probs, step_attn_weights, step_states_key, step_states_val = self.inference_func(
+                encoder_output,
+                feature_output,
+                encoder_self_attention_bias,
+                flat_seqs,
+                flat_states_key,
+                flat_states_val,
+                params=params,
+                is_prefix=True)
+
+            states_key = [
+                split_first_two_dims(step_states_key[layer], batch_size,
+                                     beam_size)
+                for layer in range(num_decoder_layers)
+            ]
+            states_val = [
+                split_first_two_dims(step_states_val[layer], batch_size,
+                                     beam_size)
+                for layer in range(num_decoder_layers)
+            ]
+
+            prefix_hit = merge_first_two_dims(prefix_hit)
+            log_probs = tf.where(
+                prefix_hit, step_log_probs[:, -1, :],
+                tf.ones_like(step_log_probs[:, -1, :]) * tf.float32.min)
+
+            init_seqs = tf.concat([
+                flat_seqs[:, :-1],
+                tf.expand_dims(
+                    tf.cast(tf.argmax(log_probs, -1), tf.int32), -1)
+            ], -1)
+
+            init_seqs = split_first_two_dims(init_seqs, batch_size, beam_size)
+            init_seqs = tf.concat(
+                [init_seqs, tf.fill([batch_size, beam_size, 1], 0)], axis=2)
+        else:
+            init_seqs = tf.fill([batch_size, beam_size, 1], 0)
+
+        init_log_probs = \
+            tf.constant([[0.] + [tf.float32.min] * (beam_size - 1)])
+        init_log_probs = tf.tile(init_log_probs, [batch_size, 1])
+        init_scores = tf.zeros_like(init_log_probs)
+        fin_seqs = init_seqs
+        fin_scores = tf.fill([batch_size, beam_size], tf.float32.min)
+        fin_flags = tf.cast(tf.fill([batch_size, beam_size], 0), tf.bool)
 
         state = BeamSearchState(
             inputs=(init_seqs, init_log_probs, init_scores),
@@ -573,7 +650,8 @@ class CsanmtForTranslation(Model):
                 flat_seqs,
                 flat_states_key,
                 flat_states_val,
-                params=params)
+                params=params,
+                is_prefix=False)
 
             step_log_probs = split_first_two_dims(step_log_probs, batch_size,
                                                   beam_size)
@@ -737,7 +815,7 @@ class CsanmtForTranslation(Model):
             tf.reduce_any(input_tensor=final_flags, axis=1), final_scores,
             alive_scores)
 
-        final_seqs = final_seqs[:, :, :-1]
+        final_seqs = final_seqs[:, :, fixed_length - 1:-1]
         return final_seqs, final_scores
 
 
@@ -936,7 +1014,7 @@ def transformer_encoder(encoder_input,
     layer_postproc = params['layer_postproc']
     x = encoder_input
     mask = tf.expand_dims(mask, 2)
-    with tf.compat.v1.variable_scope(name):
+    with tf.compat.v1.variable_scope(name, reuse=tf.compat.v1.AUTO_REUSE):
         for layer in range(num_encoder_layers):
             with tf.compat.v1.variable_scope('layer_%d' % layer):
                 max_relative_dis = params['max_relative_dis'] \
@@ -1032,7 +1110,7 @@ def transformer_decoder(decoder_input,
     layer_preproc = params['layer_preproc']
     layer_postproc = params['layer_postproc']
     x = decoder_input
-    with tf.compat.v1.variable_scope(name):
+    with tf.compat.v1.variable_scope(name, reuse=tf.compat.v1.AUTO_REUSE):
         for layer in range(num_decoder_layers):
             with tf.compat.v1.variable_scope('layer_%d' % layer):
                 max_relative_dis = params['max_relative_dis'] \
@@ -1117,7 +1195,8 @@ def attention_bias(inputs, mode, inf=-1e9, dtype=None):
 
     elif mode == 'causal':
         length = inputs
-        lower_triangle = tf.linalg.band_part(tf.ones([length, length]), -1, 0)
+        lower_triangle = tf.linalg.band_part(
+            tf.fill([length, length], 1.0), -1, 0)
         ret = inf * (1.0 - lower_triangle)
         ret = tf.reshape(ret, [1, 1, length, length])
     else:
diff --git a/modelscope/models/nlp/deberta_v2/backbone.py b/modelscope/models/nlp/deberta_v2/backbone.py
index 11f27a20..2ecb3190 100644
--- a/modelscope/models/nlp/deberta_v2/backbone.py
+++ b/modelscope/models/nlp/deberta_v2/backbone.py
@@ -74,7 +74,6 @@ class XSoftmax(torch.autograd.Function):
 
     Example:
 
-    ```python
     >>> import torch
     >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax
 
@@ -88,7 +87,7 @@ class XSoftmax(torch.autograd.Function):
     >>> dim = -1
 
     >>> y = XSoftmax.apply(x, mask, dim)
-    ```"""
+    """
 
     @staticmethod
     def forward(self, input, mask, dim):
@@ -1104,38 +1103,38 @@ class DebertaV2Model(DebertaV2PreTrainedModel):
     ) -> Union[Tuple, AttentionBackboneModelOutput]:
         r"""
         Args:
-        input_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`):
-            Indices of input sequence tokens in the vocabulary.
+            input_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`):
+                Indices of input sequence tokens in the vocabulary.
 
-        attention_mask (`torch.FloatTensor` of shape `('batch_size, sequence_length')`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+            attention_mask (`torch.FloatTensor` of shape `('batch_size, sequence_length')`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
 
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
 
-        token_type_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
-            1]`:
+            token_type_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*):
+                Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+                1]`:
 
-            - 0 corresponds to a *sentence A* token,
-            - 1 corresponds to a *sentence B* token.
+                - 0 corresponds to a *sentence A* token,
+                - 1 corresponds to a *sentence B* token.
 
-        position_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
-            config.max_position_embeddings - 1]`.
+            position_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*):
+                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
+                `[0,config.max_position_embeddings - 1]`.
 
-        inputs_embeds (`torch.FloatTensor` of shape `('batch_size, sequence_length', hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a dataclass instead of a plain tuple.
+            inputs_embeds (`torch.FloatTensor` of shape `('batch_size, sequence_length', hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert *input_ids* indices into associated
+                vectors than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+                tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+                more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a dataclass instead of a plain tuple.
 
         Returns:
             Returns `modelscope.outputs.AttentionBackboneModelOutput`
diff --git a/modelscope/models/nlp/dgds/__init__.py b/modelscope/models/nlp/dgds/__init__.py
new file mode 100644
index 00000000..836b74d3
--- /dev/null
+++ b/modelscope/models/nlp/dgds/__init__.py
@@ -0,0 +1,28 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .document_grounded_dialog_generate import DocumentGroundedDialogGenerateModel
+    from .document_grounded_dialog_retrieval import DocumentGroundedDialogRerankModel
+    from .document_grounded_dialog_retrieval import DocumentGroundedDialogRetrievalModel
+else:
+    _import_structure = {
+        'document_grounded_dialog_generate':
+        ['DocumentGroundedDialogGenerateModel'],
+        'document_grounded_dialog_rerank':
+        ['DocumentGroundedDialogRerankModel'],
+        'document_grounded_dialog_retrieval':
+        ['DocumentGroundedDialogRetrievalModel']
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/nlp/dgds/backbone.py b/modelscope/models/nlp/dgds/backbone.py
new file mode 100644
index 00000000..a4f9ef42
--- /dev/null
+++ b/modelscope/models/nlp/dgds/backbone.py
@@ -0,0 +1,191 @@
+# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch BERT model."""
+
+from __future__ import absolute_import, division, print_function
+import os.path
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.utils.checkpoint import checkpoint
+from transformers import (AutoConfig, DPRConfig, DPRQuestionEncoder,
+                          MT5ForConditionalGeneration, RagTokenForGeneration,
+                          XLMRobertaForSequenceClassification, XLMRobertaModel,
+                          XLMRobertaTokenizer)
+
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class Wrapper(nn.Module):
+
+    def __init__(self, encoder):
+        super(Wrapper, self).__init__()
+        self.encoder = encoder
+
+    def forward(self, input_ids, attention_mask, dummy_tensor):
+        return self.encoder(input_ids, attention_mask).pooler_output
+
+
+class DPRModel(nn.Module):
+
+    def __init__(self, model_dir, config):
+        super().__init__()
+        self.config = config
+
+        qry_encoder = XLMRobertaModel(
+            config=AutoConfig.from_pretrained(
+                os.path.join(model_dir, 'qry_encoder')))
+        ctx_encoder = XLMRobertaModel(
+            config=AutoConfig.from_pretrained(
+                os.path.join(model_dir, 'ctx_encoder')))
+        self.qry_encoder = Wrapper(qry_encoder)
+        self.ctx_encoder = Wrapper(ctx_encoder)
+        self.loss_fct = nn.CrossEntropyLoss()
+
+    @staticmethod
+    def encode(model, input_ids, attention_mask, gck_segment=32):
+        dummy_tensor = torch.ones(1, dtype=torch.float32, requires_grad=True)
+        pooled_output = []
+        for mini_batch in range(0, input_ids.shape[0], gck_segment):
+            mini_batch_input_ids = input_ids[mini_batch:mini_batch
+                                             + gck_segment]
+            mini_batch_attention_mask = attention_mask[mini_batch:mini_batch
+                                                       + gck_segment]
+            mini_batch_pooled_output = checkpoint(model, mini_batch_input_ids,
+                                                  mini_batch_attention_mask,
+                                                  dummy_tensor)
+            pooled_output.append(mini_batch_pooled_output)
+        return torch.cat(pooled_output, dim=0)
+
+    def forward(self,
+                query_input_ids,
+                query_attention_mask,
+                context_input_ids,
+                context_attention_mask,
+                labels,
+                gck_segment=32):
+        query_vector = self.encode(self.qry_encoder, query_input_ids,
+                                   query_attention_mask, gck_segment)
+        context_vector = self.encode(self.ctx_encoder, context_input_ids,
+                                     context_attention_mask, gck_segment)
+        logits = torch.matmul(query_vector, context_vector.T)
+        loss = self.loss_fct(logits, labels)
+        return loss, logits
+
+
+class ClassifyRerank(nn.Module):
+
+    def __init__(self, model_dir):
+        super().__init__()
+        self.base_model = XLMRobertaForSequenceClassification.from_pretrained(
+            model_dir)
+
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,
+                inputs_embeds=None,
+                labels=None,
+                output_attentions=None,
+                output_hidden_states=None,
+                return_dict=None,
+                *args,
+                **kwargs):
+        outputs = self.base_model.forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict)
+        return outputs
+
+
+class Rerank(nn.Module):
+
+    def __init__(self, encoder, top_k):
+        super().__init__()
+        self.encoder = encoder
+        self.top_k = top_k
+
+    def forward(self, inputs):
+        model = self.encoder
+        logits = F.log_softmax(model(**inputs)[0], dim=-1)[:, 1]
+        logits = logits.view(-1, self.top_k)
+        logprobs = F.log_softmax(logits, dim=-1)
+        return logprobs
+
+
+class Re2GModel(nn.Module):
+
+    def __init__(self, model_dir, config):
+        super(Re2GModel, self).__init__()
+        self.config = config
+        self.top_k = self.config['top_k']
+        encoder = XLMRobertaForSequenceClassification(
+            config=AutoConfig.from_pretrained(
+                os.path.join(model_dir, 'rerank')))
+        generator = MT5ForConditionalGeneration(
+            config=AutoConfig.from_pretrained(
+                os.path.join(model_dir, 'generation')))
+
+        self.rerank = Rerank(encoder, self.top_k)
+
+        dpr_config = DPRConfig()
+        dpr_config.vocab_size = encoder.config.vocab_size
+        rag_model = RagTokenForGeneration(
+            question_encoder=DPRQuestionEncoder(dpr_config),
+            generator=generator)
+        rag_model.rag.question_encoder = None
+        self.generator = rag_model
+
+    def forward(self, rerank_input_ids, input_ids, attention_mask, label_ids):
+        doc_scores = self.rerank(rerank_input_ids)
+
+        outputs = self.generator(
+            labels=label_ids,
+            context_input_ids=input_ids,
+            context_attention_mask=attention_mask,
+            doc_scores=doc_scores,
+            n_docs=self.top_k)
+        return outputs
+
+    def generate(self, rerank_input_ids, input_ids, attention_mask):
+        doc_scores = self.rerank(rerank_input_ids)
+
+        beam_search_output = self.generator.generate(
+            n_docs=self.top_k,
+            encoder_input_ids=input_ids,
+            context_input_ids=input_ids,
+            context_attention_mask=attention_mask,
+            doc_scores=doc_scores,
+            num_beams=self.config['num_beams'],
+            max_length=self.config['target_sequence_length'],
+            early_stopping=True,
+            no_repeat_ngram_size=self.config['no_repeat_ngram_size'],
+            return_dict_in_generate=True,
+            output_scores=True)
+        generated_ids = beam_search_output.detach().cpu().numpy()
+
+        return generated_ids
diff --git a/modelscope/models/nlp/dgds/document_grounded_dialog_generate.py b/modelscope/models/nlp/dgds/document_grounded_dialog_generate.py
new file mode 100644
index 00000000..7c2f6327
--- /dev/null
+++ b/modelscope/models/nlp/dgds/document_grounded_dialog_generate.py
@@ -0,0 +1,45 @@
+import os
+from typing import Dict
+
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from .backbone import Re2GModel
+
+
+@MODELS.register_module(
+    Tasks.document_grounded_dialog_generate, module_name=Models.doc2bot)
+class DocumentGroundedDialogGenerateModel(TorchModel):
+    _backbone_prefix = ''
+
+    def __init__(self, model_dir, *args, **kwargs):
+        super().__init__(model_dir, *args, **kwargs)
+        self.config = Config.from_file(
+            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
+        self.model = Re2GModel(model_dir, self.config)
+        state_dict = torch.load(
+            os.path.join(self.model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
+            map_location='cpu')
+        self.model.load_state_dict(state_dict)
+
+    def forward(self, input: Dict[str, Tensor]):
+        rerank_input_ids = input['rerank_input_ids']
+        input_ids = input['input_ids']
+        attention_mask = input['attention_mask']
+        label_ids = input['label_ids']
+
+        outputs = self.model(rerank_input_ids, input_ids, attention_mask,
+                             label_ids)
+        return outputs
+
+    def generate(self, input: Dict[str, Tensor]):
+        rerank_input_ids = input['rerank_input_ids']
+        input_ids = input['input_ids']
+        attention_mask = input['attention_mask']
+        outputs = self.model.generate(rerank_input_ids, input_ids,
+                                      attention_mask)
+        return outputs
diff --git a/modelscope/models/nlp/dgds/document_grounded_dialog_rerank.py b/modelscope/models/nlp/dgds/document_grounded_dialog_rerank.py
new file mode 100644
index 00000000..2b6fc5f6
--- /dev/null
+++ b/modelscope/models/nlp/dgds/document_grounded_dialog_rerank.py
@@ -0,0 +1,34 @@
+import os
+from typing import Dict
+
+import torch
+from torch import nn
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Model, Tensor, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from .backbone import ClassifyRerank
+
+
+@MODELS.register_module(
+    Tasks.document_grounded_dialog_rerank, module_name=Models.doc2bot)
+class DocumentGroundedDialogRerankModel(TorchModel):
+    _backbone_prefix = ''
+
+    def __init__(self, model_dir, **kwargs):
+        super().__init__(model_dir, **kwargs)
+        self.model = ClassifyRerank(model_dir)
+
+    def forward(self, input: Dict[str, Tensor]):
+        outputs = self.model(
+            input_ids=input['input_ids'],
+            attention_mask=input['attention_mask'])
+        return outputs
+
+    def resize_token_embeddings(self, size):
+        self.model.base_model.resize_token_embeddings(size)
+
+    def save_pretrained(self, addr):
+        self.model.base_model.save_pretrained(addr)
diff --git a/modelscope/models/nlp/dgds/document_grounded_dialog_retrieval.py b/modelscope/models/nlp/dgds/document_grounded_dialog_retrieval.py
new file mode 100644
index 00000000..bd8e05d6
--- /dev/null
+++ b/modelscope/models/nlp/dgds/document_grounded_dialog_retrieval.py
@@ -0,0 +1,52 @@
+import os
+from typing import Dict
+
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from .backbone import DPRModel
+
+
+@MODELS.register_module(
+    Tasks.document_grounded_dialog_retrieval, module_name=Models.doc2bot)
+class DocumentGroundedDialogRetrievalModel(TorchModel):
+    _backbone_prefix = ''
+
+    def __init__(self, model_dir, *args, **kwargs):
+        super().__init__(model_dir, *args, **kwargs)
+        self.config = Config.from_file(
+            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
+        self.model = DPRModel(model_dir, self.config)
+        state_dict = torch.load(
+            os.path.join(self.model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
+            map_location='cpu')
+        self.model.load_state_dict(state_dict)
+
+    def forward(self, input: Dict[str, Tensor], gck_segment=32):
+        query_input_ids = input['query_input_ids']
+        query_attention_mask = input['query_attention_mask']
+        context_input_ids = input['context_input_ids']
+        context_attention_mask = input['context_attention_mask']
+        labels = input['labels']
+        outputs = self.model(query_input_ids, query_attention_mask,
+                             context_input_ids, context_attention_mask, labels,
+                             gck_segment)
+        return outputs
+
+    def encode_query(self, input: Dict[str, Tensor]):
+        query_input_ids = input['query_input_ids']
+        query_attention_mask = input['query_attention_mask']
+        query_vector = self.model.qry_encoder(query_input_ids,
+                                              query_attention_mask, None)
+        return query_vector
+
+    def encode_context(self, input: Dict[str, Tensor]):
+        context_input_ids = input['context_input_ids']
+        context_attention_mask = input['context_attention_mask']
+        context_vector = self.model.ctx_encoder(context_input_ids,
+                                                context_attention_mask, None)
+        return context_vector
diff --git a/modelscope/models/nlp/fid_plug/__init__.py b/modelscope/models/nlp/fid_plug/__init__.py
new file mode 100644
index 00000000..fe455e84
--- /dev/null
+++ b/modelscope/models/nlp/fid_plug/__init__.py
@@ -0,0 +1,36 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .configuration import PlugConfig
+    from .text_generation import (PlugV2Chat, PlugV2FidChat)
+else:
+    _import_structure = {
+        'configuration': ['PlugConfig'],
+        'text_generation': ['PlugV2Chat', 'PlugV2FidChat'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/nlp/fid_plug/backbone.py b/modelscope/models/nlp/fid_plug/backbone.py
new file mode 100644
index 00000000..e3e91606
--- /dev/null
+++ b/modelscope/models/nlp/fid_plug/backbone.py
@@ -0,0 +1,1148 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import math
+import os
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from torch.nn.init import xavier_uniform_
+from transformers import (BertConfig, BertModel, BertTokenizer, RobertaConfig,
+                          RobertaModel, RobertaTokenizer, logging)
+from transformers.activations import ACT2FN
+from transformers.modeling_utils import PreTrainedModel
+
+from .configuration import PlugConfig
+
+CONFIG_NAME = 'config.json'
+WEIGHTS_NAME = 'pytorch_model.bin'
+
+
+class MultiHeadedAttention(nn.Module):  # SelfAttention
+    """
+    Multi-Head Attention module from
+    "Attention is All You Need"
+    :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`.
+
+    Similar to standard `dot` attention but uses
+    multiple attention distributions simulataneously
+    to select relevant items.
+
+    .. mermaid::
+
+       graph BT
+          A[key]
+          B[value]
+          C[query]
+          O[output]
+          subgraph Attn
+            D[Attn 1]
+            E[Attn 2]
+            F[Attn N]
+          end
+          A --> D
+          C --> D
+          A --> E
+          C --> E
+          A --> F
+          C --> F
+          D --> O
+          E --> O
+          F --> O
+          B --> O
+
+    Also includes several additional tricks.
+
+    Args:
+       head_count (int): number of parallel heads
+       model_dim (int): the dimension of keys/values/queries,
+           must be divisible by head_count
+       dropout (float): dropout parameter
+    """
+
+    def __init__(self,
+                 head_count,
+                 model_dim,
+                 dropout=0.1,
+                 use_final_linear=True):
+        assert model_dim % head_count == 0
+        self.dim_per_head = model_dim // head_count
+        self.model_dim = model_dim
+
+        super().__init__()
+        self.head_count = head_count
+
+        self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head)
+        self.linear_values = nn.Linear(model_dim,
+                                       head_count * self.dim_per_head)
+        self.linear_query = nn.Linear(model_dim,
+                                      head_count * self.dim_per_head)
+        self.softmax = nn.Softmax(dim=-1)
+        self.dropout = nn.Dropout(dropout)
+        self.use_final_linear = use_final_linear
+        if (self.use_final_linear):
+            self.final_linear = nn.Linear(model_dim, model_dim)
+
+    def forward(self,
+                key,
+                value,
+                query,
+                mask=None,
+                layer_cache=None,
+                type=None,
+                predefined_graph_1=None,
+                return_attn=False):
+        """
+        Compute the context vector and the attention vectors.
+
+        Args:
+           key (`FloatTensor`): set of `key_len`
+                key vectors `[batch, key_len, dim]`
+           value (`FloatTensor`): set of `key_len`
+                value vectors `[batch, key_len, dim]`
+           query (`FloatTensor`): set of `query_len`
+                 query vectors  `[batch, query_len, dim]`
+           mask: binary mask indicating which keys have
+                 non-zero attention `[batch, query_len, key_len]`
+        Returns:
+           (`FloatTensor`, `FloatTensor`) :
+
+           * output context vectors `[batch, query_len, dim]`
+           * one of the attention vectors `[batch, query_len, key_len]`
+        """
+
+        batch_size = key.size(0)
+        dim_per_head = self.dim_per_head
+        head_count = self.head_count
+
+        def shape(x):
+            """  projection """
+            return x.view(batch_size, -1, head_count, dim_per_head) \
+                .transpose(1, 2)
+
+        def unshape(x):
+            """  compute context """
+            return x.transpose(1, 2).contiguous() \
+                .view(batch_size, -1, head_count * dim_per_head)
+
+        # 1) Project key, value, and query.
+        if layer_cache is not None:
+            if type == 'self':
+                query, key, value = self.linear_query(query), self.linear_keys(
+                    query), self.linear_values(query)
+
+                key = shape(key)
+                value = shape(value)
+
+                device = key.device
+                if layer_cache['self_keys'] is not None:
+                    key = torch.cat((layer_cache['self_keys'].to(device), key),
+                                    dim=2)
+                if layer_cache['self_values'] is not None:
+                    value = torch.cat(
+                        (layer_cache['self_values'].to(device), value), dim=2)
+                layer_cache['self_keys'] = key
+                layer_cache['self_values'] = value
+            elif type == 'context':
+                query = self.linear_query(query)
+                if layer_cache['memory_keys'] is None:
+                    key, value = self.linear_keys(key), self.linear_values(
+                        value)
+                    key = shape(key)
+                    value = shape(value)
+                else:
+                    key, value = layer_cache['memory_keys'], layer_cache[
+                        'memory_values']
+                layer_cache['memory_keys'] = key
+                layer_cache['memory_values'] = value
+        else:
+            key = self.linear_keys(key)
+            value = self.linear_values(value)
+            query = self.linear_query(query)
+            key = shape(key)
+            value = shape(value)
+
+        query = shape(query)
+
+        # 2) Calculate and scale scores.
+        query = query / math.sqrt(dim_per_head)
+        scores = torch.matmul(query, key.transpose(2, 3))
+
+        if mask is not None:
+            mask = mask.unsqueeze(1).expand_as(scores)
+            scores = scores.masked_fill(mask, float('-inf'))
+
+        # 3) Apply attention dropout and compute context vectors.
+
+        attn = self.softmax(scores)
+
+        if (predefined_graph_1 is not None):
+            attn_masked = attn[:, -1] * predefined_graph_1
+            attn_masked = attn_masked / (
+                torch.sum(attn_masked, 2).unsqueeze(2) + 1e-9)
+
+            attn = torch.cat([attn[:, :-1], attn_masked.unsqueeze(1)], 1)
+
+        drop_attn = self.dropout(attn)
+        if (self.use_final_linear):
+            context = unshape(torch.matmul(drop_attn, value))
+            output = self.final_linear(context)
+            if return_attn:
+                return output, attn
+            else:
+                return output
+        else:
+            context = torch.matmul(drop_attn, value)
+            if return_attn:
+                return context, attn
+            else:
+                return context
+
+
+class PositionwiseFeedForward(nn.Module):  # Output
+    """ A two-layer Feed-Forward-Network with residual layer norm.
+
+    Args:
+        d_model (int): the size of input for the first-layer of the FFN.
+        d_ff (int): the hidden layer size of the second-layer
+            of the FNN.
+        dropout (float): dropout probability in :math:`[0, 1)`.
+    """
+
+    def __init__(self, d_model, d_ff, dropout=0.1):
+        super().__init__()
+        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
+        self.w_1 = nn.Linear(d_model, d_ff)
+        self.actv = ACT2FN['gelu_new']
+        self.dropout_1 = nn.Dropout(dropout)
+        self.w_2 = nn.Linear(d_ff, d_model)
+        self.dropout_2 = nn.Dropout(dropout)
+
+    def forward(self, x):
+        inter = self.dropout_1(self.actv(self.w_1(self.layer_norm(x))))
+        output = self.dropout_2(self.w_2(inter))
+        return output + x
+
+
+class TransformerDecoderLayer(nn.Module):  # Layer
+    """
+    Args:
+      d_model (int): the dimension of keys/values/queries in
+                       MultiHeadedAttention, also the input size of
+                       the first-layer of the PositionwiseFeedForward.
+      heads (int): the number of heads for MultiHeadedAttention.
+      d_ff (int): the second-layer of the PositionwiseFeedForward.
+      dropout (float): dropout probability(0-1.0).
+      self_attn_type (string): type of self-attention scaled-dot, average
+    """
+    MAX_SIZE = 5000
+
+    def __init__(self, d_model, heads, d_ff, dropout):
+        super().__init__()
+
+        self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
+
+        self.context_attn = MultiHeadedAttention(
+            heads, d_model, dropout=dropout)
+        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
+        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
+        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
+        self.drop = nn.Dropout(dropout)
+        mask = self._get_attn_subsequent_mask(self.MAX_SIZE)
+        # Register self.mask as a buffer in TransformerDecoderLayer, so
+        # it gets TransformerDecoderLayer's cuda behavior automatically.
+        self.register_buffer('mask', mask)
+
+    def forward(self,
+                inputs,
+                memory_bank,
+                src_pad_mask,
+                tgt_pad_mask,
+                previous_input=None,
+                layer_cache=None,
+                step=None):
+        """
+        Args:
+            inputs (`FloatTensor`): `[batch_size x 1 x model_dim]`
+            memory_bank (`FloatTensor`): `[batch_size x src_len x model_dim]`
+            src_pad_mask (`LongTensor`): `[batch_size x 1 x src_len]`
+            tgt_pad_mask (`LongTensor`): `[batch_size x 1 x 1]`
+
+        Returns:
+            (`FloatTensor`, `FloatTensor`, `FloatTensor`):
+
+            * output `[batch_size x 1 x model_dim]`
+            * attn `[batch_size x 1 x src_len]`
+            * all_input `[batch_size x current_step x model_dim]`
+
+        """
+        dec_mask = torch.gt(
+            tgt_pad_mask.type(torch.uint8)
+            + self.mask[:, :tgt_pad_mask.size(1), :tgt_pad_mask.size(1)].type(
+                torch.uint8), 0)
+        input_norm = self.layer_norm_1(inputs)
+        all_input = input_norm
+        if previous_input is not None:
+            all_input = torch.cat((previous_input, input_norm), dim=1)
+            dec_mask = None
+
+        query = self.self_attn(
+            all_input,
+            all_input,
+            input_norm,
+            mask=dec_mask,
+            layer_cache=layer_cache,
+            type='self')
+
+        query = self.drop(query) + inputs
+
+        query_norm = self.layer_norm_2(query)
+        mid, attn = self.context_attn(
+            memory_bank,
+            memory_bank,
+            query_norm,
+            mask=src_pad_mask,
+            layer_cache=layer_cache,
+            type='context',
+            return_attn=True)
+        output = self.feed_forward(self.drop(mid) + query)
+
+        return output, attn, all_input
+
+    def _get_attn_subsequent_mask(self, size):
+        """
+        Get an attention mask to avoid using the subsequent info.
+
+        Args:
+            size: int
+
+        Returns:
+            (`LongTensor`):
+
+            * subsequent_mask `[1 x size x size]`
+        """
+        attn_shape = (1, size, size)
+        subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
+        subsequent_mask = torch.from_numpy(subsequent_mask)
+        return subsequent_mask
+
+
+class PositionalEncoding(nn.Module):
+
+    def __init__(self, dropout, dim, max_len=5000):
+        super().__init__()
+        pe = torch.zeros(max_len, dim)
+        position = torch.arange(0, max_len).unsqueeze(1)
+        div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float)
+                              * -(math.log(10000.0) / dim)))
+        pe[:, 0::2] = torch.sin(position.float() * div_term)
+        pe[:, 1::2] = torch.cos(position.float() * div_term)
+        pe = pe.unsqueeze(0)
+        self.register_buffer('pe', pe)
+        self.dropout = nn.Dropout(dropout)
+        self.dim = dim
+
+    def forward(self, emb, step=None):
+        emb = emb * math.sqrt(self.dim)
+        if (step):
+            emb = emb + self.pe[:, step][:, None, :]
+
+        else:
+            emb = emb + self.pe[:, :emb.size(1)]
+        emb = self.dropout(emb)
+        return emb
+
+    def get_emb(self, emb):
+        return self.pe[:, :emb.size(1)]
+
+
+class TransformerDecoderState:
+
+    def __init__(self, src: Tensor, cache_num_layers: int = -1):
+        self.src: Tensor = src
+        self.previous_input: Tensor = None
+        self.previous_layer_inputs: Tensor = None
+        self.cache: Optional[Dict[str, Any]] = None
+        if cache_num_layers != -1:
+            self._init_cache(cache_num_layers)
+
+    def update_state(self, new_input, previous_layer_inputs):
+        self.previous_input = new_input
+        self.previous_layer_inputs = previous_layer_inputs
+        self.cache = None
+
+    def _init_cache(self, num_layers):
+        self.cache = {}
+        for layer in range(num_layers):
+            layer_cache = {'memory_keys': None, 'memory_values': None}
+            layer_cache['self_keys'] = None
+            layer_cache['self_values'] = None
+            self.cache['layer_{}'.format(layer)] = layer_cache
+
+    def map_batch_fn(self, fn):
+
+        def _recursive_map(struct, batch_dim=0):
+            for k, v in struct.items():
+                if v is not None:
+                    if isinstance(v, dict):
+                        _recursive_map(v)
+                    else:
+                        struct[k] = fn(v, batch_dim)
+
+        self.src = fn(self.src, 0)
+        if self.cache is not None:
+            _recursive_map(self.cache)
+
+
+class TransformerDecoder(nn.Module):  # Decoder
+    """
+    The Transformer decoder from "Attention is All You Need".
+
+
+    .. mermaid::
+
+       graph BT
+          A[input]
+          B[multi-head self-attn]
+          BB[multi-head src-attn]
+          C[feed forward]
+          O[output]
+          A --> B
+          B --> BB
+          BB --> C
+          C --> O
+
+
+    Args:
+       num_layers (int): number of encoder layers.
+       d_model (int): size of the model
+       heads (int): number of heads
+       d_ff (int): size of the inner FF layer
+       dropout (float): dropout parameters
+       embeddings (:obj:`onmt.modules.Embeddings`):
+          embeddings to use, should have positional encodings
+       attn_type (str): if using a seperate copy attention
+    """
+    decoder_type = 'transformer'
+
+    def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings):
+        super().__init__()
+
+        # Basic attributes.
+        self.num_layers = num_layers
+        self.embeddings = embeddings
+        self.pos_emb = PositionalEncoding(dropout,
+                                          self.embeddings.embedding_dim)
+
+        # Build TransformerDecoder.
+        self.transformer_layers = nn.ModuleList([
+            TransformerDecoderLayer(d_model, heads, d_ff, dropout)
+            for _ in range(num_layers)
+        ])
+        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
+        self.state = None
+
+    def forward(self,
+                state: TransformerDecoderState,
+                tgt: Tensor,
+                memory_bank: Tensor,
+                step: int = None,
+                memory_masks: Tensor = None):
+        src_words = state.src
+        tgt_words = tgt
+        src_batch, src_len = src_words.size()
+        tgt_batch, tgt_len = tgt_words.size()
+
+        # Run the forward pass of the TransformerDecoder.
+        # emb = self.embeddings(tgt, step=step)
+        emb = self.embeddings(tgt)
+        assert emb.dim() == 3  # len x batch x embedding_dim
+        output = self.pos_emb(emb, step)
+
+        src_memory_bank = memory_bank
+        padding_idx = self.embeddings.padding_idx
+        tgt_pad_mask = tgt_words.data.eq(padding_idx).unsqueeze(1) \
+            .expand(tgt_batch, tgt_len, tgt_len)
+
+        if (memory_masks is not None):
+            src_len = memory_masks.size(-1)
+            src_pad_mask = memory_masks.expand(src_batch, tgt_len, src_len)
+        else:
+            src_pad_mask = src_words.data.eq(padding_idx).unsqueeze(1) \
+                .expand(src_batch, tgt_len, src_len)
+
+        if state.cache is None:
+            saved_inputs = []
+        attns = []
+        for i in range(self.num_layers):
+            prev_layer_input = None
+            if state.cache is None:
+                if state.previous_input is not None:
+                    prev_layer_input = state.previous_layer_inputs[i]
+            output, attn, all_input \
+                = self.transformer_layers[i](
+                    output, src_memory_bank,
+                    src_pad_mask, tgt_pad_mask,
+                    previous_input=prev_layer_input,
+                    layer_cache=state.cache['layer_{}'.format(i)]
+                    if state.cache is not None else None,
+                    step=step)
+            if state.cache is None:
+                saved_inputs.append(all_input)
+            attns.append(attn)
+
+        if state.cache is None:
+            saved_inputs = torch.stack(saved_inputs)
+
+        output = self.layer_norm(output)
+
+        # Process the result and update the attentions.
+        if state.cache is None:
+            state.update_state(tgt, saved_inputs)
+
+        return output, attns, state
+
+
+class PlugPointerGenerator(nn.Module):
+
+    def __init__(self, hidden_size, vocab_size):
+        super().__init__()
+        self.dense = nn.Linear(hidden_size, vocab_size)
+        self.gen_func = nn.LogSoftmax(-1)
+
+    def forward(self, x):
+        x = self.dense(x)
+        x = self.gen_func(x)
+        return x
+
+
+class PlugPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = PlugConfig
+    base_model_prefix = 'plug'
+
+    @classmethod
+    def from_pretrained(
+            cls, pretrained_model_name_or_path: Optional[Union[str,
+                                                               os.PathLike]]):
+        config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
+        config = PlugConfig.from_json_file(config_file) if os.path.isfile(
+            config_file) else PlugConfig()
+        config.encoder_pth = os.path.join(pretrained_model_name_or_path,
+                                          config.encoder_pth)
+        checkpoint_file = os.path.join(pretrained_model_name_or_path,
+                                       WEIGHTS_NAME)
+        checkpoint = torch.load(checkpoint_file) if os.path.isfile(
+            checkpoint_file) else None
+        return cls(config, checkpoint)
+
+
+class PlugModel(PlugPreTrainedModel):  # Model
+
+    def __init__(self, config, checkpoint=None):
+        super().__init__(config)
+        self.config = config
+        if config.encoder == 'bert' or config.encoder == 'zh_bert':
+            self.bert = BertModel(
+                BertConfig.from_pretrained(config.encoder_pth))
+        elif config.encoder == 'roberta':
+            self.bert = RobertaModel(
+                RobertaConfig.from_pretrained(config.encoder_pth))
+
+        if (config.max_pos > 512):
+            my_pos_embeddings = nn.Embedding(
+                config.max_pos, self.bert.model.config.hidden_size)
+            my_pos_embeddings.weight.data[:
+                                          512] = self.bert.embeddings.position_embeddings.weight.data
+            my_pos_embeddings.weight.data[
+                512:] = self.bert.embeddings.position_embeddings.weight.data[
+                    -1][None, :].repeat(config.max_pos - 512, 1)
+            self.bert.model.embeddings.position_embeddings = my_pos_embeddings
+        self.vocab_size = self.bert.config.vocab_size
+        tgt_embeddings = nn.Embedding(
+            self.vocab_size,
+            self.bert.config.hidden_size,
+            padding_idx=1 if config.encoder == 'roberta' else 0)
+
+        if config.share_emb:
+            tgt_embeddings.weight = copy.deepcopy(
+                self.bert.model.embeddings.word_embeddings.weight)
+        self.decoder = TransformerDecoder(
+            config.dec_layers,
+            config.dec_hidden_size,
+            heads=config.dec_heads,
+            d_ff=config.dec_ff_size,
+            dropout=config.dec_dropout,
+            embeddings=tgt_embeddings)
+        self.generator = PlugPointerGenerator(config.dec_hidden_size,
+                                              self.vocab_size)
+        self.generator.dense.weight = self.decoder.embeddings.weight
+
+        if checkpoint is not None:
+            for key in list(checkpoint['model'].keys()):
+                if key.startswith('module.'):
+                    checkpoint['model'][key.replace(
+                        'module.', '')] = checkpoint['model'][key]
+                    checkpoint['model'].pop(key)
+                if key.startswith('plug.'):
+                    checkpoint['model'][key.replace(
+                        'plug.', '')] = checkpoint['model'][key]
+                    checkpoint['model'].pop(key)
+            msg = self.load_state_dict(checkpoint['model'], strict=False)
+            print(msg)
+        else:
+            for module in self.decoder.modules():
+                if isinstance(module, (nn.Linear, nn.Embedding)):
+                    module.weight.data.normal_(mean=0.0, std=0.02)
+                elif isinstance(module, nn.LayerNorm):
+                    module.bias.data.zero_()
+                    module.weight.data.fill_(1.0)
+                if isinstance(module, nn.Linear) and module.bias is not None:
+                    module.bias.data.zero_()
+            for p in self.generator.parameters():
+                if p.dim() > 1:
+                    xavier_uniform_(p)
+                else:
+                    p.data.zero_()
+            if config.use_bert_emb:
+                if config.encoder == 'roberta':
+                    tgt_embeddings = nn.Embedding(
+                        self.vocab_size,
+                        self.bert.config.hidden_size,
+                        padding_idx=1)
+                else:
+                    tgt_embeddings = nn.Embedding(
+                        self.vocab_size,
+                        self.bert.config.hidden_size,
+                        padding_idx=0)
+                tgt_embeddings.weight = copy.deepcopy(
+                    self.bert.embeddings.word_embeddings.weight)
+                self.decoder.embeddings = tgt_embeddings
+            self.generator.dense.weight = self.decoder.embeddings.weight
+
+    def forward(self, src, tgt, mask_src, token_type_ids):
+        top_vec, _ = self.bert(
+            src, mask_src, token_type_ids=token_type_ids, return_dict=False)
+        state = TransformerDecoderState(src)
+        decoder_outputs, attns, _ = self.decoder(state, tgt[:, :-1], top_vec)
+        return decoder_outputs, attns[-1], top_vec
+
+
+class LabelSmoothingLoss(nn.Module):
+    """
+    With label smoothing,
+    KL-divergence between q_{smoothed ground truth prob.}(w)
+    and p_{prob. computed by model}(w) is minimized.
+    """
+
+    def __init__(self, label_smoothing, tgt_vocab_size, ignore_index=-100):
+        assert 0.0 < label_smoothing <= 1.0
+        self.padding_idx = ignore_index
+        super(LabelSmoothingLoss, self).__init__()
+
+        smoothing_value = label_smoothing / (tgt_vocab_size - 2)
+        one_hot = torch.full((tgt_vocab_size, ), smoothing_value)
+        one_hot[self.padding_idx] = 0
+        self.register_buffer('one_hot', one_hot.unsqueeze(0))
+        self.confidence = 1.0 - label_smoothing
+
+    def forward(self, output, target):
+        """
+        output (FloatTensor): batch_size x n_classes
+        target (LongTensor): batch_size
+        """
+        model_prob = self.one_hot.repeat(target.size(0), 1)
+        model_prob.scatter_(1, target.unsqueeze(1), self.confidence)
+        model_prob.masked_fill_((target == self.padding_idx).unsqueeze(1), 0)
+
+        return F.kl_div(output, model_prob, reduction='sum')
+
+
+class NMTLossCompute(nn.Module):
+    """
+    Standard NMT Loss Computation.
+    """
+
+    def __init__(self, generator, symbols, vocab_size, label_smoothing=0.0):
+        super().__init__()
+        self.generator = generator
+        self.padding_idx = symbols['PAD']
+        if label_smoothing > 0:
+            self.criterion = LabelSmoothingLoss(
+                label_smoothing, vocab_size, ignore_index=self.padding_idx)
+        else:
+            self.criterion = nn.NLLLoss(
+                ignore_index=self.padding_idx, reduction='sum')
+
+    def _bottle(self, _v):
+        return _v.view(-1, _v.size(2))
+
+    def _unbottle(self, _v, batch_size):
+        return _v.view(-1, batch_size, _v.size(1))
+
+    def forward(self, tgt, output):
+        target = tgt[:, 1:]
+        batch_size, decoder_length = target.size(0), target.size(1)
+        normalization = target.ne(self.padding_idx).sum()
+        bottled_output = self._bottle(output)
+        scores = self.generator(bottled_output)
+        gtruth = target.contiguous().view(-1)
+        loss = self.criterion(scores, gtruth)
+        loss = loss.div(float(normalization))
+        return loss, scores.view(batch_size, decoder_length, -1)
+
+
+class PlugForConditionalGeneration(PlugPreTrainedModel):
+
+    @dataclass
+    class Batch:
+        batch_size: int
+        src: torch.Tensor
+        tgt: torch.Tensor
+        mask_src: torch.Tensor
+        token_type_ids: torch.Tensor
+        query_id: List[None] = None
+        src_str: List[List[str]] = None
+        tgt_str: List[str] = None
+
+    def __init__(self, config, checkpoint=None, dataset: str = 'default'):
+        super().__init__(config)
+        self.logger = logging.get_logger(__name__)
+        self.config = config
+        if config.encoder == 'roberta':
+            tokenizer = RobertaTokenizer.from_pretrained(
+                config.encoder_pth, do_lower_case=False)
+            symbols = {
+                'BOS': tokenizer.cls_token_id,
+                'EOS': tokenizer.sep_token_id,
+                'PAD': tokenizer.pad_token_id,
+                'EOQ': tokenizer.unk_token_id
+            }
+        elif config.encoder == 'bert' or config.encoder == 'zh_bert':
+            tokenizer = BertTokenizer.from_pretrained(
+                config.encoder_pth, do_lower_case=True)
+            symbols = {
+                'BOS': tokenizer.vocab['[CLS]'],
+                'EOS': tokenizer.vocab['[SEP]'],
+                'PAD': tokenizer.vocab['[PAD]'],
+                'EOQ': tokenizer.vocab['[unused2]']
+            }
+        self.tokenizer = tokenizer
+        self.symbols = symbols
+        self.plug = PlugModel(config, checkpoint)
+        self.loss = NMTLossCompute(self.plug.generator, symbols,
+                                   self.plug.vocab_size,
+                                   config.label_smoothing)
+        # for generation
+        self.config.dataset = dataset
+        self.start_token = self.symbols['BOS']
+        self.end_token = self.symbols['EOS']
+
+    def forward(self, src, tgt, mask_src=None, token_type_ids=None):
+        if mask_src is None:
+            mask_src = src.ne(self.symbols['PAD']).long()
+        output = self.plug(src, tgt, mask_src, token_type_ids)[0]
+        loss = self.loss(tgt, output)
+        return loss
+
+    def translate_batch(self,
+                        batch: 'Batch',
+                        fast: bool = False,
+                        *args,
+                        **kwargs):
+        """
+        Translate a batch of sentences.
+
+        Mostly a wrapper around :obj:`Beam`.
+
+        Args:
+           batch (:obj:`Batch`): a batch from a dataset object
+           data (:obj:`Dataset`): the dataset object
+           fast (bool): enables fast beam search (may not support all features)
+
+        Todo:
+           Shouldn't need the original dataset.
+        """
+        self.plug.eval()
+        with torch.no_grad():
+            return self._fast_translate_batch(batch, *args, **kwargs)
+
+    def _tile(self, x, count, dim=0):
+        perm = list(range(len(x.size())))
+        if dim != 0:
+            perm[0], perm[dim] = perm[dim], perm[0]
+            x = x.permute(perm).contiguous()
+        out_size = list(x.size())
+        out_size[0] *= count
+        batch = x.size(0)
+        x = x.view(batch, -1) \
+            .transpose(0, 1) \
+            .repeat(count, 1) \
+            .transpose(0, 1) \
+            .contiguous() \
+            .view(*out_size)
+        if dim != 0:
+            x = x.permute(perm).contiguous()
+        return x
+
+    def _top_k_top_p_filtering(self,
+                               logits,
+                               top_k=10,
+                               top_p=1.0,
+                               filter_value=-float('Inf'),
+                               min_tokens_to_keep=1):
+        if top_k > 0:
+            top_k = min(max(top_k, min_tokens_to_keep),
+                        logits.size(-1))  # Safety check
+            # Remove all tokens with a probability less than the last token of the top-k
+            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1,
+                                                                      None]
+            logits[indices_to_remove] = filter_value
+
+        if top_p < 1.0:
+            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+            cumulative_probs = torch.cumsum(
+                F.softmax(sorted_logits, dim=-1), dim=-1)
+
+            # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
+            sorted_indices_to_remove = cumulative_probs > top_p
+            if min_tokens_to_keep > 1:
+                # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
+                sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
+            # Shift the indices to the right to keep also the first token above the threshold
+            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
+                ..., :-1].clone()
+            sorted_indices_to_remove[..., 0] = 0
+
+            # scatter sorted tensors to original indexing
+            indices_to_remove = sorted_indices_to_remove.scatter(
+                1, sorted_indices, sorted_indices_to_remove)
+            logits[indices_to_remove] = filter_value
+        return logits
+
+    def _fast_translate_batch(self,
+                              batch: 'Batch',
+                              max_length: int = 80,
+                              min_length: int = 10,
+                              bad_words_ids=None,
+                              early_stopping=True,
+                              num_beams=3,
+                              length_penalty=1.2,
+                              repetition_penalty=1.2,
+                              no_repeat_ngram_size=4,
+                              *args,
+                              **kwargs):
+        # TODO: faster code path for beam_size == 1.
+        # TODO: support these blacklisted features.
+
+        num_beams = num_beams
+        batch_size = batch.batch_size
+        src = batch.src
+        mask_src = batch.mask_src
+        token_type_ids = batch.token_type_ids
+
+        src_features, _ = self.plug.bert(
+            src, mask_src, token_type_ids=token_type_ids, return_dict=False)
+        state = TransformerDecoderState(src, self.plug.decoder.num_layers)
+        device = src_features.device
+
+        # Tile states and memory beam_size times.
+        state.map_batch_fn(
+            lambda state, dim: self._tile(state, num_beams, dim=dim))
+        src_features = self._tile(src_features, num_beams, dim=0)
+        batch_offset = torch.arange(
+            batch_size, dtype=torch.long, device=device)
+        beam_offset = torch.arange(
+            0,
+            batch_size * num_beams,
+            step=num_beams,
+            dtype=torch.long,
+            device=device)
+        alive_seq = torch.full([batch_size * num_beams, 1],
+                               self.start_token,
+                               dtype=torch.long,
+                               device=device)
+
+        # cal bad_words_ids pre dict
+        bad_words_prefix_dict = {}
+        bad_words_prefix_len = set([])
+        if bad_words_ids is not None:
+            for bw_id in bad_words_ids:
+                key = tuple(bw_id[:-1])
+                value = bw_id[-1]
+                bad_words_prefix_dict[key] = bad_words_prefix_dict.get(
+                    key, []) + [value]
+                bad_words_prefix_len.add(len(key))
+
+        # Give full probability to the first beam on the first step.
+        topk_log_probs = (
+            torch.tensor(
+                [0.0] + [float('-inf')] * (num_beams - 1),
+                device=device).repeat(batch_size))
+
+        # Structure that holds finished hypotheses.
+        hypotheses = [[] for _ in range(batch_size)]  # noqa: F812
+
+        results = {}
+        results['predictions'] = [[] for _ in range(batch_size)]  # noqa: F812
+        results['scores'] = [[] for _ in range(batch_size)]  # noqa: F812
+        results['gold_score'] = [0] * batch_size
+        results['batch'] = batch
+
+        for step in range(max_length):
+            self.logger.info(f'step: {step + 1} / {max_length}')
+            decoder_input = alive_seq[:, -1].view(1, -1)
+
+            # Decoder forward.
+            decoder_input = decoder_input.transpose(0, 1)
+            dec_out, attns, state = self.plug.decoder(
+                state, decoder_input, src_features, step=step)
+
+            # Generator forward.
+            log_probs = self.plug.generator.forward(
+                dec_out.transpose(0, 1).squeeze(0))
+            vocab_size = log_probs.size(-1)
+
+            if step < min_length:
+                log_probs[:, self.end_token] = -1e20
+
+            # filter bad word
+            if len(bad_words_prefix_dict) > 0:
+                # cal bad word banned token: batch_size * num_beams
+                num_hypos = alive_seq.size(0)
+                bad_word_banned_token = []
+                for i in range(num_hypos):
+                    curr_banned_token = []
+                    for pre_len in bad_words_prefix_len:
+                        pre_key = tuple(alive_seq[i, step + 1 - pre_len:step
+                                                  + 1].cpu().numpy().tolist())
+                        curr_banned_token += bad_words_prefix_dict.get(
+                            pre_key, [])
+                    bad_word_banned_token.append(set(curr_banned_token))
+                # set banned word prob=-1e20
+                assert log_probs.size(0) == num_hypos
+                for i in range(num_hypos):
+                    for banned_token in bad_word_banned_token[i]:
+                        log_probs[i, banned_token] = -1e20
+
+            # do repetition_penalty
+            if repetition_penalty > 1.0:
+                """repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858). """
+                # calculate prev_output_tokens for repetition_penalty: batch_size * num_beams
+                prev_output_tokens = self.calc_banned_tokens(
+                    alive_seq, alive_seq.size(0), no_repeat_ngram_size,
+                    step + 1)
+                # batch_size * num_beams
+                for i in range(log_probs.size(0)):
+                    for previous_token in set(prev_output_tokens[i]):
+                        if log_probs[i, previous_token] < 0:
+                            log_probs[i, previous_token] *= repetition_penalty
+                        else:
+                            log_probs[i, previous_token] /= repetition_penalty
+
+            # Multiply probs by the beam probability.
+
+            curr_length_penalty = (step + 1)**length_penalty
+            # '''
+            if self.config.sample_topk:
+                temperature = self.config.temperature
+                _scores = log_probs / temperature
+                _scores = self._top_k_top_p_filtering(
+                    _scores,
+                    top_k=self.config.top_k,
+                    top_p=self.config.top_p,
+                    min_tokens_to_keep=1
+                )  # (batch_size * num_beams, vocab_size)
+                # Sample 2 next words for each beam (so we have some spare tokens
+                # and match output of greedy beam search)
+                topk_ids = torch.multinomial(
+                    F.softmax(_scores, dim=-1),
+                    num_samples=1)  # (batch_size * num_beams, 2)
+                # Compute next scores
+                _scores = F.log_softmax(
+                    _scores, dim=1)  # (batch_size * num_beams, vocab_size)
+
+                _scores += topk_log_probs.view(-1).unsqueeze(1)
+                _scores = _scores / curr_length_penalty
+                topk_scores = torch.gather(
+                    _scores, -1, topk_ids)  # (batch_size * num_beams, 2)
+                # log_probs +=   # (batch_size * num_beams, 2)
+                # Match shape of greedy beam search
+                topk_ids = topk_ids.view(
+                    -1, num_beams)  # (batch_size, 2 * num_beams)
+                topk_scores = topk_scores.view(
+                    -1, num_beams)  # (batch_size, 2 * num_beams)
+            # '''
+            else:
+                log_probs += topk_log_probs.view(-1).unsqueeze(1)
+                curr_scores = log_probs / curr_length_penalty
+
+                curr_scores = curr_scores.reshape(-1, num_beams * vocab_size)
+                topk_scores, topk_ids = curr_scores.topk(num_beams, dim=-1)
+            if (self.config.block_trigram):
+                cur_len = alive_seq.size(1)
+                if (cur_len > 3):
+                    for i in range(alive_seq.size(0)):
+                        fail = False
+                        words = [int(w) for w in alive_seq[i]]
+                        if self.config.encoder == 'roberta':
+                            # words = [self.vocab.convert_ids_to_tokens[w] for w in words]
+                            words = self.tokenizer.decode(
+                                words).strip().split()
+                        else:
+                            words = [
+                                self.tokenizer.ids_to_tokens[w] for w in words
+                            ]
+                            words = ' '.join(words).replace(' ##', '').split()
+                        if (len(words) <= 3):
+                            continue
+                        trigrams = [(words[i - 1], words[i], words[i + 1])
+                                    for i in range(1,
+                                                   len(words) - 1)]
+                        trigram = tuple(trigrams[-1])
+                        if trigram in trigrams[:-1]:
+                            fail = True
+                        if fail:
+                            curr_scores[i] = -10e20
+            # Recover log probs.
+            topk_log_probs = topk_scores * curr_length_penalty
+
+            # Resolve beam origin and true word ids.
+            # topk_beam_index = topk_ids.div(vocab_size)
+            topk_beam_index = topk_ids // vocab_size
+            topk_ids = topk_ids.fmod(vocab_size)
+
+            # Map beam_index to batch_index in the flat representation.
+            batch_index = (
+                topk_beam_index
+                + beam_offset[:topk_beam_index.size(0)].unsqueeze(1))
+            select_indices = batch_index.view(-1)
+
+            # Append last prediction.
+            alive_seq = torch.cat([
+                alive_seq.index_select(0, select_indices),
+                topk_ids.view(-1, 1)
+            ], -1)
+
+            is_finished = topk_ids.eq(self.end_token)
+            if step + 1 == max_length:
+                is_finished.fill_(self.end_token)
+            # End condition is top beam is finished.
+            end_condition = is_finished[:, 0].eq(1)
+            # Save finished hypotheses.
+            if is_finished.any():
+                predictions = alive_seq.view(-1, num_beams, alive_seq.size(-1))
+                for i in range(is_finished.size(0)):
+                    b = batch_offset[i]
+                    if end_condition[i]:
+                        is_finished[i].fill_(self.end_token)
+                    finished_hyp = is_finished[i].nonzero().view(-1)
+                    # Store finished hypotheses for this batch.
+                    for j in finished_hyp:
+                        hypotheses[b].append(
+                            (topk_scores[i, j], predictions[i, j, 1:]))
+                        if early_stopping and len(hypotheses) == num_beams:
+                            end_condition[i] = True
+                    # If the batch reached the end, save the n_best hypotheses.
+                    if end_condition[i]:
+                        best_hyp = sorted(
+                            hypotheses[b], key=lambda x: x[0], reverse=True)
+                        if self.config.dataset == 'qg_ranking_test' or (
+                                self.config.dataset == 'paraphrase'
+                                and not self.config.sample_topk):
+                            for each in best_hyp[:num_beams]:
+                                score, pred = each
+                                results['scores'][b].append(score)
+                                results['predictions'][b].append(pred)
+                        else:
+                            score, pred = best_hyp[0]
+                            results['scores'][b].append(score)
+                            results['predictions'][b].append(pred)
+                non_finished = end_condition.eq(0).nonzero().view(-1)
+                # If all sentences are translated, no need to go further.
+                if len(non_finished) == 0:
+                    break
+                # Remove finished batches for the next step.
+                topk_log_probs = topk_log_probs.index_select(0, non_finished)
+                batch_index = batch_index.index_select(0, non_finished)
+                batch_offset = batch_offset.index_select(0, non_finished)
+                alive_seq = predictions.index_select(0, non_finished) \
+                    .view(-1, alive_seq.size(-1))
+
+            # Reorder states.
+            select_indices = batch_index.view(-1)
+            src_features = src_features.index_select(0, select_indices)
+            state.map_batch_fn(
+                lambda state, dim: state.index_select(dim, select_indices))
+
+        return results
+
+    def calc_banned_tokens(self, prev_input_ids, num_hypos,
+                           no_repeat_ngram_size, cur_len):
+        # Copied from fairseq for no_repeat_ngram in beam_search"""
+        if cur_len + 1 < no_repeat_ngram_size:
+            # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
+            return [[] for _ in range(num_hypos)]
+        generated_ngrams = [{} for _ in range(num_hypos)]
+        for idx in range(num_hypos):
+            gen_tokens = prev_input_ids[idx].cpu().numpy().tolist()
+            generated_ngram = generated_ngrams[idx]
+            for ngram in zip(
+                    *[gen_tokens[i:] for i in range(no_repeat_ngram_size)]):
+                prev_ngram_tuple = tuple(ngram[:-1])
+                generated_ngram[prev_ngram_tuple] = generated_ngram.get(
+                    prev_ngram_tuple, []) + [ngram[-1]]
+
+        def _get_generated_ngrams(hypo_idx):
+            # Before decoding the next token, prevent decoding of ngrams that have already appeared
+            start_idx = cur_len + 1 - no_repeat_ngram_size
+            ngram_idx = tuple(
+                prev_input_ids[hypo_idx,
+                               start_idx:cur_len].cpu().numpy().tolist())
+            return generated_ngrams[hypo_idx].get(ngram_idx, [])
+
+        banned_tokens = [
+            _get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)
+        ]
+        return banned_tokens
+
+    def translate(self,
+                  input_ids: torch.Tensor,
+                  attention_mask: torch.Tensor = None,
+                  token_type_ids=None,
+                  *args,
+                  **kwargs) -> Dict[str, torch.Tensor]:
+        if attention_mask is None:
+            attention_mask = input_ids.ne(self.symbols['PAD']).long()
+        batch = self.Batch(
+            batch_size=input_ids.size()[0],
+            src=input_ids,
+            tgt=None,
+            token_type_ids=token_type_ids,
+            mask_src=attention_mask)
+        translation_batch = self.translate_batch(batch, *args, **kwargs)
+
+        preds = translation_batch['predictions']
+        return {'predictions': preds}
diff --git a/modelscope/models/nlp/fid_plug/configuration.py b/modelscope/models/nlp/fid_plug/configuration.py
new file mode 100644
index 00000000..ec8e0635
--- /dev/null
+++ b/modelscope/models/nlp/fid_plug/configuration.py
@@ -0,0 +1,103 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PLUG model configuration """
+from transformers.configuration_utils import PretrainedConfig
+
+
+class PlugConfig(PretrainedConfig):
+    r"""
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+
+
+    Args:
+        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+            Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
+            :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or
+            :class:`~transformers.TFBertModel`.
+        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string,
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
+        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
+            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or
+            :class:`~transformers.TFBertModel`.
+        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layernorm_epsilon (:obj:`float`, `optional`, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        dec_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of hidden layers in the Transformer decoder.
+        attn_separate (:obj:`bool`, `optional`, defaults to false):
+            Whether or not to separate the q, k, v of attention.
+
+    Examples::
+
+        >>> import PlugModel, PlugConfig
+        >>> configuration = PlugConfig()
+
+        >>> # Initializing a model from the configuration
+        >>> model = PlugModel(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
+    """
+    model_type = 'plug'
+
+    def __init__(self,
+                 encoder='roberta',
+                 encoder_pth='roberta-base',
+                 max_pos=512,
+                 share_emb=False,
+                 dec_layers=12,
+                 dec_hidden_size=768,
+                 dec_heads=8,
+                 dec_ff_size=3072,
+                 dec_dropout=0.2,
+                 use_bert_emb=True,
+                 label_smoothing=0.1,
+                 sample_topk=False,
+                 block_trigram=False,
+                 **kwargs):
+        super().__init__(**kwargs)
+        self.encoder = encoder
+        self.encoder_pth = encoder_pth
+        self.max_pos = max_pos
+        self.share_emb = share_emb
+        self.dec_layers = dec_layers
+        self.dec_hidden_size = dec_hidden_size
+        self.dec_heads = dec_heads
+        self.dec_ff_size = dec_ff_size
+        self.dec_dropout = dec_dropout
+        self.use_bert_emb = use_bert_emb
+        self.label_smoothing = label_smoothing
+        # Translator
+        self.sample_topk = sample_topk
+        self.block_trigram = block_trigram
diff --git a/modelscope/models/nlp/fid_plug/text_generation.py b/modelscope/models/nlp/fid_plug/text_generation.py
new file mode 100644
index 00000000..2fe7cb72
--- /dev/null
+++ b/modelscope/models/nlp/fid_plug/text_generation.py
@@ -0,0 +1,180 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import io
+import os
+
+import torch
+from transformers.modeling_outputs import Seq2SeqLMOutput
+
+from modelscope.metainfo import Models
+from modelscope.models import Model
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.outputs import TextGenerationModelOutput, TokenGeneratorOutput
+from modelscope.utils import logger as logging
+from modelscope.utils.constant import Tasks
+from .backbone import PlugForConditionalGeneration
+from .configuration import PlugConfig
+
+CONFIG_NAME = 'config.json'
+WEIGHTS_NAME = 'pytorch_model.bin'
+
+
+class PlugV2Chat(TorchModel):
+
+    def __init__(self, model_dir, *args, **kwargs):
+        super().__init__(model_dir, *args, **kwargs)
+        # init model
+        plug_config_file = os.path.join(model_dir, CONFIG_NAME)
+        plug_config = PlugConfig.from_json_file(plug_config_file)
+        self.backbone = PlugForConditionalGeneration(plug_config)
+        # load weights
+        pretrained_model_path = os.path.join(model_dir, WEIGHTS_NAME)
+        with io.open(pretrained_model_path, 'rb') as f:
+            checkpoint = torch.load(f, map_location='cpu')
+            if 'model' in checkpoint:
+                checkpoint = checkpoint['model']
+            for key in list(checkpoint.keys()):
+                # for old plugv2 version
+                if key.startswith('translator'):
+                    checkpoint.pop(key)
+                    continue
+                if key.startswith('module.'):
+                    checkpoint[key.replace('module.', '')] = checkpoint[key]
+                    checkpoint.pop(key)
+                if key.startswith('backbone.plug.bert.bert.'):
+                    checkpoint[key.replace('backbone.plug.bert.bert.',
+                                           'bert.')] = checkpoint[key]
+                    checkpoint.pop(key)
+                elif key.startswith('backbone.plug.'):
+                    checkpoint[key.replace('backbone.plug.',
+                                           '')] = checkpoint[key]
+                    checkpoint.pop(key)
+            msg = self.backbone.plug.load_state_dict(checkpoint, strict=False)
+            print(f'| {msg}')
+
+    def generate(self, input_ids, token_type_ids=None, *args, **kwargs):
+        pred_result = self.backbone.translate(
+            input_ids=input_ids,
+            token_type_ids=token_type_ids,
+            *args,
+            **kwargs)['predictions']
+        response = [x[0].tolist() for x in pred_result]
+        response = torch.tensor(response)
+        return response
+
+    def forward(self,
+                input_ids,
+                decoder_input_ids,
+                token_type_ids=None,
+                *args,
+                **kwargs):
+        loss = self.backbone.forward(
+            src=input_ids,
+            tgt=decoder_input_ids,
+            token_type_ids=token_type_ids,
+            **kwargs)
+        return Seq2SeqLMOutput(loss=loss[0], logits=loss[1])
+
+
+class PlugV2EncoderWrapper(torch.nn.Module):
+
+    def __init__(self, bert):
+        super().__init__()
+
+        self.bert = bert
+        self.n_passages = None
+
+    def set_n_passages(self, n_passages):
+        self.n_passages = n_passages
+
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                *args,
+                **kwargs):
+        # total_length = n_passages * passage_length
+        bsz, total_length = input_ids.shape
+        passage_length = total_length // self.n_passages
+        input_ids = input_ids.view(bsz * self.n_passages, passage_length)
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(bsz * self.n_passages,
+                                                 passage_length)
+        if attention_mask is not None:
+            attention_mask = attention_mask.view(bsz * self.n_passages,
+                                                 passage_length)
+        outputs = self.bert(
+            input_ids,
+            attention_mask,
+            token_type_ids=token_type_ids,
+            *args,
+            **kwargs)
+        if isinstance(outputs, tuple):
+            outputs = (outputs[0].view(bsz, self.n_passages * passage_length,
+                                       -1), ) + outputs[1:]
+        else:
+            outputs.last_hidden_state = outputs.last_hidden_state.view(
+                bsz, self.n_passages * passage_length, -1)
+        return outputs
+
+
+@MODELS.register_module(Tasks.fid_dialogue, module_name=Models.fid_plug)
+class PlugV2FidChat(PlugV2Chat):
+
+    def __init__(self, model_dir, *args, **kwargs):
+        super().__init__(model_dir, *args, **kwargs)
+        self.wrap_encoder()
+
+    def wrap_encoder(self):
+        self.backbone.plug.bert = PlugV2EncoderWrapper(self.backbone.plug.bert)
+
+    def unwrap_encoder(self):
+        self.backbone.plug.bert = self.backbone.plug.bert.bert
+
+    def load(self,
+             pretrained_model_path,
+             from_tf=False):  # only invoked when model is not onnx format
+        self.unwrap_encoder()
+        super().load(pretrained_model_path)
+        self.wrap_encoder()
+
+    def generate(self, inputs, *args, **kwargs):
+        input_ids = inputs.get('input_ids')
+        attention_mask = inputs.get('attention_mask', None)
+        token_type_ids = inputs.get('token_type_ids', None)
+        n_passages = input_ids.size(1)
+        self.backbone.plug.bert.set_n_passages(n_passages)
+        input_ids = input_ids.view(input_ids.size(0), -1)
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(token_type_ids.size(0), -1)
+        response = super().generate(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            *args,
+            **kwargs)
+        return TokenGeneratorOutput(sequences=response)
+
+    def forward(self,
+                input_ids,
+                decoder_input_ids,
+                token_type_ids=None,
+                *args,
+                **kwargs):
+        if input_ids is not None:
+            # inputs might have already be resized in the generate method
+            if input_ids.dim() == 3:
+                n_passages = input_ids.size(1)
+                self.backbone.plug.bert.set_n_passages(n_passages)
+            input_ids = input_ids.view(input_ids.size(0), -1)
+            if token_type_ids is not None:
+                token_type_ids = token_type_ids.view(input_ids.size(0), -1)
+        seq2seq_lm_output = super().forward(
+            input_ids,
+            decoder_input_ids=decoder_input_ids,
+            token_type_ids=token_type_ids,
+            *args,
+            **kwargs)
+        return TextGenerationModelOutput(
+            loss=seq2seq_lm_output.loss, logits=seq2seq_lm_output.logits)
diff --git a/modelscope/models/nlp/gpt3/text_generation.py b/modelscope/models/nlp/gpt3/text_generation.py
index 27ce09d6..b4a1182c 100644
--- a/modelscope/models/nlp/gpt3/text_generation.py
+++ b/modelscope/models/nlp/gpt3/text_generation.py
@@ -42,10 +42,11 @@ class GPT3ForTextGeneration(TorchModel):
 
         Returns:
             Dict[str, Tensor]: results
-                Example:
-                    {
-                        'logits': Tensor([[0.54, 0.32...])]), # logits
-                    }
+
+        Example:
+            >>> {
+            >>>     'logits': Tensor([[0.54, 0.32...])]), # logits
+            >>> }
         """
         return self.model(**input)
 
diff --git a/modelscope/models/nlp/heads/__init__.py b/modelscope/models/nlp/heads/__init__.py
index 19194d3a..e0bafd40 100644
--- a/modelscope/models/nlp/heads/__init__.py
+++ b/modelscope/models/nlp/heads/__init__.py
@@ -4,11 +4,11 @@ from typing import TYPE_CHECKING
 from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
-    from .sequence_classification_head import SequenceClassificationHead
+    from .text_classification_head import TextClassificationHead
     from .torch_pretrain_head import BertMLMHead, RobertaMLMHead
 else:
     _import_structure = {
-        'sequence_classification_head': ['SequenceClassificationHead'],
+        'text_classification_head': ['TextClassificationHead'],
         'torch_pretrain_head': ['BertMLMHead', 'RobertaMLMHead'],
     }
 
diff --git a/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py b/modelscope/models/nlp/heads/crf_head.py
similarity index 64%
rename from modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py
rename to modelscope/models/nlp/heads/crf_head.py
index a7e27d9d..1454ed36 100644
--- a/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py
+++ b/modelscope/models/nlp/heads/crf_head.py
@@ -1,290 +1,100 @@
-# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. All rights reserved.
-# The CRF implementation borrows mostly from AllenNLP CRF module (https://github.com/allenai/allennlp)
-# and pytorch-crf (https://github.com/kmkurn/pytorch-crf) with some modifications.
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
-import os
 from typing import Any, Dict, List, Optional
 
 import torch
-import torch.nn as nn
-from transformers import AutoConfig, AutoModel
+import torch.nn.functional as F
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers.activations import ACT2FN
 
-from modelscope.metainfo import Models
-from modelscope.models import TorchModel
-from modelscope.models.builder import MODELS
-from modelscope.outputs import AttentionTokenClassificationModelOutput
-from modelscope.utils.constant import ModelFile, Tasks
-
-__all__ = [
-    'TransformerCRFForNamedEntityRecognition',
-    'LSTMCRFForNamedEntityRecognition', 'LSTMCRFForWordSegmentation',
-    'LSTMCRFForPartOfSpeech'
-]
+from modelscope.metainfo import Heads
+from modelscope.models.base import TorchHead
+from modelscope.models.builder import HEADS
+from modelscope.outputs import (AttentionTokenClassificationModelOutput,
+                                ModelOutputBase, OutputKeys,
+                                TokenClassificationModelOutput)
+from modelscope.utils.constant import Tasks
 
 
-class SequenceLabelingForNamedEntityRecognition(TorchModel):
+@HEADS.register_module(Tasks.token_classification, module_name=Heads.lstm_crf)
+@HEADS.register_module(
+    Tasks.named_entity_recognition, module_name=Heads.lstm_crf)
+@HEADS.register_module(Tasks.word_segmentation, module_name=Heads.lstm_crf)
+@HEADS.register_module(Tasks.part_of_speech, module_name=Heads.lstm_crf)
+class LSTMCRFHead(TorchHead):
 
-    def __init__(self, model_dir, *args, **kwargs):
-        super().__init__(model_dir, *args, **kwargs)
-        self.model = self.init_model(model_dir, *args, **kwargs)
-
-        model_ckpt = os.path.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
-        self.model.load_state_dict(
-            torch.load(model_ckpt, map_location=torch.device('cpu')))
-
-    def init_model(self, model_dir, *args, **kwargs):
-        raise NotImplementedError
-
-    def train(self):
-        return self.model.train()
-
-    def eval(self):
-        return self.model.eval()
-
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None,
-        offset_mapping=None,
-        label_mask=None,
-    ) -> Dict[str, Any]:
-        r"""
-        Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
-
-            Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
-            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
-            details.
-
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
-            1]``:
-
-            - 0 corresponds to a `sentence A` token,
-            - 1 corresponds to a `sentence B` token.
-
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
-            config.max_position_embeddings - 1]``.
-
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
-            vectors than the model's internal embedding lookup matrix.
-        output_attentions (:obj:`bool`, `optional`):
-            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
-            tensors for more detail.
-        output_hidden_states (:obj:`bool`, `optional`):
-            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
-            more detail.
-        return_dict (:obj:`bool`, `optional`):
-            Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
-            1]``.
-        offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size,
-        sequence_length)`, `optional`):
-            Indices of positions of each input sequence tokens in the sentence.
-            Selected in the range ``[0, sequence_length - 1]``.
-        label_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size,
-        sequence_length)`, `optional`):
-            Mask to avoid performing attention on padding token indices. Mask
-            values selected in ``[0, 1]``:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-        Returns:
-            Returns `modelscope.outputs.AttentionTokenClassificationModelOutput`
-
-        Examples:
-            >>> from modelscope.models import Model
-            >>> from modelscope.preprocessors import Preprocessor
-            >>> model = Model.from_pretrained('damo/nlp_structbert_word-segmentation_chinese-base')
-            >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_word-segmentation_chinese-base')
-            >>> print(model(**preprocessor(('This is a test', 'This is also a test'))))
-        """
-        input_tensor = {
-            'input_ids': input_ids,
-            'attention_mask': attention_mask,
-            'label_mask': label_mask,
-        }
-        output = {
-            'offset_mapping': offset_mapping,
-            **input_tensor,
-            **self.model(input_tensor)
-        }
-        return output
-
-    def postprocess(self, input: Dict[str, Any], **kwargs):
-        predicts = self.model.decode(input)
-        offset_mapping = input.get('offset_mapping')
-        mask = input.get('label_mask')
-
-        # revert predicts to original position with respect of label mask
-        masked_predict = torch.zeros_like(predicts)
-        for i in range(len(mask)):
-            masked_lengths = mask[i].sum(-1).long().cpu().item()
-            selected_predicts = torch.narrow(
-                predicts[i], 0, 0,
-                masked_lengths)  # index_select only move loc, not resize
-            mask_position = mask[i].byte()
-            masked_predict[i][mask_position] = selected_predicts
-        predicts = masked_predict
-
-        return AttentionTokenClassificationModelOutput(
-            loss=None,
-            logits=None,
-            hidden_states=None,
-            attentions=None,
-            label_mask=mask,
-            offset_mapping=offset_mapping,
-            predictions=predicts,
-        )
-
-
-@MODELS.register_module(
-    Tasks.named_entity_recognition, module_name=Models.tcrf)
-class TransformerCRFForNamedEntityRecognition(
-        SequenceLabelingForNamedEntityRecognition):
-    """This model wraps the TransformerCRF model to register into model sets.
-    """
-
-    def init_model(self, model_dir, *args, **kwargs):
-        self.config = AutoConfig.from_pretrained(model_dir)
-        num_labels = self.config.num_labels
-
-        model = TransformerCRF(model_dir, num_labels)
-        return model
-
-
-@MODELS.register_module(Tasks.word_segmentation, module_name=Models.tcrf_wseg)
-class TransformerCRFForWordSegmentation(TransformerCRFForNamedEntityRecognition
-                                        ):
-    """This model wraps the TransformerCRF model to register into model sets.
-    """
-    pass
-
-
-@MODELS.register_module(
-    Tasks.named_entity_recognition, module_name=Models.lcrf)
-class LSTMCRFForNamedEntityRecognition(
-        SequenceLabelingForNamedEntityRecognition):
-    """This model wraps the LSTMCRF model to register into model sets.
-    """
-
-    def init_model(self, model_dir, *args, **kwargs):
-        self.config = AutoConfig.from_pretrained(model_dir)
-        vocab_size = self.config.vocab_size
-        embed_width = self.config.embed_width
-        num_labels = self.config.num_labels
-        lstm_hidden_size = self.config.lstm_hidden_size
-
-        model = LSTMCRF(vocab_size, embed_width, num_labels, lstm_hidden_size)
-        return model
-
-
-@MODELS.register_module(Tasks.word_segmentation, module_name=Models.lcrf_wseg)
-@MODELS.register_module(Tasks.word_segmentation, module_name=Models.lcrf)
-class LSTMCRFForWordSegmentation(LSTMCRFForNamedEntityRecognition):
-    pass
-
-
-@MODELS.register_module(Tasks.part_of_speech, module_name=Models.lcrf)
-class LSTMCRFForPartOfSpeech(LSTMCRFForNamedEntityRecognition):
-    pass
-
-
-class TransformerCRF(nn.Module):
-    """A transformer based model to NER tasks.
-
-    This model will use transformers' backbones as its backbone.
-    """
-
-    def __init__(self, model_dir, num_labels, **kwargs):
-        super(TransformerCRF, self).__init__()
-
-        self.encoder = AutoModel.from_pretrained(model_dir)
-        self.linear = nn.Linear(self.encoder.config.hidden_size, num_labels)
+    def __init__(self, hidden_size=100, num_labels=None, **kwargs):
+        super().__init__(hidden_size=hidden_size, num_labels=num_labels)
+        assert num_labels is not None
+        self.ffn = nn.Linear(hidden_size * 2, num_labels)
         self.crf = CRF(num_labels, batch_first=True)
 
-    def forward(self, inputs):
-        embed = self.encoder(
-            inputs['input_ids'], attention_mask=inputs['attention_mask'])[0]
-        logits = self.linear(embed)
+    def forward(self,
+                inputs: ModelOutputBase,
+                attention_mask=None,
+                label=None,
+                label_mask=None,
+                offset_mapping=None,
+                **kwargs):
+        logits = self.ffn(inputs.last_hidden_state)
 
-        if 'label_mask' in inputs:
-            mask = inputs['label_mask']
-            masked_lengths = mask.sum(-1).long()
-            masked_logits = torch.zeros_like(logits)
-            for i in range(len(mask)):
-                masked_logits[
-                    i, :masked_lengths[i], :] = logits[i].masked_select(
-                        mask[i].unsqueeze(-1)).view(masked_lengths[i], -1)
-            logits = masked_logits
+        return TokenClassificationModelOutput(
+            loss=None,
+            logits=logits,
+        )
 
-        outputs = {'logits': logits}
-        return outputs
-
-    def decode(self, inputs):
-        seq_lens = inputs['label_mask'].sum(-1).long()
+    def decode(self, logits, label_mask):
+        seq_lens = label_mask.sum(-1).long()
         mask = torch.arange(
-            inputs['label_mask'].shape[1],
+            label_mask.shape[1],
             device=seq_lens.device)[None, :] < seq_lens[:, None]
-        predicts = self.crf.decode(inputs['logits'], mask=mask).squeeze(0)
+        predicts = self.crf.decode(logits, mask).squeeze(0)
         return predicts
 
 
-class LSTMCRF(nn.Module):
-    """
-    A standard bilstm-crf model for fast prediction.
-    """
+@HEADS.register_module(
+    Tasks.transformer_crf, module_name=Heads.transformer_crf)
+@HEADS.register_module(
+    Tasks.token_classification, module_name=Heads.transformer_crf)
+@HEADS.register_module(
+    Tasks.named_entity_recognition, module_name=Heads.transformer_crf)
+@HEADS.register_module(
+    Tasks.word_segmentation, module_name=Heads.transformer_crf)
+@HEADS.register_module(Tasks.part_of_speech, module_name=Heads.transformer_crf)
+class TransformersCRFHead(TorchHead):
 
-    def __init__(self,
-                 vocab_size,
-                 embed_width,
-                 num_labels,
-                 lstm_hidden_size=100,
-                 **kwargs):
-        super(LSTMCRF, self).__init__()
-        self.embedding = Embedding(vocab_size, embed_width)
-        self.lstm = nn.LSTM(
-            embed_width,
-            lstm_hidden_size,
-            num_layers=1,
-            bidirectional=True,
-            batch_first=True)
-        self.ffn = nn.Linear(lstm_hidden_size * 2, num_labels)
+    def __init__(self, hidden_size, num_labels, **kwargs):
+        super().__init__(
+            hidden_size=hidden_size, num_labels=num_labels, **kwargs)
+        self.linear = nn.Linear(hidden_size, num_labels)
         self.crf = CRF(num_labels, batch_first=True)
 
-    def forward(self, inputs):
-        embedding = self.embedding(inputs['input_ids'])
-        lstm_output, _ = self.lstm(embedding)
-        logits = self.ffn(lstm_output)
-
-        if 'label_mask' in inputs:
-            mask = inputs['label_mask']
+    def forward(self,
+                inputs: ModelOutputBase,
+                attention_mask=None,
+                label=None,
+                label_mask=None,
+                offset_mapping=None,
+                **kwargs):
+        logits = self.linear(inputs.last_hidden_state)
+        if label_mask is not None:
+            mask = label_mask
             masked_lengths = mask.sum(-1).long()
             masked_logits = torch.zeros_like(logits)
             for i in range(len(mask)):
@@ -293,15 +103,19 @@ class LSTMCRF(nn.Module):
                         mask[i].unsqueeze(-1)).view(masked_lengths[i], -1)
             logits = masked_logits
 
-        outputs = {'logits': logits}
-        return outputs
+        return AttentionTokenClassificationModelOutput(
+            loss=None,
+            logits=logits,
+            hidden_states=inputs.hidden_states,
+            attentions=inputs.attentions,
+        )
 
-    def decode(self, inputs):
-        seq_lens = inputs['label_mask'].sum(-1).long()
+    def decode(self, logits, label_mask):
+        seq_lens = label_mask.sum(-1).long()
         mask = torch.arange(
-            inputs['label_mask'].shape[1],
+            label_mask.shape[1],
             device=seq_lens.device)[None, :] < seq_lens[:, None]
-        predicts = self.crf.decode(inputs['logits'], mask=mask).squeeze(0)
+        predicts = self.crf.decode(logits, mask).squeeze(0)
         return predicts
 
 
@@ -610,8 +424,10 @@ class CRF(nn.Module):
             # Set score to the next score if this timestep is valid (mask == 1)
             # and save the index that produces the next score
             # shape: (batch_size, num_tags)
-            score = torch.where(mask[i].unsqueeze(-1), next_score, score)
-            indices = torch.where(mask[i].unsqueeze(-1), indices, oor_idx)
+            score = torch.where(mask[i].unsqueeze(-1).bool(), next_score,
+                                score)
+            indices = torch.where(mask[i].unsqueeze(-1).bool(), indices,
+                                  oor_idx)
             history_idx[i - 1] = indices
 
         # End transition score
@@ -639,7 +455,7 @@ class CRF(nn.Module):
             best_tags = torch.gather(history_idx[idx], 1, best_tags)
             best_tags_arr[idx] = best_tags.data.view(batch_size)
 
-        return torch.where(mask, best_tags_arr, oor_tag).transpose(0, 1)
+        return torch.where(mask.bool(), best_tags_arr, oor_tag).transpose(0, 1)
 
     def _viterbi_decode_nbest(
             self,
@@ -711,10 +527,10 @@ class CRF(nn.Module):
             # Set score to the next score if this timestep is valid (mask == 1)
             # and save the index that produces the next score
             # shape: (batch_size, num_tags, nbest)
-            score = torch.where(mask[i].unsqueeze(-1).unsqueeze(-1),
+            score = torch.where(mask[i].unsqueeze(-1).bool().unsqueeze(-1),
                                 next_score, score)
-            indices = torch.where(mask[i].unsqueeze(-1).unsqueeze(-1), indices,
-                                  oor_idx)
+            indices = torch.where(mask[i].unsqueeze(-1).unsqueeze(-1).bool(),
+                                  indices, oor_idx)
             history_idx[i - 1] = indices
 
         # End transition score shape: (batch_size, num_tags, nbest)
@@ -745,14 +561,3 @@ class CRF(nn.Module):
 
         return torch.where(mask.unsqueeze(-1), best_tags_arr,
                            oor_tag).permute(2, 1, 0)
-
-
-class Embedding(nn.Module):
-
-    def __init__(self, vocab_size, embed_width):
-        super(Embedding, self).__init__()
-
-        self.embedding = nn.Embedding(vocab_size, embed_width)
-
-    def forward(self, input_ids):
-        return self.embedding(input_ids)
diff --git a/modelscope/models/nlp/heads/fill_mask_head.py b/modelscope/models/nlp/heads/fill_mask_head.py
index 6b0c5e05..83640a26 100644
--- a/modelscope/models/nlp/heads/fill_mask_head.py
+++ b/modelscope/models/nlp/heads/fill_mask_head.py
@@ -26,27 +26,49 @@ from transformers.activations import ACT2FN
 from modelscope.metainfo import Heads
 from modelscope.models.base import TorchHead
 from modelscope.models.builder import HEADS
-from modelscope.outputs import OutputKeys
+from modelscope.outputs import (AttentionFillMaskModelOutput, ModelOutputBase,
+                                OutputKeys)
 from modelscope.utils.constant import Tasks
 
 
 @HEADS.register_module(Tasks.fill_mask, module_name=Heads.bert_mlm)
+@HEADS.register_module(Tasks.fill_mask, module_name=Heads.fill_mask)
 class BertFillMaskHead(TorchHead):
 
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
+    def __init__(self,
+                 hidden_size=768,
+                 hidden_act='gelu',
+                 layer_norm_eps=1e-12,
+                 vocab_size=30522,
+                 **kwargs):
+        super().__init__(
+            hidden_size=hidden_size,
+            hidden_act=hidden_act,
+            layer_norm_eps=layer_norm_eps,
+            vocab_size=vocab_size)
         self.cls = BertOnlyMLMHead(self.config)
 
-    def forward(self, sequence_output):
-        prediction_scores = self.cls(sequence_output)
-        return {OutputKeys.LOGITS: prediction_scores}
+    def forward(self,
+                inputs: ModelOutputBase,
+                attention_mask=None,
+                labels=None,
+                **kwargs):
+        logits = self.cls(inputs.last_hidden_state)
+        loss = None
+        if labels is not None:
+            loss = self.compute_loss(logits, labels)
+        return AttentionFillMaskModelOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=inputs.hidden_states,
+            attentions=inputs.attentions,
+        )
 
-    def compute_loss(self, outputs: Dict[str, torch.Tensor],
-                     labels) -> Dict[str, torch.Tensor]:
+    def compute_loss(self, logits: torch.Tensor, labels) -> torch.Tensor:
         loss_fct = CrossEntropyLoss()  # -100 index = padding token
         masked_lm_loss = loss_fct(
-            outputs.view(-1, self.config.vocab_size), labels.view(-1))
-        return {OutputKeys.LOSS: masked_lm_loss}
+            logits.view(-1, self.config.vocab_size), labels.view(-1))
+        return masked_lm_loss
 
 
 class BertPredictionHeadTransform(nn.Module):
diff --git a/modelscope/models/nlp/heads/infromation_extraction_head.py b/modelscope/models/nlp/heads/infromation_extraction_head.py
index 626f1b59..e3cc58cd 100644
--- a/modelscope/models/nlp/heads/infromation_extraction_head.py
+++ b/modelscope/models/nlp/heads/infromation_extraction_head.py
@@ -5,6 +5,7 @@ from torch import nn
 from modelscope.metainfo import Heads
 from modelscope.models.base import TorchHead
 from modelscope.models.builder import HEADS
+from modelscope.outputs import InformationExtractionOutput, ModelOutputBase
 from modelscope.utils.constant import Tasks
 
 
@@ -14,19 +15,27 @@ from modelscope.utils.constant import Tasks
     Tasks.relation_extraction, module_name=Heads.information_extraction)
 class InformationExtractionHead(TorchHead):
 
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        config = self.config
-        assert config.get('labels') is not None
-        self.labels = config.labels
-        self.s_layer = nn.Linear(config.hidden_size, 2)  # head, tail, bce
-        self.o_layer = nn.Linear(2 * config.hidden_size, 2)  # head, tail, bce
-        self.p_layer = nn.Linear(config.hidden_size,
-                                 len(self.labels))  # label, ce
-        self.mha = nn.MultiheadAttention(config.hidden_size, 4)
+    def __init__(self, hidden_size=768, labels=None, **kwargs):
+        super().__init__(hidden_size=hidden_size, labels=labels)
+        assert labels is not None
+        self.labels = labels
+        self.s_layer = nn.Linear(hidden_size, 2)  # head, tail, bce
+        self.o_layer = nn.Linear(2 * hidden_size, 2)  # head, tail, bce
+        self.p_layer = nn.Linear(hidden_size, len(self.labels))  # label, ce
+        self.mha = nn.MultiheadAttention(hidden_size, 4)
 
-    def forward(self, sequence_output, text, offsets, threshold=0.5):
-        # assert batch size == 1
+    def forward(self,
+                inputs: ModelOutputBase,
+                attention_mask=None,
+                labels=None,
+                text=None,
+                offsets=None,
+                threshold=0.5,
+                **kwargs) -> InformationExtractionOutput:
+
+        assert text is not None
+        assert offsets is not None
+        sequence_output = inputs.last_hidden_state
         spos = []
         s_head_logits, s_tail_logits = self.s_layer(sequence_output).split(
             1, dim=-1)  # (b, seq_len, 2)
@@ -64,7 +73,8 @@ class InformationExtractionHead(TorchHead):
                     if label[i] > threshold:
                         predicate = self.labels[i]
                         spos.append((subject, predicate, object))
-        return spos
+
+        return InformationExtractionOutput(spo_list=spos)
 
     def _get_masks_and_mentions(self,
                                 text,
diff --git a/modelscope/models/nlp/heads/sequence_classification_head.py b/modelscope/models/nlp/heads/sequence_classification_head.py
deleted file mode 100644
index fb03b7ff..00000000
--- a/modelscope/models/nlp/heads/sequence_classification_head.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from typing import Dict
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-
-from modelscope.metainfo import Heads
-from modelscope.models.base import TorchHead
-from modelscope.models.builder import HEADS
-from modelscope.outputs import OutputKeys
-from modelscope.utils.constant import Tasks
-
-
-@HEADS.register_module(
-    Tasks.text_classification, module_name=Heads.text_classification)
-class SequenceClassificationHead(TorchHead):
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        config = self.config
-        self.num_labels = config.num_labels
-        classifier_dropout = (
-            config['classifier_dropout'] if config.get('classifier_dropout')
-            is not None else config['hidden_dropout_prob'])
-        self.dropout = nn.Dropout(classifier_dropout)
-        self.classifier = nn.Linear(config['hidden_size'],
-                                    config['num_labels'])
-
-    def forward(self, inputs=None):
-        if isinstance(inputs, dict):
-            assert inputs.get('pooled_output') is not None
-            pooled_output = inputs.get('pooled_output')
-        else:
-            pooled_output = inputs
-        pooled_output = self.dropout(pooled_output)
-        logits = self.classifier(pooled_output)
-        return {OutputKeys.LOGITS: logits}
-
-    def compute_loss(self, outputs: Dict[str, torch.Tensor],
-                     labels) -> Dict[str, torch.Tensor]:
-        logits = outputs[OutputKeys.LOGITS]
-        return {OutputKeys.LOSS: F.cross_entropy(logits, labels)}
diff --git a/modelscope/models/nlp/heads/text_classification_head.py b/modelscope/models/nlp/heads/text_classification_head.py
new file mode 100644
index 00000000..427b7785
--- /dev/null
+++ b/modelscope/models/nlp/heads/text_classification_head.py
@@ -0,0 +1,59 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Dict
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from modelscope.metainfo import Heads
+from modelscope.models.base import TorchHead
+from modelscope.models.builder import HEADS
+from modelscope.outputs import (AttentionTextClassificationModelOutput,
+                                ModelOutputBase, OutputKeys)
+from modelscope.utils.constant import Tasks
+
+
+@HEADS.register_module(
+    Tasks.text_classification, module_name=Heads.text_classification)
+@HEADS.register_module(
+    Tasks.sentence_similarity, module_name=Heads.text_classification)
+@HEADS.register_module(Tasks.nli, module_name=Heads.text_classification)
+@HEADS.register_module(
+    Tasks.sentiment_classification, module_name=Heads.text_classification)
+class TextClassificationHead(TorchHead):
+
+    def __init__(self,
+                 hidden_size=768,
+                 classifier_dropout=0.1,
+                 num_labels=None,
+                 **kwargs):
+        super().__init__(
+            hidden_size=hidden_size,
+            classifier_dropout=classifier_dropout,
+            num_labels=num_labels,
+        )
+        assert num_labels is not None
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(hidden_size, num_labels)
+
+    def forward(self,
+                inputs: ModelOutputBase,
+                attention_mask=None,
+                labels=None,
+                **kwargs):
+        pooler_output = inputs.pooler_output
+        pooler_output = self.dropout(pooler_output)
+        logits = self.classifier(pooler_output)
+        loss = None
+        if labels is not None:
+            loss = self.compute_loss(logits, labels)
+
+        return AttentionTextClassificationModelOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=inputs.hidden_states,
+            attentions=inputs.attentions,
+        )
+
+    def compute_loss(self, logits: torch.Tensor, labels) -> torch.Tensor:
+        return F.cross_entropy(logits, labels)
diff --git a/modelscope/models/nlp/heads/text_generation_head.py b/modelscope/models/nlp/heads/text_generation_head.py
index ecb02e22..169760b6 100644
--- a/modelscope/models/nlp/heads/text_generation_head.py
+++ b/modelscope/models/nlp/heads/text_generation_head.py
@@ -24,7 +24,7 @@ class TextGenerationHead(TorchHead):
     def get_output_embeddings(self):
         return self.linear
 
-    def forward(self, inputs=None):
+    def forward(self, inputs=None, **kwargs):
         logits = self.linear(inputs)
         return logits
 
diff --git a/modelscope/models/nlp/heads/text_ranking_head.py b/modelscope/models/nlp/heads/text_ranking_head.py
new file mode 100644
index 00000000..9b52d0cc
--- /dev/null
+++ b/modelscope/models/nlp/heads/text_ranking_head.py
@@ -0,0 +1,60 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Dict
+
+import torch
+from torch import nn
+
+from modelscope.metainfo import Heads
+from modelscope.models.base import TorchHead
+from modelscope.models.builder import HEADS
+from modelscope.outputs import (AttentionTextClassificationModelOutput,
+                                ModelOutputBase, OutputKeys)
+from modelscope.utils.constant import Tasks
+
+
+@HEADS.register_module(Tasks.text_ranking, module_name=Heads.text_ranking)
+class TextRankingHead(TorchHead):
+
+    def __init__(self,
+                 hidden_size=768,
+                 classifier_dropout=0.1,
+                 num_labels=1,
+                 neg_sample=4,
+                 **kwargs):
+        super().__init__(
+            hidden_size=hidden_size,
+            classifier_dropout=classifier_dropout,
+            num_labels=num_labels,
+            neg_sample=neg_sample,
+        )
+        self.neg_sample = neg_sample
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(hidden_size, num_labels)
+
+    def forward(self,
+                inputs: ModelOutputBase,
+                attention_mask=None,
+                labels=None,
+                **kwargs):
+        pooler_output = inputs.pooler_output
+        pooler_output = self.dropout(pooler_output)
+        logits = self.classifier(pooler_output)
+        loss = None
+        if self.training:
+            loss = self.compute_loss(logits)
+
+        return AttentionTextClassificationModelOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=inputs.hidden_states,
+            attentions=inputs.attentions,
+        )
+
+    def compute_loss(self, logits: torch.Tensor) -> torch.Tensor:
+        scores = logits.view(-1, self.neg_sample + 1)
+        batch_size = scores.size(0)
+        loss_fct = torch.nn.CrossEntropyLoss()
+        target_label = torch.zeros(
+            batch_size, dtype=torch.long, device=scores.device)
+        loss = loss_fct(scores, target_label)
+        return loss
diff --git a/modelscope/models/nlp/heads/token_classification_head.py b/modelscope/models/nlp/heads/token_classification_head.py
index 443b24e3..66f5bcd1 100644
--- a/modelscope/models/nlp/heads/token_classification_head.py
+++ b/modelscope/models/nlp/heads/token_classification_head.py
@@ -4,11 +4,13 @@ from typing import Dict
 import torch
 import torch.nn.functional as F
 from torch import nn
+from torch.nn import CrossEntropyLoss
 
 from modelscope.metainfo import Heads
 from modelscope.models.base import TorchHead
 from modelscope.models.builder import HEADS
-from modelscope.outputs import OutputKeys
+from modelscope.outputs import (AttentionTokenClassificationModelOutput,
+                                ModelOutputBase, OutputKeys)
 from modelscope.utils.constant import Tasks
 
 
@@ -20,28 +22,49 @@ from modelscope.utils.constant import Tasks
     Tasks.part_of_speech, module_name=Heads.token_classification)
 class TokenClassificationHead(TorchHead):
 
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        config = self.config
-        self.num_labels = config.num_labels
-        classifier_dropout = (
-            config['classifier_dropout'] if config.get('classifier_dropout')
-            is not None else config['hidden_dropout_prob'])
+    def __init__(self,
+                 hidden_size=768,
+                 classifier_dropout=0.1,
+                 num_labels=None,
+                 **kwargs):
+        super().__init__(
+            num_labels=num_labels,
+            classifier_dropout=classifier_dropout,
+            hidden_size=hidden_size,
+        )
+        assert num_labels is not None
         self.dropout = nn.Dropout(classifier_dropout)
-        self.classifier = nn.Linear(config['hidden_size'],
-                                    config['num_labels'])
+        self.classifier = nn.Linear(hidden_size, num_labels)
 
-    def forward(self, inputs=None):
-        if isinstance(inputs, dict):
-            assert inputs.get('sequence_output') is not None
-            sequence_output = inputs.get('sequence_output')
-        else:
-            sequence_output = inputs
+    def forward(self,
+                inputs: ModelOutputBase,
+                attention_mask=None,
+                labels=None,
+                **kwargs):
+        sequence_output = inputs.last_hidden_state
         sequence_output = self.dropout(sequence_output)
         logits = self.classifier(sequence_output)
-        return logits
+        loss = None
+        if labels is not None:
+            loss = self.compute_loss(logits, attention_mask, labels)
 
-    def compute_loss(self, outputs: Dict[str, torch.Tensor],
-                     labels) -> Dict[str, torch.Tensor]:
-        logits = outputs[OutputKeys.LOGITS]
-        return F.cross_entropy(logits, labels)
+        return AttentionTokenClassificationModelOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=inputs.hidden_states,
+            attentions=inputs.attentions)
+
+    def compute_loss(self, logits: torch.Tensor, attention_mask,
+                     labels) -> torch.Tensor:
+        loss_fct = CrossEntropyLoss()
+        # Only keep active parts of the loss
+        if attention_mask is not None:
+            active_loss = attention_mask.view(-1) == 1
+            active_logits = logits.view(-1, self.num_labels)
+            active_labels = torch.where(
+                active_loss, labels.view(-1),
+                torch.tensor(loss_fct.ignore_index).type_as(labels))
+            loss = loss_fct(active_logits, active_labels)
+        else:
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+        return loss
diff --git a/modelscope/models/nlp/hf_transformers/__init__.py b/modelscope/models/nlp/hf_transformers/__init__.py
new file mode 100644
index 00000000..71acd693
--- /dev/null
+++ b/modelscope/models/nlp/hf_transformers/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .backbone import TransformersModel
+else:
+    _import_structure = {
+        'backbone': ['TransformersModel'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/nlp/hf_transformers/backbone.py b/modelscope/models/nlp/hf_transformers/backbone.py
new file mode 100644
index 00000000..60321ae9
--- /dev/null
+++ b/modelscope/models/nlp/hf_transformers/backbone.py
@@ -0,0 +1,109 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch BERT model. """
+
+from transformers import AutoConfig, AutoModel
+from transformers.modeling_utils import PreTrainedModel
+
+from modelscope.metainfo import Models
+from modelscope.models import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.outputs import AttentionBackboneModelOutput
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+from modelscope.utils.nlp.utils import parse_labels_in_order
+
+logger = get_logger()
+
+
+def _get_model_class(config, model_mapping):
+    supported_models = model_mapping[type(config)]
+    if not isinstance(supported_models, (list, tuple)):
+        return supported_models
+
+    name_to_model = {model.__name__: model for model in supported_models}
+    architectures = getattr(config, 'architectures', [])
+    for arch in architectures:
+        if arch in name_to_model:
+            return name_to_model[arch]
+        elif f'TF{arch}' in name_to_model:
+            return name_to_model[f'TF{arch}']
+        elif f'Flax{arch}' in name_to_model:
+            return name_to_model[f'Flax{arch}']
+
+    # If not architecture is set in the config or match the supported models, the first element of the tuple is the
+    # defaults.
+    return supported_models[0]
+
+
+@MODELS.register_module(
+    group_key=Tasks.backbone, module_name=Models.transformers)
+class TransformersModel(TorchModel, PreTrainedModel):
+    """The Bert Model transformer outputting raw hidden-states without any
+    specific head on top.
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass
+    documentation for the generic methods the library implements for all its
+    model (such as downloading or saving, resizing the input embeddings, pruning
+    heads etc.)
+
+    This model is also a PyTorch
+    [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch
+    documentation for all matter related to general usage and behavior.
+
+    Parameters:
+        config ([`BertConfig`]): Model configuration class with all the
+        parameters of the model.
+            Initializing with a config file does not load the weights associated
+            with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model
+            weights.
+
+    The model can behave as an encoder (with only self-attention) as well as a
+    decoder, in which case a layer of cross-attention is added between the
+    self-attention layers, following the architecture described in [Attention is
+    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam
+    Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
+    Kaiser and Illia Polosukhin.
+
+    To behave as an decoder the model needs to be initialized with the
+    `is_decoder` argument of the configuration set to `True`. To be used in a
+    Seq2Seq model, the model needs to initialized with both `is_decoder`
+    argument and `add_cross_attention` set to `True`; an `encoder_hidden_states`
+    is then expected as an input to the forward pass.
+
+
+    """
+
+    @classmethod
+    def _instantiate(cls, model_dir=None, **config):
+        config, kwargs = AutoConfig.from_pretrained(
+            model_dir,
+            return_unused_kwargs=True,
+            trust_remote_code=False,
+            **config)
+        model_mapping = AutoModel._model_mapping
+        if type(config) in model_mapping.keys():
+            model_class = _get_model_class(config, model_mapping)
+            model = model_class(config)
+            model.model_dir = model_dir
+            return model
+
+        raise ValueError(
+            f'Unrecognized configuration class {config.__class__} for the AutoModel'
+            f"Model type should be one of {', '.join(c.__name__ for c in model_mapping.keys())}."
+        )
diff --git a/modelscope/models/nlp/lstm/__init__.py b/modelscope/models/nlp/lstm/__init__.py
new file mode 100644
index 00000000..61e94ee5
--- /dev/null
+++ b/modelscope/models/nlp/lstm/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .backbone import LSTMModel
+    from .token_classification import LSTMForTokenClassificationWithCRF
+else:
+    _import_structure = {
+        'backbone': ['LSTM'],
+        'token_classification': ['LSTMForTokenClassificationWithCRF'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/nlp/lstm/backbone.py b/modelscope/models/nlp/lstm/backbone.py
new file mode 100644
index 00000000..137bb921
--- /dev/null
+++ b/modelscope/models/nlp/lstm/backbone.py
@@ -0,0 +1,41 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+"""PyTorch LSTM model. """
+
+import torch.nn as nn
+
+from modelscope.metainfo import Models
+from modelscope.models import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.outputs import BackboneModelOutput
+from modelscope.utils.constant import Tasks
+
+
+@MODELS.register_module(group_key=Tasks.backbone, module_name=Models.lstm)
+class LSTMModel(TorchModel):
+
+    def __init__(self, vocab_size, embed_width, hidden_size=100, **kwargs):
+        super().__init__()
+        hidden_size = kwargs.get('lstm_hidden_size', hidden_size)
+        self.embedding = Embedding(vocab_size, embed_width)
+        self.lstm = nn.LSTM(
+            embed_width,
+            hidden_size,
+            num_layers=1,
+            bidirectional=True,
+            batch_first=True)
+
+    def forward(self, input_ids, **kwargs) -> BackboneModelOutput:
+        embedding = self.embedding(input_ids)
+        lstm_output, _ = self.lstm(embedding)
+        return BackboneModelOutput(last_hidden_state=lstm_output)
+
+
+class Embedding(nn.Module):
+
+    def __init__(self, vocab_size, embed_width):
+        super(Embedding, self).__init__()
+
+        self.embedding = nn.Embedding(vocab_size, embed_width)
+
+    def forward(self, input_ids):
+        return self.embedding(input_ids)
diff --git a/modelscope/models/nlp/lstm/token_classification.py b/modelscope/models/nlp/lstm/token_classification.py
new file mode 100644
index 00000000..5ec2cc09
--- /dev/null
+++ b/modelscope/models/nlp/lstm/token_classification.py
@@ -0,0 +1,49 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from modelscope.metainfo import Heads, Models
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.task_models import (
+    ModelForTokenClassification, ModelForTokenClassificationWithCRF)
+from modelscope.utils import logger as logging
+from modelscope.utils.constant import Tasks
+
+logger = logging.get_logger()
+
+
+@MODELS.register_module(Tasks.token_classification, module_name=Models.lcrf)
+@MODELS.register_module(
+    Tasks.named_entity_recognition, module_name=Models.lcrf)
+@MODELS.register_module(Tasks.part_of_speech, module_name=Models.lcrf)
+@MODELS.register_module(Tasks.word_segmentation, module_name=Models.lcrf)
+@MODELS.register_module(Tasks.word_segmentation, module_name=Models.lcrf_wseg)
+class LSTMForTokenClassificationWithCRF(ModelForTokenClassificationWithCRF):
+    r"""Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks, word-segmentation.
+
+    """
+    override_base_model_type = True
+    base_model_type = Models.lstm
+    head_type = Heads.lstm_crf
+
+    def parse_head_cfg(self):
+        head_cfg = super(ModelForTokenClassification, self).parse_head_cfg()
+        head_cfg['hidden_size'] = (
+            head_cfg.hidden_size
+            if hasattr(head_cfg, 'hidden_size') else head_cfg.lstm_hidden_size)
+        head_cfg['num_labels'] = self.config.num_labels
+        return head_cfg
diff --git a/modelscope/models/nlp/megatron_bert/__init__.py b/modelscope/models/nlp/megatron_bert/__init__.py
index c39609e7..c0aa427d 100644
--- a/modelscope/models/nlp/megatron_bert/__init__.py
+++ b/modelscope/models/nlp/megatron_bert/__init__.py
@@ -11,7 +11,7 @@ else:
     _import_structure = {
         'configuration': ['MegatronBertConfig'],
         'backbone': ['MegatronBertModel'],
-        'distributed_plug': ['MegatronBertForMaskedLM'],
+        'fill_mask': ['MegatronBertForMaskedLM'],
     }
 
     import sys
diff --git a/modelscope/models/nlp/megatron_bert/backbone.py b/modelscope/models/nlp/megatron_bert/backbone.py
index 56bea4ae..92bc7475 100644
--- a/modelscope/models/nlp/megatron_bert/backbone.py
+++ b/modelscope/models/nlp/megatron_bert/backbone.py
@@ -712,88 +712,88 @@ class MegatronBertModel(MegatronBertPreTrainedModel):
                 **kwargs) -> AttentionBackboneModelOutput:
         r"""
         Args:
-        input_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
 
-            Indices can be obtained using [`BertTokenizer`]. See
-            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
-            for details.
+                Indices can be obtained using [`BertTokenizer`]. See
+                [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
+                for details.
 
-            [What are input IDs?](../glossary#input-ids)
-        attention_mask (`torch.FloatTensor` of shape `((batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask
-            values selected in `[0, 1]`:
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask
+                values selected in `[0, 1]`:
 
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
 
-            [What are attention masks?](../glossary#attention-mask)
-        token_type_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`, *optional*):
-            Segment token indices to indicate first and second portions of the
-            inputs. Indices are selected in `[0, 1]`:
+                [What are attention masks?](../glossary#attention-mask)
+            token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Segment token indices to indicate first and second portions of the
+                inputs. Indices are selected in `[0, 1]`:
 
-            - 0 corresponds to a *sentence A* token,
-            - 1 corresponds to a *sentence B* token.
+                - 0 corresponds to a *sentence A* token,
+                - 1 corresponds to a *sentence B* token.
 
-            [What are token type IDs?](../glossary#token-type-ids)
-        position_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`, *optional*):
-            Indices of positions of each input sequence tokens in the position
-            embeddings. Selected in the range `[0,
-            config.max_position_embeddings - 1]`.
+                [What are token type IDs?](../glossary#token-type-ids)
+            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Indices of positions of each input sequence tokens in the position
+                embeddings. Selected in the range `[0,
+                config.max_position_embeddings - 1]`.
 
-            [What are position IDs?](../glossary#position-ids)
-        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers,
-        num_heads)`, *optional*):
-            Mask to nullify selected heads of the self-attention modules. Mask
-            values selected in `[0, 1]`:
+                [What are position IDs?](../glossary#position-ids)
+            head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers,
+                num_heads)`, *optional*):
+                Mask to nullify selected heads of the self-attention modules. Mask
+                values selected in `[0, 1]`:
 
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
 
-        inputs_embeds (`torch.FloatTensor` of shape `((batch_size, sequence_length, hidden_size)`,
-        *optional*):
-            Optionally, instead of passing `input_ids` you can choose to
-            directly pass an embedded representation. This is useful if you want
-            more control over how to convert `input_ids` indices into associated
-            vectors than the model's internal embedding lookup matrix.
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention
-            layers. See `attentions` under returned tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See
-            `hidden_states` under returned tensors for more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~file_utils.ModelOutput`] instead of a
-            plain tuple.
-        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size,
-        sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the
-            encoder. Used in the cross-attention if the model is configured as a
-            decoder.
-        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size,
-        sequence_length)`, *optional*):
-            Mask to avoid performing attention on the padding token indices of
-            the encoder input. This mask is used in the cross-attention if the
-            model is configured as a decoder. Mask values selected in `[0, 1]`:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`,
+                *optional*):
+                Optionally, instead of passing `input_ids` you can choose to
+                directly pass an embedded representation. This is useful if you want
+                more control over how to convert `input_ids` indices into associated
+                vectors than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention
+                layers. See `attentions` under returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See
+                `hidden_states` under returned tensors for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a
+                plain tuple.
+            encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size,
+                sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the
+                encoder. Used in the cross-attention if the model is configured as a
+                decoder.
+            encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size,
+                sequence_length)`, *optional*):
+                Mask to avoid performing attention on the padding token indices of
+                the encoder input. This mask is used in the cross-attention if the
+                model is configured as a decoder. Mask values selected in `[0, 1]`:
 
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-        past_key_values (`tuple(tuple(torch.FloatTensor))` of length
-        `config.n_layers` with each tuple having 4 tensors of shape
-        `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
-            Contains precomputed key and value hidden states of the attention
-            blocks. Can be used to speed up decoding.
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+            past_key_values (`tuple(tuple(torch.FloatTensor))` of length
+                `config.n_layers` with each tuple having 4 tensors of shape
+                `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+                Contains precomputed key and value hidden states of the attention
+                blocks. Can be used to speed up decoding.
 
-            If `past_key_values` are used, the user can optionally input only
-            the last `decoder_input_ids` (those that don't have their past key
-            value states given to this model) of shape `(batch_size, 1)` instead
-            of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned
-            and can be used to speed up decoding (see `past_key_values`).
-        Others (**kwargs)
-            some additional parameters might passed in from upstream pipeline,
-            which not influence the results.
+                If `past_key_values` are used, the user can optionally input only
+                the last `decoder_input_ids` (those that don't have their past key
+                value states given to this model) of shape `(batch_size, 1)` instead
+                of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned
+                and can be used to speed up decoding (see `past_key_values`).
+            Others (**kwargs)
+                some additional parameters might passed in from upstream pipeline,
+                which not influence the results.
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
diff --git a/modelscope/models/nlp/megatron_bert/configuration.py b/modelscope/models/nlp/megatron_bert/configuration.py
index 951fd7d1..85769ad6 100644
--- a/modelscope/models/nlp/megatron_bert/configuration.py
+++ b/modelscope/models/nlp/megatron_bert/configuration.py
@@ -77,7 +77,6 @@ class MegatronBertConfig(PretrainedConfig):
 
     Examples:
 
-    ```python
     >>> from transformers import MegatronBertConfig, MegatronBertModel
 
     >>> # Initializing a MEGATRON_BERT bert-base-uncased style configuration
@@ -88,7 +87,7 @@ class MegatronBertConfig(PretrainedConfig):
 
     >>> # Accessing the model configuration
     >>> configuration = model.config
-    ```"""
+    """
     model_type = 'megatron-bert'
 
     def __init__(self,
diff --git a/modelscope/models/nlp/megatron_bert/fill_mask.py b/modelscope/models/nlp/megatron_bert/fill_mask.py
index 2aa51d3d..652cf1ed 100644
--- a/modelscope/models/nlp/megatron_bert/fill_mask.py
+++ b/modelscope/models/nlp/megatron_bert/fill_mask.py
@@ -158,59 +158,61 @@ class MegatronBertForMaskedLM(MegatronBertPreTrainedModel):
     ):
         r"""
         Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
+            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
 
-            Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
-            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
-            details.
+                Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
+                :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+                for details.
 
-            `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+                `What are input IDs? <../glossary.html#input-ids>`__
+            attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
 
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
 
-            `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
-            1]``:
+                `What are attention masks? <../glossary.html#attention-mask>`__
+            token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
+                1]``:
 
-            - 0 corresponds to a `sentence A` token,
-            - 1 corresponds to a `sentence B` token.
+                - 0 corresponds to a `sentence A` token,
+                - 1 corresponds to a `sentence B` token.
 
-            `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
-            config.max_position_embeddings - 1]``.
+                `What are token type IDs? <../glossary.html#token-type-ids>`_
+            position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
+                ``[0,config.max_position_embeddings - 1]``.
 
-            `What are position IDs? <../glossary.html#position-ids>`_
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+                `What are position IDs? <../glossary.html#position-ids>`_
+            head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`,
+                `optional`):
+                Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
 
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
 
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
-            vectors than the model's internal embedding lookup matrix.
-        output_attentions (:obj:`bool`, `optional`):
-            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
-            tensors for more detail.
-        output_hidden_states (:obj:`bool`, `optional`):
-            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
-            more detail.
-        return_dict (:obj:`bool`, `optional`):
-            Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`,
-        *optional*):
-            Labels for computing the masked language modeling loss. Indices
-            should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids`
-            docstring) Tokens with indices set to `-100` are ignored (masked),
-            the loss is only computed for the tokens with labels in `[0, ...,
-            config.vocab_size]`
+            inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
+                `optional`):
+                Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
+                representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+                into associated vectors than the model's internal embedding lookup matrix.
+            output_attentions (:obj:`bool`, `optional`):
+                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+                returned tensors for more detail.
+            output_hidden_states (:obj:`bool`, `optional`):
+                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+                for more detail.
+            return_dict (:obj:`bool`, `optional`):
+                Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`,
+            *optional*):
+                Labels for computing the masked language modeling loss. Indices
+                should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids`
+                docstring) Tokens with indices set to `-100` are ignored (masked),
+                the loss is only computed for the tokens with labels in `[0, ...,
+                config.vocab_size]`
 
         Returns:
             Returns `modelscope.outputs.AttentionFillMaskModelOutput`
diff --git a/modelscope/models/nlp/mglm/data_utils/__init__.py b/modelscope/models/nlp/mglm/data_utils/__init__.py
index fa243cb4..67d465c5 100644
--- a/modelscope/models/nlp/mglm/data_utils/__init__.py
+++ b/modelscope/models/nlp/mglm/data_utils/__init__.py
@@ -39,10 +39,10 @@ def should_split(split):
     """
     given split proportions checks if should split
     Examples:
-    >>> should_split([10,0,0])
-    False
-    >>> should_split([1,.1,.2])
-    True
+        >>> should_split([10,0,0])
+        >>> False
+        >>> should_split([1,.1,.2])
+        >>> True
     """
     return max(split) / sum(split) != 1.
 
diff --git a/modelscope/models/nlp/mglm/data_utils/extraction.py b/modelscope/models/nlp/mglm/data_utils/extraction.py
index da062f34..11e0ed62 100644
--- a/modelscope/models/nlp/mglm/data_utils/extraction.py
+++ b/modelscope/models/nlp/mglm/data_utils/extraction.py
@@ -6,68 +6,75 @@ import os
 import json
 import nltk
 
-nltk.download('punkt')
-
 
 class NLTKSegmenter:
 
     def __init(self):
-        pass
+        download_nltk()
 
     @staticmethod
     def segment_string(article):
         return nltk.tokenize.sent_tokenize(article)
 
 
-wiki_path = 'data/extracted'
-output_path = 'formatted/wiki-key.txt'
-segmenter = NLTKSegmenter()
-with open(output_path, 'w') as output:
-    for dirname in glob.glob(os.path.join(wiki_path, '*'), recursive=False):
-        for filename in glob.glob(
-                os.path.join(dirname, 'wiki_*'), recursive=True):
-            print(filename)
-            article_lines = []
-            article_open = False
-            with open(
-                    filename, mode='r', newline='\n',
-                    encoding='utf-8') as file:
-                for line in file:
-                    line = line.rstrip()
-                    if '<doc id=' in line:
-                        article_open = True
-                    elif '</doc>' in line:
-                        key_sentences, contents = [], []
-                        key, content = None, []
-                        for sentences in article_lines[1:]:
-                            if len(sentences) > 1:
-                                if key:
-                                    if len(content) > 0 or len(contents) == 0:
-                                        key_sentences.append(key)
-                                        contents.append(content)
+def download_nltk():
+    nltk.download('punkt')
+    wiki_path = 'data/extracted'
+    output_path = 'formatted/wiki-key.txt'
+    segmenter = NLTKSegmenter()
+    with open(output_path, 'w') as output:
+        for dirname in glob.glob(
+                os.path.join(wiki_path, '*'), recursive=False):
+            for filename in glob.glob(
+                    os.path.join(dirname, 'wiki_*'), recursive=True):
+                print(filename)
+                article_lines = []
+                article_open = False
+                with open(
+                        filename, mode='r', newline='\n',
+                        encoding='utf-8') as file:
+                    for line in file:
+                        line = line.rstrip()
+                        if '<doc id=' in line:
+                            article_open = True
+                        elif '</doc>' in line:
+                            key_sentences, contents = [], []
+                            key, content = None, []
+                            for sentences in article_lines[1:]:
+                                if len(sentences) > 1:
+                                    if key:
+                                        if len(content) > 0 or len(
+                                                contents) == 0:
+                                            key_sentences.append(key)
+                                            contents.append(content)
+                                        else:
+                                            contents[-1].append(key)
+                                        key, content = None, []
+                                    key_sentences.append(sentences[0])
+                                    contents.append(sentences[1:])
+                                elif len(sentences) > 0:
+                                    if key:
+                                        content.append(sentences[0])
                                     else:
-                                        contents[-1].append(key)
-                                    key, content = None, []
-                                key_sentences.append(sentences[0])
-                                contents.append(sentences[1:])
-                            elif len(sentences) > 0:
-                                if key:
-                                    content.append(sentences[0])
+                                        key = sentences[0]
+                            if key:
+                                if len(content) > 0 or len(contents) == 0:
+                                    key_sentences.append(key)
+                                    contents.append(content)
                                 else:
-                                    key = sentences[0]
-                        if key:
-                            if len(content) > 0 or len(contents) == 0:
-                                key_sentences.append(key)
-                                contents.append(content)
-                            else:
-                                contents[-1].append(key)
-                        contents = [' '.join(content) for content in contents]
-                        article = {'key': key_sentences, 'content': contents}
-                        output.write(json.dumps(article))
-                        output.write('\n')
-                        article_open = False
-                        article_lines = []
-                    else:
-                        if article_open and line:
-                            sentences = segmenter.segment_string(line)
-                            article_lines.append(sentences)
+                                    contents[-1].append(key)
+                            contents = [
+                                ' '.join(content) for content in contents
+                            ]
+                            article = {
+                                'key': key_sentences,
+                                'content': contents
+                            }
+                            output.write(json.dumps(article))
+                            output.write('\n')
+                            article_open = False
+                            article_lines = []
+                        else:
+                            if article_open and line:
+                                sentences = segmenter.segment_string(line)
+                                article_lines.append(sentences)
diff --git a/modelscope/models/nlp/mglm/data_utils/lazy_loader.py b/modelscope/models/nlp/mglm/data_utils/lazy_loader.py
index 77a77a8a..bf24556e 100644
--- a/modelscope/models/nlp/mglm/data_utils/lazy_loader.py
+++ b/modelscope/models/nlp/mglm/data_utils/lazy_loader.py
@@ -138,12 +138,12 @@ class LazyLoader(object):
         map_fn (callable): Fetched strings are passed through map_fn before being returned.
 
     Example of lazy loader directory structure:
-    file.json
-    file.lazy/
-        data_type1
-        data_type1.len.pkl
-        data_type2
-        data_type2.len.pkl
+        file.json
+        file.lazy/
+            data_type1
+            data_type1.len.pkl
+            data_type2
+            data_type2.len.pkl
     """
 
     def __init__(self,
diff --git a/modelscope/models/nlp/mglm/data_utils/wordpiece.py b/modelscope/models/nlp/mglm/data_utils/wordpiece.py
index 1cecffbd..b2a3fd3d 100755
--- a/modelscope/models/nlp/mglm/data_utils/wordpiece.py
+++ b/modelscope/models/nlp/mglm/data_utils/wordpiece.py
@@ -323,8 +323,8 @@ class WordpieceTokenizer(object):
         using the given vocabulary.
 
         For example:
-          input = "unaffable"
-          output = ["un", "##aff", "##able"]
+          >>> input = "unaffable"
+          >>> output = ["un", "##aff", "##able"]
 
         Args:
           text: A single token or whitespace separated tokens. This should have
diff --git a/modelscope/models/nlp/mglm/model/modeling_bert.py b/modelscope/models/nlp/mglm/model/modeling_bert.py
index 63f21224..28b5cd1e 100644
--- a/modelscope/models/nlp/mglm/model/modeling_bert.py
+++ b/modelscope/models/nlp/mglm/model/modeling_bert.py
@@ -926,19 +926,17 @@ class BertModel(PreTrainedBertModel):
             classifier pretrained on top of the hidden state associated to the first character of the
             input (`CLF`) to train on the Next-Sentence task (see BERT's paper).
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+    Examples:
+        >>> # Already been converted into WordPiece token ids
+        >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+        >>> input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+        >>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
-    config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        >>> config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        >>>     num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    model = modeling.BertModel(config=config)
-    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-    ```
+        >>> model = modeling.BertModel(config=config)
+        >>> all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     """ # noqa
 
     def __init__(self, config):
@@ -1029,19 +1027,17 @@ class BertForPreTraining(PreTrainedBertModel):
             - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
             - the next sentence classification logits of shape [batch_size, 2].
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+    Examples:
+        >>> # Already been converted into WordPiece token ids
+        >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+        >>> input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+        >>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
-    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        >>> config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        >>>     num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    model = BertForPreTraining(config)
-    masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
-    ```
+        >>> model = BertForPreTraining(config)
+        >>> masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
     """
 
     def __init__(self, config):
@@ -1109,19 +1105,17 @@ class BertForMaskedLM(PreTrainedBertModel):
         if `masked_lm_labels` is `None`:
             Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size].
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+    Examples:
+        >>> # Already been converted into WordPiece token ids
+        >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+        >>> input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+        >>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
-    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        >>> config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        >>>     num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    model = BertForMaskedLM(config)
-    masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
-    ```
+        >>> model = BertForMaskedLM(config)
+        >>> masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
     """
 
     def __init__(self, config):
@@ -1184,19 +1178,17 @@ class BertForNextSentencePrediction(PreTrainedBertModel):
         if `next_sentence_label` is `None`:
             Outputs the next sentence classification logits of shape [batch_size, 2].
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+    Examples:
+        >>> # Already been converted into WordPiece token ids
+        >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+        >>> input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+        >>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
-    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        >>> config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        >>>     num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    model = BertForNextSentencePrediction(config)
-    seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
-    ```
+        >>> model = BertForNextSentencePrediction(config)
+        >>> seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
     """
 
     def __init__(self, config):
@@ -1258,21 +1250,19 @@ class BertForSequenceClassification(PreTrainedBertModel):
         if `labels` is `None`:
             Outputs the classification logits of shape [batch_size, num_labels].
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+    Examples:
+        >>> # Already been converted into WordPiece token ids
+        >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+        >>> input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+        >>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
-    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        >>> config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        >>>     num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    num_labels = 2
+        >>> num_labels = 2
 
-    model = BertForSequenceClassification(config, num_labels)
-    logits = model(input_ids, token_type_ids, input_mask)
-    ```
+        >>> model = BertForSequenceClassification(config, num_labels)
+        >>> logits = model(input_ids, token_type_ids, input_mask)
     """
 
     def __init__(self, config, num_labels=2):
@@ -1335,20 +1325,18 @@ class BertForMultipleChoice(PreTrainedBertModel):
         if `labels` is `None`:
             Outputs the classification logits of shape [batch_size, num_labels].
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
-    input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
-    token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
-    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+    Examples:
+        >>> # Already been converted into WordPiece token ids
+        >>> input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
+        >>> input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
+        >>> token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
+        >>> config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        >>>     num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    num_choices = 2
+        >>> num_choices = 2
 
-    model = BertForMultipleChoice(config, num_choices)
-    logits = model(input_ids, token_type_ids, input_mask)
-    ```
+        >>> model = BertForMultipleChoice(config, num_choices)
+        >>> logits = model(input_ids, token_type_ids, input_mask)
     """
 
     def __init__(self, config):
@@ -1417,21 +1405,19 @@ class BertForTokenClassification(PreTrainedBertModel):
         if `labels` is `None`:
             Outputs the classification logits of shape [batch_size, sequence_length, num_labels].
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+    Examples:
+        >>> # Already been converted into WordPiece token ids
+        >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+        >>> input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+        >>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
-    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        >>> config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        >>>     num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    num_labels = 2
+        >>> num_labels = 2
 
-    model = BertForTokenClassification(config, num_labels)
-    logits = model(input_ids, token_type_ids, input_mask)
-    ```
+        >>> model = BertForTokenClassification(config, num_labels)
+        >>> logits = model(input_ids, token_type_ids, input_mask)
     """
 
     def __init__(self, config, num_labels=2):
@@ -1507,19 +1493,17 @@ class BertForQuestionAnswering(PreTrainedBertModel):
             Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
             position tokens of shape [batch_size, sequence_length].
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+    Examples:
+        >>> # Already been converted into WordPiece token ids
+        >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+        >>> input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+        >>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
-    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        >>> config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        >>>     num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    model = BertForQuestionAnswering(config)
-    start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
-    ```
+        >>> model = BertForQuestionAnswering(config)
+        >>> start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
     """
 
     def __init__(self, config):
diff --git a/modelscope/models/nlp/mglm/requirements.txt b/modelscope/models/nlp/mglm/requirements.txt
index e44ae5d1..55e58e75 100644
--- a/modelscope/models/nlp/mglm/requirements.txt
+++ b/modelscope/models/nlp/mglm/requirements.txt
@@ -7,7 +7,6 @@ ftfy
 langdetect
 lsh
 matplotlib
-mpi4py
 nltk
 pandas
 regex
diff --git a/modelscope/models/nlp/mglm/tasks/data_utils.py b/modelscope/models/nlp/mglm/tasks/data_utils.py
index 8792d080..636739c0 100644
--- a/modelscope/models/nlp/mglm/tasks/data_utils.py
+++ b/modelscope/models/nlp/mglm/tasks/data_utils.py
@@ -48,16 +48,16 @@ class InputExample(object):
                  meta: Optional[Dict] = None,
                  idx=-1,
                  num_choices=1):
-        """
-        Create a new InputExample.
+        """Create a new InputExample.
 
-        :param guid: a unique textual identifier
-        :param text_a: the sequence of text
-        :param text_b: an optional, second sequence of text
-        :param label: an optional label
-        :param logits: an optional list of per-class logits
-        :param meta: an optional dictionary to store arbitrary meta information
-        :param idx: an optional numeric index
+        Args:
+            guid: a unique textual identifier
+            text_a: the sequence of text
+            text_b: an optional, second sequence of text
+            label: an optional label
+            logits: an optional list of per-class logits
+            meta: an optional dictionary to store arbitrary meta information
+            idx: an optional numeric index
         """
         self.guid = guid
         self.text_a = text_a
diff --git a/modelscope/models/nlp/mglm/tasks/superglue/pvp.py b/modelscope/models/nlp/mglm/tasks/superglue/pvp.py
index e149f503..d55c0d9e 100644
--- a/modelscope/models/nlp/mglm/tasks/superglue/pvp.py
+++ b/modelscope/models/nlp/mglm/tasks/superglue/pvp.py
@@ -185,10 +185,13 @@ class PVP(ABC):
         """
         Encode an input example using this pattern-verbalizer pair.
 
-        :param example: the input example to encode
-        :param priming: whether to use this example for priming
-        :param labeled: if ``priming=True``, whether the label should be appended to this example
-        :return: A tuple, consisting of a list of input ids and a list of token type ids
+        Args:
+            example: the input example to encode
+            priming: whether to use this example for priming
+            labeled: if ``priming=True``, whether the label should be appended to this example
+
+        Returns:
+            A tuple, consisting of a list of input ids and a list of token type ids
         """
 
         if not priming:
@@ -498,8 +501,10 @@ class PVP(ABC):
         mask token (or one consecutive sequence of mask tokens for PET with multiple masks). If a task requires only a
         single sequence of text, the second sequence should be an empty list.
 
-        :param example: the input example to process
-        :return: Two sequences of text. All text segments can optionally be marked as being shortenable.
+        Args:
+            example: the input example to process
+        Returns:
+            Two sequences of text. All text segments can optionally be marked as being shortenable.
         """
         pass
 
@@ -634,10 +639,13 @@ class CopaPVP(PVP):
         """
         Encode an input example using this pattern-verbalizer pair.
 
-        :param example: the input example to encode
-        :param priming: whether to use this example for priming
-        :param labeled: if ``priming=True``, whether the label should be appended to this example
-        :return: A tuple, consisting of a list of input ids and a list of token type ids
+        Args:
+            example: the input example to encode
+            priming: whether to use this example for priming
+            labeled: if ``priming=True``, whether the label should be appended to this example
+
+        Returns:
+             A tuple, consisting of a list of input ids and a list of token type ids
         """
         if self.continuous_prompt or self.pattern_id < 2:
             return super().encode(example, priming=priming, labeled=labeled)
@@ -765,11 +773,12 @@ class WscPVP(PVP):
                labeled: bool = False):
         """
         Encode an input example using this pattern-verbalizer pair.
-
-        :param example: the input example to encode
-        :param priming: whether to use this example for priming
-        :param labeled: if ``priming=True``, whether the label should be appended to this example
-        :return: A tuple, consisting of a list of input ids and a list of token type ids
+        Args:
+            example: the input example to encode
+            priming: whether to use this example for priming
+            labeled: if ``priming=True``, whether the label should be appended to this example
+        Returns:
+             A tuple, consisting of a list of input ids and a list of token type ids
         """
         if self.args.loss_func in ['generative', 'mix']:
             sample = super().encode(example, priming=priming, labeled=labeled)
diff --git a/modelscope/models/nlp/palm_v2/configuration.py b/modelscope/models/nlp/palm_v2/configuration.py
index 48e0e20b..9d033479 100644
--- a/modelscope/models/nlp/palm_v2/configuration.py
+++ b/modelscope/models/nlp/palm_v2/configuration.py
@@ -63,7 +63,7 @@ class PalmConfig(PretrainedConfig):
         attn_separate (:obj:`bool`, `optional`, defaults to false):
             Whether or not to separate the q, k, v of attention.
 
-    Examples::
+    Examples:
 
         >>> from modelscope.models.nlp.palm_v2 import PalmForConditionalGeneration, PalmConfig
         >>> configuration = PalmConfig()
diff --git a/modelscope/models/nlp/palm_v2/text_generation.py b/modelscope/models/nlp/palm_v2/text_generation.py
index 5bb446b5..a87b5cdd 100644
--- a/modelscope/models/nlp/palm_v2/text_generation.py
+++ b/modelscope/models/nlp/palm_v2/text_generation.py
@@ -638,10 +638,7 @@ class AbsSummarizer(PalmPreTrainedModel):  # Model
         self.generator.dense.weight = self.decoder.embeddings.weight
 
         if checkpoint is not None:
-            if 'model' in checkpoint:
-                checkpoint = checkpoint['model']
-            for key in list(checkpoint.keys()):
-                checkpoint[key.replace('model.palm.', '')] = checkpoint[key]
+            checkpoint = self._unwrap_checkpoint(checkpoint)
             self.load_state_dict(checkpoint, strict=False)
         else:
             for module in self.decoder.modules():
@@ -673,6 +670,17 @@ class AbsSummarizer(PalmPreTrainedModel):  # Model
                 self.decoder.embeddings = tgt_embeddings
             self.generator.dense.weight = self.decoder.embeddings.weight
 
+    @staticmethod
+    def _unwrap_checkpoint(checkpoint: Dict):
+        wrap_names = ('model', 'palm')
+        for name in wrap_names:
+            if name in checkpoint:
+                checkpoint = checkpoint[name]
+        for name in wrap_names:
+            checkpoint = {(k[len(name) + 1:] if k.startswith(name) else k): v
+                          for k, v in checkpoint.items()}
+        return checkpoint
+
     def forward(self, src, tgt, mask_src):
         top_vec, _ = self.bert(src, mask_src, return_dict=False)
         state = TransformerDecoderState(src)
diff --git a/modelscope/models/nlp/plug/backbone.py b/modelscope/models/nlp/plug/backbone.py
index 6f7d594b..37714ed7 100644
--- a/modelscope/models/nlp/plug/backbone.py
+++ b/modelscope/models/nlp/plug/backbone.py
@@ -563,19 +563,17 @@ class BertModel(PreTrainedBertModel):
             classifier pretrained on top of the hidden state associated to the first character of the
             input (`CLF`) to train on the Next-Sentence task (see BERT's paper).
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+    Examples:
+        >>> # Already been converted into WordPiece token ids
+        >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+        >>> input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+        >>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
-    config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        >>> config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        >>>     num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    model = modeling.BertModel(config=config)
-    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-    ```
+        >>> model = modeling.BertModel(config=config)
+        >>> all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     """
 
     def __init__(self, config):
@@ -909,9 +907,8 @@ class PlugModel(torch.nn.Module):
         config ([`PlugNLGConfig`]): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the
             configuration. Check out the [`~DistributedPlug.initialize_model`] method to load the model weights.
-    Example:
+    Examples:
 
-    ```python
     >>> # The PLUG model has 27B parameters and usually need to run on multiple GPUs. The example given
     >>> # here only initializes a slice of the model on a single GPU.
     >>> # Check out the [`~DistributedPipeline.__init__`] method to initialize entire PLUG model.
diff --git a/modelscope/models/nlp/plug/configuration.py b/modelscope/models/nlp/plug/configuration.py
index c60458c8..fedc66c0 100644
--- a/modelscope/models/nlp/plug/configuration.py
+++ b/modelscope/models/nlp/plug/configuration.py
@@ -183,7 +183,6 @@ class PlugNLGConfig(PlugNLUConfig):
 
     Example:
 
-    ```python
     >>> # The PLUG model has 27B parameters and usually need to run on multiple GPUs. The example given
     >>> # here only initializes a slice of the model on a single GPU.
     >>> # Check out the [`~DistributedPipeline.__init__`] method to initialize entire PLUG model.
@@ -197,7 +196,6 @@ class PlugNLGConfig(PlugNLUConfig):
 
     >>> # Accessing the model configuration
     >>> configuration = model.config
-    ```
     """
 
     model_type = 'plugNLG'
diff --git a/modelscope/models/nlp/plug_mental/__init__.py b/modelscope/models/nlp/plug_mental/__init__.py
new file mode 100644
index 00000000..eb7a1b79
--- /dev/null
+++ b/modelscope/models/nlp/plug_mental/__init__.py
@@ -0,0 +1,39 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .backbone import (PlugMentalModel, PlugMentalPreTrainedModel)
+    from .configuration import PlugMentalConfig
+    from .text_classification import PlugMentalForSequenceClassification
+else:
+    _import_structure = {
+        'backbone': ['PlugMentalModel', 'PlugMentalPreTrainedModel'],
+        'configuration': ['PlugMentalConfig'],
+        'text_classification': ['PlugMentalForSequenceClassification'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/nlp/plug_mental/adv_utils.py b/modelscope/models/nlp/plug_mental/adv_utils.py
new file mode 100644
index 00000000..eee44199
--- /dev/null
+++ b/modelscope/models/nlp/plug_mental/adv_utils.py
@@ -0,0 +1,168 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from torch import nn
+
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def _symmetric_kl_div(logits1, logits2, attention_mask=None):
+    """
+    Calclate two logits' the KL div value symmetrically.
+    :param logits1: The first logit.
+    :param logits2: The second logit.
+    :param attention_mask: An optional attention_mask which is used to mask some element out.
+    This is usually useful in token_classification tasks.
+    If the shape of logits is [N1, N2, ... Nn, D], the shape of attention_mask should be [N1, N2, ... Nn]
+    :return: The mean loss.
+    """
+    labels_num = logits1.shape[-1]
+    KLDiv = nn.KLDivLoss(reduction='none')
+    loss = torch.sum(
+        KLDiv(nn.LogSoftmax(dim=-1)(logits1),
+              nn.Softmax(dim=-1)(logits2)),
+        dim=-1) + torch.sum(
+            KLDiv(nn.LogSoftmax(dim=-1)(logits2),
+                  nn.Softmax(dim=-1)(logits1)),
+            dim=-1)
+    if attention_mask is not None:
+        loss = torch.sum(
+            loss * attention_mask) / torch.sum(attention_mask) / labels_num
+    else:
+        loss = torch.mean(loss) / labels_num
+    return loss
+
+
+def compute_adv_loss(embedding,
+                     model,
+                     ori_logits,
+                     ori_loss,
+                     adv_grad_factor,
+                     adv_bound=None,
+                     sigma=5e-6,
+                     **kwargs):
+    """
+    Calculate the adv loss of the model.
+    :param embedding: Original sentense embedding
+    :param model: The model, or the forward function(including decoder/classifier),
+            accept kwargs as input, output logits
+    :param ori_logits: The original logits outputed from the model function
+    :param ori_loss: The original loss
+    :param adv_grad_factor: This factor will be multipled by the KL loss grad and then the result will be added to
+            the original embedding.
+            More details please check:https://arxiv.org/abs/1908.04577
+            The range of this value always be 1e-3~1e-7
+    :param adv_bound: adv_bound is used to cut the top and the bottom bound of the produced embedding.
+            If not proveded, 2 * sigma will be used as the adv_bound factor
+    :param sigma: The std factor used to produce a 0 mean normal distribution.
+            If adv_bound not proveded, 2 * sigma will be used as the adv_bound factor
+    :param kwargs: the input param used in model function
+    :return: The original loss adds the adv loss
+    """
+    adv_bound = adv_bound if adv_bound is not None else 2 * sigma
+    embedding_1 = embedding + embedding.data.new(embedding.size()).normal_(
+        0, sigma)  # 95% in +- 1e-5
+    kwargs.pop('input_ids')
+    if 'inputs_embeds' in kwargs:
+        kwargs.pop('inputs_embeds')
+    with_attention_mask = False if 'with_attention_mask' not in kwargs else kwargs[
+        'with_attention_mask']
+    attention_mask = kwargs['attention_mask']
+    if not with_attention_mask:
+        attention_mask = None
+    if 'with_attention_mask' in kwargs:
+        kwargs.pop('with_attention_mask')
+    outputs = model(**kwargs, inputs_embeds=embedding_1)
+    v1_logits = outputs.logits
+    loss = _symmetric_kl_div(ori_logits, v1_logits, attention_mask)
+    emb_grad = torch.autograd.grad(loss, embedding_1)[0].data
+    emb_grad_norm = emb_grad.norm(
+        dim=2, keepdim=True, p=float('inf')).max(
+            1, keepdim=True)[0]
+    is_nan = torch.any(torch.isnan(emb_grad_norm))
+    if is_nan:
+        logger.warning('Nan occured when calculating adv loss.')
+        return ori_loss
+    emb_grad = emb_grad / (emb_grad_norm + 1e-6)
+    embedding_2 = embedding_1 + adv_grad_factor * emb_grad
+    embedding_2 = torch.max(embedding_1 - adv_bound, embedding_2)
+    embedding_2 = torch.min(embedding_1 + adv_bound, embedding_2)
+    outputs = model(**kwargs, inputs_embeds=embedding_2)
+    adv_logits = outputs.logits
+    adv_loss = _symmetric_kl_div(ori_logits, adv_logits, attention_mask)
+    return ori_loss + adv_loss
+
+
+def compute_adv_loss_pair(embedding,
+                          model,
+                          start_logits,
+                          end_logits,
+                          ori_loss,
+                          adv_grad_factor,
+                          adv_bound=None,
+                          sigma=5e-6,
+                          **kwargs):
+    """
+    Calculate the adv loss of the model. This function is used in the pair logits scenerio.
+    :param embedding: Original sentense embedding
+    :param model: The model, or the forward function(including decoder/classifier),
+            accept kwargs as input, output logits
+    :param start_logits: The original start logits outputed from the model function
+    :param end_logits: The original end logits outputed from the model function
+    :param ori_loss: The original loss
+    :param adv_grad_factor: This factor will be multipled by the KL loss grad and then the result will be added to
+            the original embedding.
+            More details please check:https://arxiv.org/abs/1908.04577
+            The range of this value always be 1e-3~1e-7
+    :param adv_bound: adv_bound is used to cut the top and the bottom bound of the produced embedding.
+            If not proveded, 2 * sigma will be used as the adv_bound factor
+    :param sigma: The std factor used to produce a 0 mean normal distribution.
+            If adv_bound not proveded, 2 * sigma will be used as the adv_bound factor
+    :param kwargs: the input param used in model function
+    :return: The original loss adds the adv loss
+    """
+    adv_bound = adv_bound if adv_bound is not None else 2 * sigma
+    embedding_1 = embedding + embedding.data.new(embedding.size()).normal_(
+        0, sigma)  # 95% in +- 1e-5
+    kwargs.pop('input_ids')
+    if 'inputs_embeds' in kwargs:
+        kwargs.pop('inputs_embeds')
+    outputs = model(**kwargs, inputs_embeds=embedding_1)
+    v1_logits_start, v1_logits_end = outputs.logits
+    loss = _symmetric_kl_div(start_logits,
+                             v1_logits_start) + _symmetric_kl_div(
+                                 end_logits, v1_logits_end)
+    loss = loss / 2
+    emb_grad = torch.autograd.grad(loss, embedding_1)[0].data
+    emb_grad_norm = emb_grad.norm(
+        dim=2, keepdim=True, p=float('inf')).max(
+            1, keepdim=True)[0]
+    is_nan = torch.any(torch.isnan(emb_grad_norm))
+    if is_nan:
+        logger.warning('Nan occured when calculating pair adv loss.')
+        return ori_loss
+    emb_grad = emb_grad / emb_grad_norm
+    embedding_2 = embedding_1 + adv_grad_factor * emb_grad
+    embedding_2 = torch.max(embedding_1 - adv_bound, embedding_2)
+    embedding_2 = torch.min(embedding_1 + adv_bound, embedding_2)
+    outputs = model(**kwargs, inputs_embeds=embedding_2)
+    adv_logits_start, adv_logits_end = outputs.logits
+    adv_loss = _symmetric_kl_div(start_logits,
+                                 adv_logits_start) + _symmetric_kl_div(
+                                     end_logits, adv_logits_end)
+    return ori_loss + adv_loss
diff --git a/modelscope/models/nlp/plug_mental/backbone.py b/modelscope/models/nlp/plug_mental/backbone.py
new file mode 100755
index 00000000..e8531f52
--- /dev/null
+++ b/modelscope/models/nlp/plug_mental/backbone.py
@@ -0,0 +1,1077 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Plug Mental model. mainly copied from :module:`~transformers.modeling_bert`"""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from packaging import version
+from transformers.activations import ACT2FN
+from transformers.modeling_utils import (PreTrainedModel,
+                                         apply_chunking_to_forward,
+                                         find_pruneable_heads_and_indices,
+                                         prune_linear_layer)
+
+from modelscope.metainfo import Models
+from modelscope.models import Model, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.outputs import AttentionBackboneModelOutput
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+from modelscope.utils.nlp.utils import parse_labels_in_order
+from .configuration import PlugMentalConfig
+
+logger = get_logger()
+
+
+class SbertEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(
+            config.vocab_size,
+            config.hidden_size,
+            padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
+                                                config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
+                                                  config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.position_embedding_type = getattr(config,
+                                               'position_embedding_type',
+                                               'absolute')
+        self.register_buffer(
+            'position_ids',
+            torch.arange(config.max_position_embeddings).expand((1, -1)))
+        if version.parse(torch.__version__) > version.parse('1.6.0'):
+            self.register_buffer(
+                'token_type_ids',
+                torch.zeros(
+                    self.position_ids.size(),
+                    dtype=torch.long,
+                    device=self.position_ids.device),
+                persistent=False,
+            )
+
+    def forward(self,
+                input_ids=None,
+                token_type_ids=None,
+                position_ids=None,
+                inputs_embeds=None,
+                past_key_values_length=0,
+                return_inputs_embeds=False):
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:,
+                                             past_key_values_length:seq_length
+                                             + past_key_values_length]
+
+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when its auto-generated, registered buffer helps users
+        # when tracing the model without passing token_type_ids, solves
+        # issue #5664
+        if token_type_ids is None:
+            if hasattr(self, 'token_type_ids'):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(
+                    input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(
+                    input_shape,
+                    dtype=torch.long,
+                    device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings
+        if self.position_embedding_type == 'absolute':
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        if not return_inputs_embeds:
+            return embeddings
+        else:
+            return embeddings, inputs_embeds
+
+
+class SbertSelfAttention(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
+                config, 'embedding_size'):
+            raise ValueError(
+                f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention '
+                f'heads ({config.num_attention_heads})')
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size
+                                       / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = getattr(config,
+                                               'position_embedding_type',
+                                               'absolute')
+        if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query':
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(
+                2 * config.max_position_embeddings - 1,
+                self.attention_head_size)
+
+        self.is_decoder = config.is_decoder
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads,
+                                       self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        mixed_query_layer = self.query(hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_layer = past_key_value[0]
+            value_layer = past_key_value[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            key_layer = self.transpose_for_scores(
+                self.key(encoder_hidden_states))
+            value_layer = self.transpose_for_scores(
+                self.value(encoder_hidden_states))
+            attention_mask = encoder_attention_mask
+        elif past_key_value is not None:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
+            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
+        else:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_layer, value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer,
+                                        key_layer.transpose(-1, -2))
+
+        if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query':
+            seq_length = hidden_states.size()[1]
+            position_ids_l = torch.arange(
+                seq_length, dtype=torch.long,
+                device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(
+                seq_length, dtype=torch.long,
+                device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+            positional_embedding = self.distance_embedding(
+                distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(
+                dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == 'relative_key':
+                relative_position_scores = torch.einsum(
+                    'bhld,lrd->bhlr', query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == 'relative_key_query':
+                relative_position_scores_query = torch.einsum(
+                    'bhld,lrd->bhlr', query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum(
+                    'bhrd,lrd->bhlr', key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(
+            self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in SbertModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (
+            self.all_head_size, )
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        outputs = (context_layer,
+                   attention_probs) if output_attentions else (context_layer, )
+
+        if self.is_decoder:
+            outputs = outputs + (past_key_value, )
+        return outputs
+
+
+class SbertSelfOutput(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class SbertAttention(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.self = SbertSelfAttention(config)
+        self.output = SbertSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads,
+            self.self.attention_head_size, self.pruned_heads)
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(
+            heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            past_key_value,
+            output_attentions,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,
+                   ) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+class SbertIntermediate(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+class SbertOutput(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class SbertLayer(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = SbertAttention(config)
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            if not self.is_decoder:
+                raise ValueError(
+                    f'{self} should be used as a decoder model if cross attention is added'
+                )
+            self.crossattention = SbertAttention(config)
+        self.intermediate = SbertIntermediate(config)
+        self.output = SbertOutput(config)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:
+                                                  2] if past_key_value is not None else None
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            past_key_value=self_attn_past_key_value,
+        )
+        attention_output = self_attention_outputs[0]
+
+        # if decoder, the last output is tuple of self-attn cache
+        if self.is_decoder:
+            outputs = self_attention_outputs[1:-1]
+            present_key_value = self_attention_outputs[-1]
+        else:
+            outputs = self_attention_outputs[
+                1:]  # add self attentions if we output attention weights
+
+        cross_attn_present_key_value = None
+        if self.is_decoder and encoder_hidden_states is not None:
+            if not hasattr(self, 'crossattention'):
+                raise ValueError(
+                    f'If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention '
+                    f'layers by setting `config.add_cross_attention=True`')
+
+            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
+            cross_attn_past_key_value = past_key_value[
+                -2:] if past_key_value is not None else None
+            cross_attention_outputs = self.crossattention(
+                attention_output,
+                attention_mask,
+                head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                cross_attn_past_key_value,
+                output_attentions,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[
+                1:-1]  # add cross attentions if we output attention weights
+
+            # add cross-attn cache to positions 3,4 of present_key_value tuple
+            cross_attn_present_key_value = cross_attention_outputs[-1]
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        layer_output = apply_chunking_to_forward(self.feed_forward_chunk,
+                                                 self.chunk_size_feed_forward,
+                                                 self.seq_len_dim,
+                                                 attention_output)
+        outputs = (layer_output, ) + outputs
+
+        # if decoder, return the attn key/values as the last output
+        if self.is_decoder:
+            outputs = outputs + (present_key_value, )
+
+        return outputs
+
+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+
+class SbertEncoder(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList(
+            [SbertLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = (
+        ) if output_attentions and self.config.add_cross_attention else None
+
+        next_decoder_cache = () if use_cache else None
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states, )
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+            past_key_value = past_key_values[
+                i] if past_key_values is not None else None
+
+            if self.gradient_checkpointing and self.training:
+
+                if use_cache:
+                    logger.warning(
+                        '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
+                    )
+                    use_cache = False
+
+                def create_custom_forward(module):
+
+                    def custom_forward(*inputs):
+                        return module(*inputs, past_key_value,
+                                      output_attentions)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(layer_module),
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                )
+            else:
+                layer_outputs = layer_module(
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    past_key_value,
+                    output_attentions,
+                )
+
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1], )
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (
+                    layer_outputs[1], )
+                if self.config.add_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (
+                        layer_outputs[2], )
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states, )
+
+        if not return_dict:
+            return tuple(v for v in [
+                hidden_states,
+                next_decoder_cache,
+                all_hidden_states,
+                all_self_attentions,
+                all_cross_attentions,
+            ] if v is not None)
+        return AttentionBackboneModelOutput(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+class SbertPooler(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states):
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+class Adapter(nn.Module):
+
+    def __init__(self, args, adapter_config):
+        super(Adapter, self).__init__()
+        self.adapter_config = adapter_config
+        self.args = args
+        self.down_project = nn.Linear(
+            self.adapter_config.project_hidden_size,
+            self.adapter_config.adapter_size,
+        )
+        self.encoder = SbertEncoder(self.adapter_config)
+        self.up_project = nn.Linear(self.adapter_config.adapter_size,
+                                    adapter_config.project_hidden_size)
+        self.init_weights()
+
+    def forward(self, hidden_states):
+        down_projected = self.down_project(hidden_states)
+
+        input_shape = down_projected.size()[:-1]
+        attention_mask = torch.ones(input_shape, device=hidden_states.device)
+        # encoder_attention_mask = torch.ones(
+        #     input_shape, device=hidden_states.device)
+        if attention_mask.dim() == 3:
+            extended_attention_mask = attention_mask[:, None, :, :]
+
+        if attention_mask.dim() == 2:
+            extended_attention_mask = attention_mask[:, None, None, :]
+        extended_attention_mask = extended_attention_mask.to(
+            dtype=next(self.parameters()).dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+        # If a 2D ou 3D attention mask is provided for the cross-attention
+        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+        # if encoder_attention_mask.dim() == 3:
+        #     encoder_extended_attention_mask = encoder_attention_mask[:,
+        #                                                              None, :, :]
+        # if encoder_attention_mask.dim() == 2:
+        #     encoder_extended_attention_mask = encoder_attention_mask[:, None,
+        #                                                              None, :]
+
+        head_mask = [None] * self.adapter_config.num_hidden_layers
+        encoder_outputs = self.encoder(
+            down_projected,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask)
+
+        up_projected = self.up_project(encoder_outputs[0])
+        return hidden_states + up_projected
+
+    def init_weights(self):
+        self.down_project.weight.data.normal_(
+            mean=0.0, std=self.adapter_config.adapter_initializer_range)
+        self.down_project.bias.data.zero_()
+        self.up_project.weight.data.normal_(
+            mean=0.0, std=self.adapter_config.adapter_initializer_range)
+        self.up_project.bias.data.zero_()
+
+
+class AdapterModel(nn.Module):
+
+    def __init__(
+        self,
+        args,
+        pretrained_model_config,
+    ):
+        super(AdapterModel, self).__init__()
+        self.config = pretrained_model_config
+        self.args = args
+        self.adapter_size = self.args.adapter_size
+
+        class AdapterConfig:
+            project_hidden_size: int = self.config.hidden_size
+            hidden_act: str = 'gelu'
+            adapter_size: int = self.adapter_size  # 64
+            adapter_initializer_range: float = 0.0002
+            is_decoder: bool = False
+            attention_probs_dropout_prob: float = 0.1
+            hidden_dropout_prob: float = 0.1
+            hidden_size: int = 768
+            initializer_range: float = 0.02
+            intermediate_size: int = 3072
+            layer_norm_eps: float = 1e-05  # 和Roberta large不一致，1e-12
+            max_position_embeddings: int = 512
+            num_attention_heads: int = 12
+            num_hidden_layers: int = self.args.adapter_transformer_layers
+            output_attentions: bool = False
+            output_hidden_states: bool = False
+            torchscript: bool = False
+            type_vocab_size: int = 2
+            vocab_size: int = 21128
+            output_value_attentions: bool = False
+            chunk_size_feed_forward: int = 0
+            add_cross_attention: bool = False
+
+        self.adapter_skip_layers = self.args.adapter_skip_layers
+        # self.config.output_hidden_states=True
+        self.adapter_list = args.adapter_list
+        # self.adapter_list =[int(i) for i in self.adapter_list]
+        self.adapter_num = len(self.adapter_list)
+        # self.adapter = Adapter(args, AdapterConfig)
+
+        self.adapter = nn.ModuleList(
+            [Adapter(args, AdapterConfig) for _ in range(self.adapter_num)])
+
+        # self.dense = nn.Linear(self.config.hidden_size * 2, self.config.hidden_size)
+        # self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
+
+    def forward(
+        self,
+        pretrained_model_outputs,
+    ):
+        # for i in pretrained_model_outputs:
+        #     print(i.shape)
+
+        outputs = pretrained_model_outputs
+        sequence_output = outputs.last_hidden_state
+        # pooler_output = outputs[1]
+        hidden_states = outputs.hidden_states
+        # num = len(hidden_states)
+        hidden_states_last = torch.zeros(sequence_output.size()).to(
+            sequence_output.device)
+
+        adapter_hidden_states = []
+        adapter_hidden_states_count = 0
+        for i, adapter_module in enumerate(self.adapter):
+            fusion_state = hidden_states[
+                self.adapter_list[i]] + hidden_states_last
+            hidden_states_last = adapter_module(fusion_state)
+            adapter_hidden_states.append(hidden_states_last)
+            adapter_hidden_states_count += 1
+            if self.adapter_skip_layers >= 1:  # if adapter_skip_layers>=1, skip connection
+                if adapter_hidden_states_count % self.adapter_skip_layers == 0:
+                    hidden_states_last = hidden_states_last + adapter_hidden_states[
+                        int(adapter_hidden_states_count
+                            / self.adapter_skip_layers)]
+        return hidden_states_last
+
+
+class PlugMentalPreTrainedModel(TorchModel, PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = PlugMentalConfig
+    base_model_prefix = 'bert'
+    supports_gradient_checkpointing = True
+    _keys_to_ignore_on_load_missing = [r'position_ids']
+
+    def __init__(self, config, **kwargs):
+        super().__init__(config.name_or_path, **kwargs)
+        super(Model, self).__init__(config)
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(
+                mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(
+                mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, SbertEncoder):
+            module.gradient_checkpointing = value
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        """Instantiate the model.
+
+        Args:
+            kwargs: Input args.
+                    model_dir: The model dir used to load the checkpoint and the label information.
+                    num_labels: An optional arg to tell the model how many classes to initialize.
+                                    Method will call utils.parse_label_mapping if num_labels is not input.
+                    label2id: An optional label2id mapping, which will cover the label2id in configuration (if exists).
+
+        Returns:
+            The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained
+        """
+
+        model_dir = kwargs.pop('model_dir', None)
+        cfg = kwargs.pop('cfg', None)
+        model_args = parse_labels_in_order(model_dir, cfg, **kwargs)
+
+        if model_dir is None:
+            config = SbertConfig(**model_args)
+            model = cls(config)
+        else:
+            model = super(Model, cls).from_pretrained(
+                pretrained_model_name_or_path=model_dir, **model_args)
+        return model
+
+
+@dataclass
+class AttentionBackboneModelOutputWithEmbedding(AttentionBackboneModelOutput):
+    embedding_output: torch.FloatTensor = None
+    logits: Optional[Union[tuple, torch.FloatTensor]] = None
+    kwargs: dict = None
+
+
+@MODELS.register_module(Tasks.backbone, module_name=Models.plug_mental)
+class PlugMentalModel(PlugMentalPreTrainedModel):
+    """The PlugMental Model transformer outputting raw hidden-states without any specific head on top.
+
+    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+    methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
+    pruning heads etc.)
+
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+    general usage and behavior.
+
+    Parameters:
+        config (:class:`~modelscope.models.nlp.plug_mental.SbertConfig`): Model configuration class with
+            all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
+
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in `Attention is
+    all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+    To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration
+    set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder`
+    argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
+    input to the forward pass.
+    """
+
+    def __init__(self,
+                 config: PlugMentalConfig,
+                 add_pooling_layer=True,
+                 **kwargs):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = SbertEmbeddings(config)
+        self.encoder = SbertEncoder(config)
+
+        self.pooler = SbertPooler(config) if add_pooling_layer else None
+
+        class AdapterModelConfig:
+            adapter_skip_layers = 0
+            adapter_size = config.adapter_size
+            adapter_transformer_layers = config.adapter_transformer_layers
+            adapter_list = config.adapter_list
+
+        self.adapter = AdapterModel(AdapterModelConfig, config)
+        self.adapter_dense = nn.Linear(self.config.hidden_size,
+                                       self.config.hidden_size)
+        self.com_dense = nn.Linear(self.config.hidden_size * 2,
+                                   self.config.hidden_size)
+
+        self.init_weights()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,
+                inputs_embeds=None,
+                encoder_hidden_states=None,
+                encoder_attention_mask=None,
+                past_key_values=None,
+                use_cache=None,
+                output_attentions=None,
+                output_hidden_states=None,
+                return_dict=None,
+                **kwargs):
+        r"""
+        Args:
+            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+
+
+            attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+            token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
+                1]``:
+
+                - 0 corresponds to a `sentence A` token,
+                - 1 corresponds to a `sentence B` token.
+
+            position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
+                ``[0, config.max_position_embeddings - 1]``.
+
+            head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`,
+            `optional`):
+                Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
+            `optional`):
+                Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
+                representation. This is useful if you want more control over how to convert :obj:`input_ids`
+                indices into associated vectors than the model's internal embedding lookup matrix.
+            output_attentions (:obj:`bool`, `optional`):
+                Whether or not to return the attentions tensors of all attention layers. See ``attentions``
+                under returned tensors for more detail.
+            output_hidden_states (:obj:`bool`, `optional`):
+                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned
+                tensors for more detail.
+            return_dict (:obj:`bool`, `optional`):
+                Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
+            encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
+            `optional`):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+                the model is configured as a decoder.
+            encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
+                in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+            past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple
+                having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+                Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up
+                decoding.
+
+                If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
+                (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
+                instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
+            use_cache (:obj:`bool`, `optional`):
+                If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+                decoding (see :obj:`past_key_values`).
+
+        Returns:
+            Returns `modelscope.outputs.AttentionBackboneModelOutputWithEmbedding`
+
+        Examples:
+            >>> from modelscope.models import Model
+            >>> from modelscope.preprocessors import Preprocessor
+            >>> model = Model.from_pretrained('damo/nlp_plug-mental_backbone_base_std', task='backbone')
+            >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_plug-mental_backbone_base_std')
+            >>> print(model(**preprocessor('这是个测试')))
+        """
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else
+            self.config.output_hidden_states)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if self.config.is_decoder:
+            use_cache = use_cache if use_cache is not None else self.config.use_cache
+        else:
+            use_cache = False
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                'You cannot specify both input_ids and inputs_embeds at the same time'
+            )
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError(
+                'You have to specify either input_ids or inputs_embeds')
+
+        batch_size, seq_length = input_shape
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        # past_key_values_length
+        past_key_values_length = past_key_values[0][0].shape[
+            2] if past_key_values is not None else 0
+
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                ((batch_size, seq_length + past_key_values_length)),
+                device=device)
+
+        if token_type_ids is None:
+            if hasattr(self.embeddings, 'token_type_ids'):
+                buffered_token_type_ids = self.embeddings.token_type_ids[:, :
+                                                                         seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(
+                    batch_size, seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(
+                    input_shape, dtype=torch.long, device=device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
+            attention_mask, input_shape, device)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size(
+            )
+            encoder_hidden_shape = (encoder_batch_size,
+                                    encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(
+                    encoder_hidden_shape, device=device)
+            encoder_extended_attention_mask = self.invert_attention_mask(
+                encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask,
+                                       self.config.num_hidden_layers)
+
+        embedding_output, orignal_embeds = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+            return_inputs_embeds=True,
+        )
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        adapter_model_output = self.adapter(encoder_outputs)
+        encoder_pooled_output = self.pooler(encoder_outputs.last_hidden_state)
+        # sequence_output = encoder_outputs[0]
+        # pooled_output = self.pooler(
+        #     sequence_output) if self.pooler is not None else None
+        pooled_output = self.adapter_dense(encoder_pooled_output
+                                           + adapter_model_output[:, 0])
+        sequence_output = self.com_dense(
+            torch.cat(
+                [encoder_outputs.last_hidden_state, adapter_model_output],
+                dim=2))
+
+        if not return_dict:
+            return (sequence_output,
+                    pooled_output) + encoder_outputs[1:] + (orignal_embeds, )
+
+        return AttentionBackboneModelOutputWithEmbedding(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+            embedding_output=orignal_embeds)
diff --git a/modelscope/models/nlp/plug_mental/configuration.py b/modelscope/models/nlp/plug_mental/configuration.py
new file mode 100644
index 00000000..1157eeb9
--- /dev/null
+++ b/modelscope/models/nlp/plug_mental/configuration.py
@@ -0,0 +1,142 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PLUG mental model configuration, mainly copied from :class:`~transformers.BertConfig` """
+from transformers import PretrainedConfig
+
+from modelscope.utils import logger as logging
+
+logger = logging.get_logger()
+
+
+class PlugMentalConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration
+    of a :class:`~modelscope.models.nlp.plug_mental.PLugMentalModel`.
+    It is used to instantiate a PlugMental model according to the specified arguments.
+
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+
+
+    Args:
+        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+            Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
+            :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or
+            :class:`~transformers.TFBertModel`.
+        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string,
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
+        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
+            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or
+            :class:`~transformers.TFBertModel`.
+        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
+            Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
+            :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
+            :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
+            <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
+            `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
+            <https://arxiv.org/abs/2009.13658>`__.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if ``config.is_decoder=True``.
+        classifier_dropout (:obj:`float`, `optional`):
+            The dropout ratio for the classification head.
+        adv_grad_factor (:obj:`float`, `optional`): This factor will be multiplied by the KL loss grad and then
+            the result will be added to the original embedding.
+            More details please check:https://arxiv.org/abs/1908.04577
+            The range of this value should between 1e-3~1e-7
+        adv_bound (:obj:`float`, `optional`): adv_bound is used to cut the top and the bottom bound of
+            the produced embedding.
+            If not provided, 2 * sigma will be used as the adv_bound factor
+        sigma (:obj:`float`, `optional`): The std factor used to produce a 0 mean normal distribution.
+            If adv_bound not provided, 2 * sigma will be used as the adv_bound factor
+    """
+
+    model_type = 'plug-mental'
+
+    def __init__(self,
+                 vocab_size=30522,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.02,
+                 layer_norm_eps=1e-12,
+                 pad_token_id=0,
+                 position_embedding_type='absolute',
+                 use_cache=True,
+                 classifier_dropout=None,
+                 adapter_size=768,
+                 adapter_transformer_layers=1,
+                 adapter_list=[11],
+                 **kwargs):
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+        self.output_hidden_states = True
+        # adv_grad_factor, used in adv loss.
+        # Users can check adv_utils.py for details.
+        # if adv_grad_factor set to None, no adv loss will not applied to the model.
+        self.adv_grad_factor = None
+        # sigma value, used in adv loss.
+        self.sigma = 5e-6 if 'sigma' not in kwargs else kwargs['sigma']
+        # adv_bound value, used in adv loss.
+        self.adv_bound = 2 * self.sigma if 'adv_bound' not in kwargs else kwargs[
+            'adv_bound']
+
+        # adapter config
+        self.adapter_size = adapter_size
+        self.adapter_transformer_layers = adapter_transformer_layers
+        self.adapter_list = adapter_list
diff --git a/modelscope/models/nlp/plug_mental/text_classification.py b/modelscope/models/nlp/plug_mental/text_classification.py
new file mode 100644
index 00000000..e09a912d
--- /dev/null
+++ b/modelscope/models/nlp/plug_mental/text_classification.py
@@ -0,0 +1,223 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.outputs import AttentionTextClassificationModelOutput
+from modelscope.utils import logger as logging
+from modelscope.utils.constant import Tasks
+from .adv_utils import compute_adv_loss
+from .backbone import PlugMentalModel, PlugMentalPreTrainedModel
+from .configuration import PlugMentalConfig
+
+logger = logging.get_logger()
+
+
+@MODELS.register_module(
+    Tasks.text_classification, module_name=Models.plug_mental)
+@MODELS.register_module(Tasks.nli, module_name=Models.plug_mental)
+@MODELS.register_module(
+    Tasks.sentiment_classification, module_name=Models.plug_mental)
+@MODELS.register_module(
+    Tasks.sentence_similarity, module_name=Models.plug_mental)
+@MODELS.register_module(
+    Tasks.zero_shot_classification, module_name=Models.plug_mental)
+class PlugMentalForSequenceClassification(PlugMentalPreTrainedModel):
+    r"""PlugMental Model transformer with a sequence classification/regression head on top
+    (a linear layer on top of the pooled output) e.g. for GLUE tasks.
+
+    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+    methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
+    pruning heads etc.)
+
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+    general usage and behavior.
+
+    Preprocessor:
+        This is the text classification model of PlugMental, the preprocessor of this model
+        is `modelscope.preprocessors.TextClassificationTransformersPreprocessor`.
+
+    Trainer:
+        This model is a normal PyTorch model, and can be trained by variable trainers, like EpochBasedTrainer,
+        NlpEpochBasedTrainer, or trainers from other frameworks.
+        The preferred trainer in ModelScope is NlpEpochBasedTrainer.
+
+    Parameters:
+        config (:class:`~modelscope.models.nlp.plug_mental.PlugMentalConfig`): Model configuration class with
+            all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
+    """
+
+    def __init__(self, config: PlugMentalConfig, **kwargs):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+        if self.config.adv_grad_factor is None:
+            logger.warning(
+                'Adv parameters not set, skipping compute_adv_loss.')
+
+        PlugMentalForSequenceClassification.base_model_prefix = getattr(
+            config, 'base_model_prefix',
+            PlugMentalForSequenceClassification.base_model_prefix)
+        setattr(self, self.base_model_prefix, PlugMentalModel(config))
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None
+            else config.hidden_dropout_prob)
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        self.init_weights()
+
+    def _forward_call(self, **kwargs):
+        outputs = self.base_model(**kwargs)
+        pooled_output = outputs[1]
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        outputs['logits'] = logits
+        outputs.kwargs = kwargs
+        return outputs
+
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,
+                inputs_embeds=None,
+                labels=None,
+                output_attentions=None,
+                output_hidden_states=None,
+                return_dict=None,
+                *args,
+                **kwargs):
+        r"""
+        Args:
+            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+
+
+            attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+            token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
+                1]``:
+
+                - 0 corresponds to a `sentence A` token,
+                - 1 corresponds to a `sentence B` token.
+
+            position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
+                ``[0, config.max_position_embeddings - 1]``.
+
+            head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`,
+            `optional`):
+                Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
+            `optional`):
+                Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
+                representation. This is useful if you want more control over how to convert :obj:`input_ids`
+                indices into associated vectors than the model's internal embedding lookup matrix.
+            output_attentions (:obj:`bool`, `optional`):
+                Whether or not to return the attentions tensors of all attention layers. See
+                ``attentions`` under returned tensors for more detail.
+            output_hidden_states (:obj:`bool`, `optional`):
+                Whether or not to return the hidden states of all layers. See ``hidden_states`` under
+                returned tensors for more detail.
+            return_dict (:obj:`bool`, `optional`):
+                Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+                Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
+                config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed
+                (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Returns:
+            Returns `modelscope.outputs.AttentionTextClassificationModelOutput`
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if not return_dict:
+            logger.error('Return tuple in sbert is not supported now.')
+        outputs = self._forward_call(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict)
+        return self.compute_loss(outputs, labels, **outputs.kwargs)
+
+    def compute_loss(self, outputs, labels, **kwargs):
+        logits = outputs.logits
+        embedding_output = outputs.embedding_output
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = 'regression'
+                elif self.num_labels > 1 and (labels.dtype == torch.long
+                                              or labels.dtype == torch.int):
+                    self.config.problem_type = 'single_label_classification'
+                else:
+                    self.config.problem_type = 'multi_label_classification'
+
+            if self.config.problem_type == 'regression':
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == 'single_label_classification':
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(
+                    logits.view(-1, self.num_labels), labels.view(-1))
+                if self.config.adv_grad_factor is not None and self.training:
+                    loss = compute_adv_loss(
+                        embedding=embedding_output,
+                        model=self._forward_call,
+                        ori_logits=logits,
+                        ori_loss=loss,
+                        adv_bound=self.config.adv_bound,
+                        adv_grad_factor=self.config.adv_grad_factor,
+                        sigma=self.config.sigma,
+                        **kwargs)
+            elif self.config.problem_type == 'multi_label_classification':
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+
+        return AttentionTextClassificationModelOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
diff --git a/modelscope/models/nlp/ponet/backbone.py b/modelscope/models/nlp/ponet/backbone.py
index 731e6516..aeb58bec 100644
--- a/modelscope/models/nlp/ponet/backbone.py
+++ b/modelscope/models/nlp/ponet/backbone.py
@@ -714,69 +714,72 @@ class PoNetModel(PoNetPreTrainedModel):
     ):
         r"""
         Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
+            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
 
-            Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See
-            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
-            details.
+                Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See
+                :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+                for details.
 
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+            attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
 
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
 
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
-            1]``:
+            token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Segment token indices to indicate first and second portions of the inputs. Indices are selected in
+                ``[0,1]``:
 
-            - 0 corresponds to a `sentence A` token,
-            - 1 corresponds to a `sentence B` token.
+                - 0 corresponds to a `sentence A` token,
+                - 1 corresponds to a `sentence B` token.
 
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
-            config.max_position_embeddings - 1]``.
+            position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
+                ``[0,config.max_position_embeddings - 1]``.
 
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+            head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`,
+                `optional`):
+                Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
 
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
 
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
-            vectors than the model's internal embedding lookup matrix.
-        output_attentions (:obj:`bool`, `optional`):
-            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
-            tensors for more detail.
-        output_hidden_states (:obj:`bool`, `optional`):
-            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
-            more detail.
-        return_dict (:obj:`bool`, `optional`):
-            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
-        encoder_hidden_states
-            (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
-            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
-            the model is configured as a decoder.
-        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
-            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+            inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
+                `optional`):
+                Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
+                representation. This is useful if you want more control over how to convert :obj:`input_ids`
+                indices into associated vectors than the model's internal embedding lookup matrix.
+            output_attentions (:obj:`bool`, `optional`):
+                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+                returned tensors for more detail.
+            output_hidden_states (:obj:`bool`, `optional`):
+                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+                for more detail.
+            return_dict (:obj:`bool`, `optional`):
+                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+            encoder_hidden_states
+                (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+                the model is configured as a decoder.
+            encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
+                in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
 
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers`
-            with each tuple having 4 tensors of shape :obj:
-            `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
-            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+            past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers`
+                with each tuple having 4 tensors of shape :obj:
+                `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+                Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up
+                decoding.
 
-            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
-            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
-            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
-        use_cache (:obj:`bool`, `optional`):
-            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
-            decoding (see :obj:`past_key_values`).
+                If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
+                (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
+                instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
+            use_cache (:obj:`bool`, `optional`):
+                If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+                decoding (see :obj:`past_key_values`).
 
         Returns:
             Returns `modelscope.outputs.AttentionBackboneModelOutput`
diff --git a/modelscope/models/nlp/ponet/fill_mask.py b/modelscope/models/nlp/ponet/fill_mask.py
index 591b1041..053959b7 100644
--- a/modelscope/models/nlp/ponet/fill_mask.py
+++ b/modelscope/models/nlp/ponet/fill_mask.py
@@ -147,53 +147,54 @@ class PoNetForMaskedLM(PoNetPreTrainedModel):
     ):
         r"""
         Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`('batch_size, sequence_length')`):
-            Indices of input sequence tokens in the vocabulary.
+            input_ids (:obj:`torch.LongTensor` of shape :obj:`('batch_size, sequence_length')`):
+                Indices of input sequence tokens in the vocabulary.
 
-            Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See
-            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
-            details.
+                Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See
+                :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+                for details.
 
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`('batch_size, sequence_length')`, `optional`):
-            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+            attention_mask (:obj:`torch.FloatTensor` of shape :obj:`('batch_size, sequence_length')`, `optional`):
+                Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
 
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
 
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`('batch_size, sequence_length')`, `optional`):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
-            1]``:
+            token_type_ids (:obj:`torch.LongTensor` of shape :obj:`('batch_size, sequence_length')`, `optional`):
+                Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
+                1]``:
 
-            - 0 corresponds to a `sentence A` token,
-            - 1 corresponds to a `sentence B` token.
+                - 0 corresponds to a `sentence A` token,
+                - 1 corresponds to a `sentence B` token.
 
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`('batch_size, sequence_length')`, `optional`):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
-            config.max_position_embeddings - 1]``.
+            position_ids (:obj:`torch.LongTensor` of shape :obj:`('batch_size, sequence_length')`, `optional`):
+                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
+                ``[0, config.max_position_embeddings - 1]``.
 
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+            head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`,
+                `optional`):
+                Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
 
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
 
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`('batch_size, sequence_length', hidden_size)`,
-            `optional`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
-            vectors than the model's internal embedding lookup matrix.
-        output_attentions (:obj:`bool`, `optional`):
-            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
-            tensors for more detail.
-        output_hidden_states (:obj:`bool`, `optional`):
-            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
-            more detail.
-        return_dict (:obj:`bool`, `optional`):
-            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
-            config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
-            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+            inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`('batch_size, sequence_length', hidden_size)`,
+                `optional`):
+                Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
+                representation. This is useful if you want more control over how to convert :obj:`input_ids`
+                indices into associated vectors than the model's internal embedding lookup matrix.
+            output_attentions (:obj:`bool`, `optional`):
+                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+                returned tensors for more detail.
+            output_hidden_states (:obj:`bool`, `optional`):
+                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+                for more detail.
+            return_dict (:obj:`bool`, `optional`):
+                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
+                config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
+                (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
 
         Returns:
             Returns `modelscope.outputs.AttentionFillMaskModelOutput`
diff --git a/modelscope/models/nlp/space/dialog_modeling.py b/modelscope/models/nlp/space/dialog_modeling.py
index 16e9dc53..faecfa54 100644
--- a/modelscope/models/nlp/space/dialog_modeling.py
+++ b/modelscope/models/nlp/space/dialog_modeling.py
@@ -87,6 +87,7 @@ class SpaceForDialogModeling(TorchModel):
                         'aspn': array([47,8345,32,29,1983]),
                         'db': array([19, 24, 20]),
                     }
+
         Examples:
             >>> from modelscope.hub.snapshot_download import snapshot_download
             >>> from modelscope.models.nlp import SpaceForDialogModeling
diff --git a/modelscope/models/nlp/space/model/model_base.py b/modelscope/models/nlp/space/model/model_base.py
index b7812182..4c5755e9 100644
--- a/modelscope/models/nlp/space/model/model_base.py
+++ b/modelscope/models/nlp/space/model/model_base.py
@@ -37,7 +37,7 @@ class SpaceModelBase(nn.Module):
         return
 
     def _create_parameters(self):
-        """ Create model's paramters. """
+        """ Create model's parameters. """
         raise NotImplementedError
 
     def _forward(self, inputs, is_training, with_label):
diff --git a/modelscope/models/nlp/space/model/unified_transformer.py b/modelscope/models/nlp/space/model/unified_transformer.py
index 19069971..53bf3146 100644
--- a/modelscope/models/nlp/space/model/unified_transformer.py
+++ b/modelscope/models/nlp/space/model/unified_transformer.py
@@ -107,7 +107,7 @@ class UnifiedTransformer(SpaceModelBase):
         return
 
     def _create_parameters(self):
-        """ Create model's paramters. """
+        """ Create model's parameters. """
         sequence_mask = np.tri(
             self.num_pos_embeddings,
             self.num_pos_embeddings,
diff --git a/modelscope/models/nlp/space_T_cn/backbone.py b/modelscope/models/nlp/space_T_cn/backbone.py
index 9cc2c349..b1df58ba 100644
--- a/modelscope/models/nlp/space_T_cn/backbone.py
+++ b/modelscope/models/nlp/space_T_cn/backbone.py
@@ -773,19 +773,17 @@ class SpaceTCnModel(PreTrainedBertModel):
             classifier pretrained on top of the hidden state associated to the first character of the
             input (`CLF`) to train on the Next-Sentence task (see BERT's paper).
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+    Example:
+        >>> # Already been converted into WordPiece token ids
+        >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+        >>> input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+        >>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
-    config = modeling.SpaceTCnConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        >>> config = modeling.SpaceTCnConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        >>>     num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    model = modeling.SpaceTCnModel(config=config)
-    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-    ```
+        >>> model = modeling.SpaceTCnModel(config=config)
+        >>> all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     """
 
     def __init__(self, config, schema_link_module='none'):
diff --git a/modelscope/models/nlp/space_T_cn/table_question_answering.py b/modelscope/models/nlp/space_T_cn/table_question_answering.py
index 82345da6..7f8eda99 100644
--- a/modelscope/models/nlp/space_T_cn/table_question_answering.py
+++ b/modelscope/models/nlp/space_T_cn/table_question_answering.py
@@ -740,32 +740,32 @@ class TableQuestionAnswering(Model):
 
 
         Returns:
-            Dict[str, Tensor]: results
-                Example:
-                    {
-                        'result':
-                            {
-                                'question_tok': ['有', '哪', '些', '风', '险', '类', '型', '？'],
-                                'question': '有哪些风险类型？',
-                                'table_id': 'fund',
-                                'sql': {
-                                    'cond_conn_op': 0,
-                                    'sel': [5],
-                                    'agg': [0],
-                                    'conds': [[10, 2, 'Nulll']]
-                                },
-                                'action': 10,
-                                'model_out': [
-                                    [6, 0, 0, 0],
-                                    [0, 0, 0, 0],
-                                    [0, 0, 0, 0, 0, 0],
-                                    [2, 0, 0, 0, 0, 0],
-                                    [0, 0, 0, 0, 0, 0],
-                                    [0, 0, 0, 0, 0, 0]
-                                ]
-                            },
-                        'history_sql': None
-                    }
+            Dict[str, Tensor]: results dict as follows:
+
+                >>> {
+                >>>     'result':
+                >>>         {
+                >>>             'question_tok': ['有', '哪', '些', '风', '险', '类', '型', '？'],
+                >>>             'question': '有哪些风险类型？',
+                >>>             'table_id': 'fund',
+                >>>             'sql': {
+                >>>                 'cond_conn_op': 0,
+                >>>                 'sel': [5],
+                >>>                 'agg': [0],
+                >>>                 'conds': [[10, 2, 'Nulll']]
+                >>>             },
+                >>>             'action': 10,
+                >>>             'model_out': [
+                >>>                 [6, 0, 0, 0],
+                >>>                 [0, 0, 0, 0],
+                >>>                 [0, 0, 0, 0, 0, 0],
+                >>>                 [2, 0, 0, 0, 0, 0],
+                >>>                 [0, 0, 0, 0, 0, 0],
+                >>>                 [0, 0, 0, 0, 0, 0]
+                >>>             ]
+                >>>         },
+                >>>     'history_sql': None
+                >>> }
 
         Example:
             >>> from modelscope.models.nlp import TableQuestionAnswering
diff --git a/modelscope/models/nlp/structbert/backbone.py b/modelscope/models/nlp/structbert/backbone.py
index 0ba3dbb7..58d324a8 100755
--- a/modelscope/models/nlp/structbert/backbone.py
+++ b/modelscope/models/nlp/structbert/backbone.py
@@ -730,68 +730,71 @@ class SbertModel(SbertPreTrainedModel):
                 **kwargs):
         r"""
         Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
+            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
 
-            Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
-            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
-            details.
+                Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
+                :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+                for details.
 
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+            attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
 
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
 
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
-            1]``:
+            token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Segment token indices to indicate first and second portions of the inputs. Indices are selected in
+                ``[0, 1]``:
 
-            - 0 corresponds to a `sentence A` token,
-            - 1 corresponds to a `sentence B` token.
+                - 0 corresponds to a `sentence A` token,
+                - 1 corresponds to a `sentence B` token.
 
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
-            config.max_position_embeddings - 1]``.
+            position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
+                ``[0, config.max_position_embeddings - 1]``.
 
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+            head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`,
+                `optional`):
+                Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
 
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
 
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
-            vectors than the model's internal embedding lookup matrix.
-        output_attentions (:obj:`bool`, `optional`):
-            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
-            tensors for more detail.
-        output_hidden_states (:obj:`bool`, `optional`):
-            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
-            more detail.
-        return_dict (:obj:`bool`, `optional`):
-            Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
-        encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
-            `optional`):
-            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
-            the model is configured as a decoder.
-        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
-            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+            inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
+                `optional`):
+                Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
+                representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+                into associated vectors than the model's internal embedding lookup matrix.
+            output_attentions (:obj:`bool`, `optional`):
+                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+                returned tensors for more detail.
+            output_hidden_states (:obj:`bool`, `optional`):
+                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+                for more detail.
+            return_dict (:obj:`bool`, `optional`):
+                Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
+            encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
+                `optional`):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+                the model is configured as a decoder.
+            encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
+                in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
 
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple
-            having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
-            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+            past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple
+                having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+                Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up
+                decoding.
 
-            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
-            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
-            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
-        use_cache (:obj:`bool`, `optional`):
-            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
-            decoding (see :obj:`past_key_values`).
+                If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
+                (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
+                instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
+            use_cache (:obj:`bool`, `optional`):
+                If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+                decoding (see :obj:`past_key_values`).
 
         Returns:
             Returns `modelscope.outputs.AttentionBackboneModelOutputWithEmbedding`
diff --git a/modelscope/models/nlp/structbert/faq_question_answering.py b/modelscope/models/nlp/structbert/faq_question_answering.py
index c5cd3061..bc22ab61 100644
--- a/modelscope/models/nlp/structbert/faq_question_answering.py
+++ b/modelscope/models/nlp/structbert/faq_question_answering.py
@@ -9,6 +9,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import Tensor
+from torch.nn import BCEWithLogitsLoss
 
 from modelscope.metainfo import Models
 from modelscope.models.builder import MODELS
@@ -17,6 +18,9 @@ from modelscope.models.nlp.task_models.task_model import BaseTaskModel
 from modelscope.outputs import FaqQuestionAnsweringOutput
 from modelscope.utils.config import Config, ConfigFields
 from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
 
 activations = {
     'relu': F.relu,
@@ -88,9 +92,6 @@ class MetricsLayer(nn.Module):
         return self.args.metrics
 
     def forward(self, query, protos):
-        """ query : [bsz, n_query, dim]
-            support : [bsz, n_query, n_cls, dim] | [bsz, n_cls, dim]
-        """
         if self.args.metrics == 'cosine':
             supervised_dists = self.cosine_similarity(query, protos)
             if self.training:
@@ -102,8 +103,6 @@ class MetricsLayer(nn.Module):
         return supervised_dists
 
     def cosine_similarity(self, x, y):
-        # x=[bsz, n_query, dim]
-        # y=[bsz, n_cls, dim]
         n_query = x.shape[0]
         n_cls = y.shape[0]
         dim = x.shape[-1]
@@ -155,58 +154,87 @@ class PoolingLayer(nn.Module):
         return self.pooling(x, mask)
 
 
+class Alignment(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+
+    def _attention(self, a, b):
+        return torch.matmul(a, b.transpose(1, 2))
+
+    def forward(self, a, b, mask_a, mask_b):
+        attn = self._attention(a, b)
+        mask = torch.matmul(mask_a.float(), mask_b.transpose(1, 2).float())
+        mask = mask.bool()
+        attn.masked_fill_(~mask, -1e4)
+        return attn
+
+
+def _create_args(model_config, hidden_size):
+    metric = model_config.get('metric', 'cosine')
+    pooling_method = model_config.get('pooling', 'avg')
+    Arg = namedtuple(
+        'args',
+        ['metrics', 'proj_hidden_size', 'hidden_size', 'dropout', 'pooling'])
+    args = Arg(
+        metrics=metric,
+        proj_hidden_size=hidden_size,
+        hidden_size=hidden_size,
+        dropout=0.0,
+        pooling=pooling_method)
+    return args
+
+
 @MODELS.register_module(
     Tasks.faq_question_answering, module_name=Models.structbert)
 class SbertForFaqQuestionAnswering(BaseTaskModel):
     _backbone_prefix = ''
+    PROTO_NET = 'protonet'
+    MGIMN_NET = 'mgimnnet'
 
     @classmethod
     def _instantiate(cls, **kwargs):
-        model = cls(kwargs.get('model_dir'))
-        model.load_checkpoint(kwargs.get('model_dir'))
+        model_dir = kwargs.pop('model_dir')
+        model = cls(model_dir, **kwargs)
+        model.load_checkpoint(model_dir)
         return model
 
     def __init__(self, model_dir, *args, **kwargs):
         super().__init__(model_dir, *args, **kwargs)
-
         backbone_cfg = SbertConfig.from_pretrained(model_dir)
-        self.bert = SbertModel(backbone_cfg)
-
         model_config = Config.from_file(
             os.path.join(model_dir,
                          ModelFile.CONFIGURATION)).get(ConfigFields.model, {})
+        model_config.update(kwargs)
 
-        metric = model_config.get('metric', 'cosine')
-        pooling_method = model_config.get('pooling', 'avg')
-
-        Arg = namedtuple('args', [
-            'metrics', 'proj_hidden_size', 'hidden_size', 'dropout', 'pooling'
-        ])
-        args = Arg(
-            metrics=metric,
-            proj_hidden_size=self.bert.config.hidden_size,
-            hidden_size=self.bert.config.hidden_size,
-            dropout=0.0,
-            pooling=pooling_method)
-
-        self.metrics_layer = MetricsLayer(args)
-        self.pooling = PoolingLayer(args)
+        network_name = model_config.get('network', self.PROTO_NET)
+        if network_name == self.PROTO_NET:
+            network = ProtoNet(backbone_cfg, model_config)
+        elif network_name == self.MGIMN_NET:
+            network = MGIMNNet(backbone_cfg, model_config)
+        else:
+            raise NotImplementedError(network_name)
+        logger.info(f'faq task build {network_name} network')
+        self.network = network
 
     def forward(self, input: Dict[str, Tensor]) -> FaqQuestionAnsweringOutput:
         """
         Args:
             input (Dict[str, Tensor]): the preprocessed data, it contains the following keys:
-                query(:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+
+                - query(:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
                     The query to be predicted.
-                support(:obj:`torch.LongTensor` of shape :obj:`(support_size, sequence_length)`):
+                - support(:obj:`torch.LongTensor` of shape :obj:`(support_size, sequence_length)`):
                     The support set.
-                support_label(:obj:`torch.LongTensor` of shape :obj:`(support_size, )`):
+                - support_label(:obj:`torch.LongTensor` of shape :obj:`(support_size, )`):
                     The labels of support set.
 
         Returns:
             Dict[str, Tensor]: result, it contains the following key:
-                scores(:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_cls)`):
+
+                - scores(:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_cls)`):
                     Predicted scores of all classes for each query.
+
         Examples:
             >>> from modelscope.hub.snapshot_download import snapshot_download
             >>> from modelscope.preprocessors import FaqQuestionAnsweringTransformersPreprocessor
@@ -242,18 +270,87 @@ class SbertForFaqQuestionAnswering(BaseTaskModel):
         support = input['support']
         query_mask = input['query_attention_mask']
         support_mask = input['support_attention_mask']
-
-        n_query = query.shape[0]
-        n_support = support.shape[0]
-
         support_labels = input['support_labels']
+        logits, scores = self.network(query, support, query_mask, support_mask,
+                                      support_labels)
+
+        if 'labels' in input:
+            query_labels = input['labels']
+            num_cls = torch.max(support_labels) + 1
+            loss = self._compute_loss(logits, query_labels, num_cls)
+            pred_labels = torch.argmax(scores, dim=1)
+            return FaqQuestionAnsweringOutput(
+                loss=loss, logits=scores, labels=pred_labels).to_dict()
+        else:
+            return FaqQuestionAnsweringOutput(scores=scores)
+
+    def _compute_loss(self, logits, target, num_cls):
+        onehot_labels = get_onehot_labels(target, num_cls)
+        loss = BCEWithLogitsLoss(reduction='mean')(logits, onehot_labels)
+        return loss
+
+    def forward_sentence_embedding(self, inputs):
+        return self.network.sentence_embedding(inputs)
+
+    def load_checkpoint(self,
+                        model_local_dir,
+                        default_dtype=None,
+                        load_state_fn=None,
+                        **kwargs):
+        ckpt_file = os.path.join(model_local_dir, 'pytorch_model.bin')
+        state_dict = torch.load(ckpt_file, map_location='cpu')
+        # compatible with the old checkpoints
+        new_state_dict = {}
+        for var_name, var_value in state_dict.items():
+            new_var_name = var_name
+            if not str(var_name).startswith('network'):
+                new_var_name = f'network.{var_name}'
+            new_state_dict[new_var_name] = var_value
+        if default_dtype is not None:
+            torch.set_default_dtype(default_dtype)
+
+        missing_keys, unexpected_keys, mismatched_keys, error_msgs = self._load_checkpoint(
+            new_state_dict,
+            load_state_fn=load_state_fn,
+            ignore_mismatched_sizes=True,
+            _fast_init=True,
+        )
+
+        return {
+            'missing_keys': missing_keys,
+            'unexpected_keys': unexpected_keys,
+            'mismatched_keys': mismatched_keys,
+            'error_msgs': error_msgs,
+        }
+
+
+def get_onehot_labels(target, num_cls):
+    target = target.view(-1, 1)
+    size = target.shape[0]
+    target_oh = torch.zeros(size, num_cls).to(target)
+    target_oh.scatter_(dim=1, index=target, value=1)
+    return target_oh.view(size, num_cls).float()
+
+
+class ProtoNet(nn.Module):
+
+    def __init__(self, backbone_config, model_config):
+        super(ProtoNet, self).__init__()
+        self.bert = SbertModel(backbone_config)
+        args = _create_args(model_config, self.bert.config.hidden_size)
+        self.metrics_layer = MetricsLayer(args)
+        self.pooling = PoolingLayer(args)
+
+    def __call__(self, query, support, query_mask, support_mask,
+                 support_labels):
+        n_query = query.shape[0]
+
         num_cls = torch.max(support_labels) + 1
-        onehot_labels = self._get_onehot_labels(support_labels, n_support,
-                                                num_cls)
+        onehot_labels = get_onehot_labels(support_labels, num_cls)
 
         input_ids = torch.cat([query, support])
         input_mask = torch.cat([query_mask, support_mask], dim=0)
-        pooled_representation = self.forward_sentence_embedding({
+        pooled_representation = self.sentence_embedding({
             'input_ids':
             input_ids,
             'attention_mask':
@@ -269,29 +366,9 @@ class SbertForFaqQuestionAnswering(BaseTaskModel):
             scores = torch.sigmoid(logits)
         else:
             scores = logits
-        if 'labels' in input:
-            query_labels = input['labels']
-            loss = self._compute_loss(logits, query_labels, num_cls)
-            _, pred_labels = torch.max(scores, dim=1)
-            return FaqQuestionAnsweringOutput(
-                loss=loss, logits=scores).to_dict()
-        else:
-            return FaqQuestionAnsweringOutput(scores=scores)
+        return logits, scores
 
-    def _compute_loss(self, logits, target, num_cls):
-        from torch.nn import CrossEntropyLoss
-        logits = logits.view([-1, num_cls])
-        target = target.reshape(-1)
-        loss = CrossEntropyLoss(reduction='mean')(logits, target)
-        return loss
-
-    def _get_onehot_labels(self, labels, support_size, num_cls):
-        labels_ = labels.view(support_size, 1)
-        target_oh = torch.zeros(support_size, num_cls).to(labels)
-        target_oh.scatter_(dim=1, index=labels_, value=1)
-        return target_oh.view(support_size, num_cls).float()
-
-    def forward_sentence_embedding(self, inputs: Dict[str, Tensor]):
+    def sentence_embedding(self, inputs: Dict[str, Tensor]):
         input_ids = inputs['input_ids']
         input_mask = inputs['attention_mask']
         if not isinstance(input_ids, Tensor):
@@ -304,3 +381,273 @@ class SbertForFaqQuestionAnswering(BaseTaskModel):
             input_mask = input_mask.unsqueeze(-1)
         pooled_representation = self.pooling(last_hidden_states, input_mask)
         return pooled_representation
+
+
+class MGIMNNet(nn.Module):
+    # default use class_level_interaction only
+    INSTANCE_LEVEL_INTERACTION = 'instance_level_interaction'
+    EPISODE_LEVEL_INTERACTION = 'episode_level_interaction'
+
+    def __init__(self, backbone_config, model_config):
+        super(MGIMNNet, self).__init__()
+        self.bert = SbertModel(backbone_config)
+        self.model_config = model_config
+        self.alignment = Alignment()
+        hidden_size = self.bert.config.hidden_size
+        use_instance_level_interaction = self.safe_get(
+            self.INSTANCE_LEVEL_INTERACTION, True)
+        use_episode_level_interaction = self.safe_get(
+            self.EPISODE_LEVEL_INTERACTION, True)
+        output_size = 1 + int(use_instance_level_interaction) + int(
+            use_episode_level_interaction)
+        logger.info(
+            f'faq MGIMN model class-level-interaction:true, instance-level-interaction:{use_instance_level_interaction}, \
+            episode-level-interaction:{use_episode_level_interaction}')
+        self.fuse_proj = LinearProjection(
+            hidden_size + hidden_size * 3 * output_size,
+            hidden_size,
+            activation='relu')
+        args = _create_args(model_config, hidden_size)
+        self.pooling = PoolingLayer(args)
+        new_args = args._replace(pooling='avg')
+        self.avg_pooling = PoolingLayer(new_args)
+
+        self.instance_compare_layer = torch.nn.Sequential(
+            LinearProjection(hidden_size * 4, hidden_size, activation='relu'))
+
+        self.prediction = torch.nn.Sequential(
+            LinearProjection(hidden_size * 2, hidden_size, activation='relu'),
+            nn.Dropout(0), LinearProjection(hidden_size, 1))
+
+    def __call__(self, query, support, query_mask, support_mask,
+                 support_labels):
+        z_query, z_support = self.context_embedding(query, support, query_mask,
+                                                    support_mask)
+        n_cls = int(torch.max(support_labels)) + 1
+        n_query, sent_len = query.shape
+        n_support = support.shape[0]
+        k_shot = n_support // n_cls
+
+        q_params, s_params = {
+            'n_cls': n_cls,
+            'k_shot': k_shot
+        }, {
+            'n_query': n_query
+        }
+        if self.safe_get(self.INSTANCE_LEVEL_INTERACTION, True):
+            ins_z_query, ins_z_support = self._instance_level_interaction(
+                z_query, query_mask, z_support, support_mask)
+            q_params['ins_z_query'] = ins_z_query
+            s_params['ins_z_support'] = ins_z_support
+
+        cls_z_query, cls_z_support = self._class_level_interaction(
+            z_query, query_mask, z_support, support_mask, n_cls)
+        q_params['cls_z_query'] = cls_z_query
+        s_params['cls_z_support'] = cls_z_support
+        if self.safe_get(self.EPISODE_LEVEL_INTERACTION, True):
+            eps_z_query, eps_z_support = self._episode_level_interaction(
+                z_query, query_mask, z_support, support_mask)
+            q_params['eps_z_query'] = eps_z_query
+            s_params['eps_z_support'] = eps_z_support
+        fused_z_query = self._fuse_query(z_query, **q_params)
+        fused_z_support = self._fuse_support(z_support, **s_params)
+        query_mask_expanded = query_mask.unsqueeze(1).repeat(
+            1, n_support, 1).view(n_query * n_support, sent_len, 1)
+        support_mask_expanded = support_mask.unsqueeze(0).repeat(
+            n_query, 1, 1).view(n_query * n_support, sent_len, 1)
+        Q = self.pooling(fused_z_query, query_mask_expanded)
+        S = self.pooling(fused_z_support, support_mask_expanded)
+        matching_feature = self._instance_compare(Q, S, n_query, n_cls, k_shot)
+        logits = self.prediction(matching_feature)
+        logits = logits.view(n_query, n_cls)
+        return logits, torch.sigmoid(logits)
+
+    def _instance_compare(self, Q, S, n_query, n_cls, k_shot):
+        z_dim = Q.shape[-1]
+        S = S.view(n_query, n_cls * k_shot, z_dim)
+        Q = Q.view(n_query, k_shot * n_cls, z_dim)
+        cat_features = torch.cat([Q, S, Q * S, (Q - S).abs()], dim=-1)
+        instance_matching_feature = self.instance_compare_layer(cat_features)
+        instance_matching_feature = instance_matching_feature.view(
+            n_query, n_cls, k_shot, z_dim)
+        cls_matching_feature_mean = instance_matching_feature.mean(2)
+        cls_matching_feature_max, _ = instance_matching_feature.max(2)
+        cls_matching_feature = torch.cat(
+            [cls_matching_feature_mean, cls_matching_feature_max], dim=-1)
+        return cls_matching_feature
+
+    def _instance_level_interaction(self, z_query, query_mask, z_support,
+                                    support_mask):
+        n_query, sent_len, z_dim = z_query.shape
+        n_support = z_support.shape[0]
+        z_query = z_query.unsqueeze(1).repeat(1, n_support, 1,
+                                              1).view(n_query * n_support,
+                                                      sent_len, z_dim)
+        query_mask = query_mask.unsqueeze(1).repeat(1, n_support, 1).view(
+            n_query * n_support, sent_len, 1)
+        z_support = z_support.unsqueeze(0).repeat(n_query, 1, 1, 1).view(
+            n_query * n_support, sent_len, z_dim)
+        support_mask = support_mask.unsqueeze(0).repeat(n_query, 1, 1, 1).view(
+            n_query * n_support, sent_len, 1)
+        attn = self.alignment(z_query, z_support, query_mask, support_mask)
+        attn_a = F.softmax(attn, dim=1)
+        attn_b = F.softmax(attn, dim=2)
+        ins_support = torch.matmul(attn_a.transpose(1, 2), z_query)
+        ins_query = torch.matmul(attn_b, z_support)
+        return ins_query, ins_support
+
+    def _class_level_interaction(self, z_query, query_mask, z_support,
+                                 support_mask, n_cls):
+        z_support_ori = z_support
+        support_mask_ori = support_mask
+
+        n_query, sent_len, z_dim = z_query.shape
+        n_support = z_support.shape[0]
+        k_shot = n_support // n_cls
+
+        # class-based query encoding
+        z_query = z_query.unsqueeze(1).repeat(1, n_cls, 1,
+                                              1).view(n_query * n_cls,
+                                                      sent_len, z_dim)
+        query_mask = query_mask.unsqueeze(1).unsqueeze(-1).repeat(
+            1, n_cls, 1, 1).view(n_query * n_cls, sent_len, 1)
+        z_support = z_support.unsqueeze(0).repeat(n_query, 1, 1, 1).view(
+            n_query * n_cls, k_shot * sent_len, z_dim)
+        support_mask = support_mask.unsqueeze(0).unsqueeze(-1).repeat(
+            n_query, 1, 1, 1).view(n_query * n_cls, k_shot * sent_len, 1)
+        attn = self.alignment(z_query, z_support, query_mask, support_mask)
+        attn_b = F.softmax(attn, dim=2)
+        cls_query = torch.matmul(attn_b, z_support)
+        cls_query = cls_query.view(n_query, n_cls, sent_len, z_dim)
+
+        # class-based support encoding
+        z_support = z_support_ori.view(n_cls, k_shot * sent_len, z_dim)
+        support_mask = support_mask_ori.view(n_cls, k_shot * sent_len, 1)
+        attn = self.alignment(z_support, z_support, support_mask, support_mask)
+        attn_b = F.softmax(attn, dim=2)
+        cls_support = torch.matmul(attn_b, z_support)
+        cls_support = cls_support.view(n_cls * k_shot, sent_len, z_dim)
+        return cls_query, cls_support
+
+    def _episode_level_interaction(self, z_query, query_mask, z_support,
+                                   support_mask):
+        z_support_ori = z_support
+        support_mask_ori = support_mask
+
+        n_query, sent_len, z_dim = z_query.shape
+        n_support = z_support.shape[0]
+
+        # episode-based query encoding
+        query_mask = query_mask.view(n_query, sent_len, 1)
+        z_support = z_support.unsqueeze(0).repeat(n_query, 1, 1, 1).view(
+            n_query, n_support * sent_len, z_dim)
+        support_mask = support_mask.unsqueeze(0).unsqueeze(-1).repeat(
+            n_query, 1, 1, 1).view(n_query, n_support * sent_len, 1)
+        attn = self.alignment(z_query, z_support, query_mask, support_mask)
+        attn_b = F.softmax(attn, dim=2)
+        eps_query = torch.matmul(attn_b, z_support)
+
+        # episode-based support encoding
+        z_support2 = z_support_ori.view(1, n_support * sent_len,
+                                        z_dim).repeat(n_support, 1, 1)
+        support_mask = support_mask_ori.view(1, n_support * sent_len,
+                                             1).repeat(n_support, 1, 1)
+        attn = self.alignment(z_support_ori, z_support2,
+                              support_mask_ori.unsqueeze(-1), support_mask)
+        attn_b = F.softmax(attn, dim=2)
+        eps_support = torch.matmul(attn_b, z_support2)
+        eps_support = eps_support.view(n_support, sent_len, z_dim)
+        return eps_query, eps_support
+
+    def _fuse_query(self,
+                    x,
+                    n_cls,
+                    k_shot,
+                    ins_z_query=None,
+                    cls_z_query=None,
+                    eps_z_query=None):
+        n_query, sent_len, z_dim = x.shape
+        assert cls_z_query is not None
+        cls_features = cls_z_query.unsqueeze(2).repeat(
+            1, 1, k_shot, 1, 1).view(n_cls * k_shot * n_query, sent_len, z_dim)
+        x = x.unsqueeze(1).repeat(1, n_cls * k_shot, 1,
+                                  1).view(n_cls * k_shot * n_query, sent_len,
+                                          z_dim)
+        features = [
+            x, cls_features, x * cls_features, (x - cls_features).abs()
+        ]
+        if ins_z_query is not None:
+            features.extend(
+                [ins_z_query, ins_z_query * x, (ins_z_query - x).abs()])
+        if eps_z_query is not None:
+            eps_z_query = eps_z_query.unsqueeze(1).repeat(
+                1, n_cls * k_shot, 1, 1).view(n_cls * k_shot * n_query,
+                                              sent_len, z_dim)
+            features.extend(
+                [eps_z_query, eps_z_query * x, (eps_z_query - x).abs()])
+        features = torch.cat(features, dim=-1)
+        fusion_feat = self.fuse_proj(features)
+        return fusion_feat
+
+    def _fuse_support(self,
+                      x,
+                      n_query,
+                      ins_z_support=None,
+                      cls_z_support=None,
+                      eps_z_support=None):
+        assert cls_z_support is not None
+        n_support, sent_len, z_dim = x.shape
+        x = x.unsqueeze(0).repeat(n_query, 1, 1,
+                                  1).view(n_support * n_query, sent_len, z_dim)
+        cls_features = cls_z_support.unsqueeze(0).repeat(
+            n_query, 1, 1, 1).view(n_support * n_query, sent_len, z_dim)
+        features = [
+            x, cls_features, x * cls_features, (x - cls_features).abs()
+        ]
+        if ins_z_support is not None:
+            features.extend(
+                [ins_z_support, ins_z_support * x, (ins_z_support - x).abs()])
+        if eps_z_support is not None:
+            eps_z_support = eps_z_support.unsqueeze(0).repeat(
+                n_query, 1, 1, 1).view(n_query * n_support, sent_len, z_dim)
+            features.extend(
+                [eps_z_support, eps_z_support * x, (eps_z_support - x).abs()])
+        features = torch.cat(features, dim=-1)
+        fusion_feat = self.fuse_proj(features)
+        return fusion_feat
+
+    def context_embedding(self, query, support, query_mask, support_mask):
+        n_query = query.shape[0]
+        n_support = support.shape[0]
+        x = torch.cat([query, support], dim=0)
+        x_mask = torch.cat([query_mask, support_mask], dim=0)
+        last_hidden_state = self.bert(x, x_mask).last_hidden_state
+        z_dim = last_hidden_state.shape[-1]
+        sent_len = last_hidden_state.shape[-2]
+        z_query = last_hidden_state[:n_query].view([n_query, sent_len, z_dim])
+        z_support = last_hidden_state[n_query:].view(
+            [n_support, sent_len, z_dim])
+        return z_query, z_support
+
+    def sentence_embedding(self, inputs: Dict[str, Tensor]):
+        input_ids = inputs['input_ids']
+        input_mask = inputs['attention_mask']
+        if not isinstance(input_ids, Tensor):
+            input_ids = torch.IntTensor(input_ids)
+        if not isinstance(input_mask, Tensor):
+            input_mask = torch.IntTensor(input_mask)
+        rst = self.bert(input_ids, input_mask)
+        last_hidden_states = rst.last_hidden_state
+        if len(input_mask.shape) == 2:
+            input_mask = input_mask.unsqueeze(-1)
+        pooled_representation = self.avg_pooling(last_hidden_states,
+                                                 input_mask)
+        return pooled_representation
+
+    def safe_get(self, k, default=None):
+        try:
+            return self.model_config.get(k, default)
+        except Exception as e:
+            logger.debug(f'{k} not in model_config, use default:{default}')
+            logger.debug(e)
+            return default
diff --git a/modelscope/models/nlp/structbert/fill_mask.py b/modelscope/models/nlp/structbert/fill_mask.py
index 3554d0c7..4ded0d6c 100644
--- a/modelscope/models/nlp/structbert/fill_mask.py
+++ b/modelscope/models/nlp/structbert/fill_mask.py
@@ -156,57 +156,59 @@ class SbertForMaskedLM(SbertPreTrainedModel):
     ):
         r"""
         Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
+            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
 
-            Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
-            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
-            details.
+                Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
+                :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+                for details.
 
-            `What are input IDs? <../glossary.html#input-ids>`__
+                `What are input IDs? <../glossary.html#input-ids>`__
 
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+            attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
 
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
 
-            `What are attention masks? <../glossary.html#attention-mask>`__
+                `What are attention masks? <../glossary.html#attention-mask>`__
 
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
-            1]``:
+            token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
+                1]``:
 
-            - 0 corresponds to a `sentence A` token,
-            - 1 corresponds to a `sentence B` token.
+                - 0 corresponds to a `sentence A` token,
+                - 1 corresponds to a `sentence B` token.
 
-            `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
-            config.max_position_embeddings - 1]``.
+                `What are token type IDs? <../glossary.html#token-type-ids>`_
+            position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
+                ``[0, config.max_position_embeddings - 1]``.
 
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+            head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`,
+                `optional`):
+                Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
 
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
 
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
-            vectors than the model's internal embedding lookup matrix.
-        output_attentions (:obj:`bool`, `optional`):
-            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
-            tensors for more detail.
-        output_hidden_states (:obj:`bool`, `optional`):
-            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
-            more detail.
-        return_dict (:obj:`bool`, `optional`):
-            Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
-            config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
-            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+            inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
+                `optional`):
+                Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
+                representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+                into associated vectors than the model's internal embedding lookup matrix.
+            output_attentions (:obj:`bool`, `optional`):
+                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+                returned tensors for more detail.
+            output_hidden_states (:obj:`bool`, `optional`):
+                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+                for more detail.
+            return_dict (:obj:`bool`, `optional`):
+                Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
+                config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
+                (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
 
         Returns:
             Returns `modelscope.outputs.AttentionFillMaskModelOutput`
diff --git a/modelscope/models/nlp/structbert/text_classification.py b/modelscope/models/nlp/structbert/text_classification.py
index f0f0c440..d1cdb01d 100644
--- a/modelscope/models/nlp/structbert/text_classification.py
+++ b/modelscope/models/nlp/structbert/text_classification.py
@@ -113,52 +113,54 @@ class SbertForSequenceClassification(SbertPreTrainedModel):
                 **kwargs):
         r"""
         Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
+            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
 
-            Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
-            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
-            details.
+                Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
+                :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+                for details.
 
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+            attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
 
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
 
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
-            1]``:
+            token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
+                1]``:
 
-            - 0 corresponds to a `sentence A` token,
-            - 1 corresponds to a `sentence B` token.
+                - 0 corresponds to a `sentence A` token,
+                - 1 corresponds to a `sentence B` token.
 
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
-            config.max_position_embeddings - 1]``.
+            position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
+                ``[0, config.max_position_embeddings - 1]``.
 
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+            head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`,
+                `optional`):
+                Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
 
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
 
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
-            vectors than the model's internal embedding lookup matrix.
-        output_attentions (:obj:`bool`, `optional`):
-            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
-            tensors for more detail.
-        output_hidden_states (:obj:`bool`, `optional`):
-            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
-            more detail.
-        return_dict (:obj:`bool`, `optional`):
-            Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
-            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
-            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
-            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+            inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
+                `optional`):
+                Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
+                representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+                into associated vectors than the model's internal embedding lookup matrix.
+            output_attentions (:obj:`bool`, `optional`):
+                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+                returned tensors for more detail.
+            output_hidden_states (:obj:`bool`, `optional`):
+                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+                for more detail.
+            return_dict (:obj:`bool`, `optional`):
+                Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+                Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
+                config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed
+                (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
 
         Returns:
             Returns `modelscope.outputs.AttentionTextClassificationModelOutput`
diff --git a/modelscope/models/nlp/structbert/token_classification.py b/modelscope/models/nlp/structbert/token_classification.py
index 8bfd46bc..4ee2dda7 100644
--- a/modelscope/models/nlp/structbert/token_classification.py
+++ b/modelscope/models/nlp/structbert/token_classification.py
@@ -110,62 +110,62 @@ class SbertForTokenClassification(SbertPreTrainedModel):
     ):
         r"""
         Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
+            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
 
-            Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
-            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
-            details.
+                Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
+                :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+                for details.
 
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+            attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
 
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
 
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
-            1]``:
+            token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
+                1]``:
 
-            - 0 corresponds to a `sentence A` token,
-            - 1 corresponds to a `sentence B` token.
+                - 0 corresponds to a `sentence A` token,
+                - 1 corresponds to a `sentence B` token.
 
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
-            config.max_position_embeddings - 1]``.
+            position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
+                ``[0, config.max_position_embeddings - 1]``.
 
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+            head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`,
+                `optional`):
+                Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
 
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
 
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
-            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
-            vectors than the model's internal embedding lookup matrix.
-        output_attentions (:obj:`bool`, `optional`):
-            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
-            tensors for more detail.
-        output_hidden_states (:obj:`bool`, `optional`):
-            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
-            more detail.
-        return_dict (:obj:`bool`, `optional`):
-            Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
-            1]``.
-        offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size,
-        sequence_length)`, `optional`):
-            Indices of positions of each input sequence tokens in the sentence.
-            Selected in the range ``[0, sequence_length - 1]``.
-        label_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size,
-        sequence_length)`, `optional`):
-            Mask to avoid performing attention on padding token indices. Mask
-            values selected in ``[0, 1]``:
+            inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
+                `optional`):
+                Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
+                representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
+                into associated vectors than the model's internal embedding lookup matrix.
+            output_attentions (:obj:`bool`, `optional`):
+                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+                returned tensors for more detail.
+            output_hidden_states (:obj:`bool`, `optional`):
+                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+                for more detail.
+            return_dict (:obj:`bool`, `optional`):
+                Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
+            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Labels for computing the token classification loss. Indices should be in
+                ``[0, ..., config.num_labels - 1]``.
+            offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Indices of positions of each input sequence tokens in the sentence.
+                Selected in the range ``[0, sequence_length - 1]``.
+            label_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on padding token indices. Mask
+                values selected in ``[0, 1]``:
 
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
 
         Returns:
             Returns `modelscope.outputs.AttentionTokenClassificationModelOutput`
diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py
index aaea718e..cd8ca926 100644
--- a/modelscope/models/nlp/task_models/__init__.py
+++ b/modelscope/models/nlp/task_models/__init__.py
@@ -4,37 +4,27 @@ from typing import TYPE_CHECKING
 from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
-    from .information_extraction import InformationExtractionModel
-    from .feature_extraction import FeatureExtractionModel
-    from .fill_mask import FillMaskModel
-    from .nncrf_for_named_entity_recognition import (
-        LSTMCRFForNamedEntityRecognition,
-        LSTMCRFForWordSegmentation,
-        LSTMCRFForPartOfSpeech,
-        TransformerCRFForNamedEntityRecognition,
-        TransformerCRFForWordSegmentation,
-    )
-    from .sequence_classification import SequenceClassificationModel
+    from .information_extraction import ModelForInformationExtraction
+    from .feature_extraction import ModelForFeatureExtraction
+    from .fill_mask import ModelForFillMask
+    from .text_classification import ModelForTextClassification
     from .task_model import SingleBackboneTaskModelBase
-    from .token_classification import TokenClassificationModel
-    from .text_generation import TaskModelForTextGeneration
+    from .token_classification import (ModelForTokenClassification,
+                                       ModelForTokenClassificationWithCRF)
+    from .text_generation import ModelForTextGeneration
+    from .text_ranking import ModelForTextRanking
 
 else:
     _import_structure = {
-        'information_extraction': ['InformationExtractionModel'],
-        'feature_extraction': ['FeatureExtractionModel'],
-        'fill_mask': ['FillMaskModel'],
-        'nncrf_for_named_entity_recognition': [
-            'LSTMCRFForNamedEntityRecognition',
-            'LSTMCRFForWordSegmentation',
-            'LSTMCRFForPartOfSpeech',
-            'TransformerCRFForNamedEntityRecognition',
-            'TransformerCRFForWordSegmentation',
-        ],
-        'sequence_classification': ['SequenceClassificationModel'],
+        'information_extraction': ['ModelForInformationExtraction'],
+        'feature_extraction': ['ModelForFeatureExtraction'],
+        'fill_mask': ['ModelForFillMask'],
+        'text_classification': ['ModelForTextClassification'],
         'task_model': ['SingleBackboneTaskModelBase'],
-        'token_classification': ['TokenClassificationModel'],
-        'text_generation': ['TaskModelForTextGeneration'],
+        'token_classification':
+        ['ModelForTokenClassification', 'ModelForTokenClassificationWithCRF'],
+        'text_generation': ['ModelForTextGeneration'],
+        'text_ranking': ['ModelForTextRanking'],
     }
 
     import sys
diff --git a/modelscope/models/nlp/task_models/feature_extraction.py b/modelscope/models/nlp/task_models/feature_extraction.py
index f6214e9c..e39f5d8d 100644
--- a/modelscope/models/nlp/task_models/feature_extraction.py
+++ b/modelscope/models/nlp/task_models/feature_extraction.py
@@ -5,33 +5,105 @@ import numpy as np
 
 from modelscope.metainfo import TaskModels
 from modelscope.models.builder import MODELS
-from modelscope.models.nlp.task_models.task_model import \
-    SingleBackboneTaskModelBase
+from modelscope.models.nlp.task_models.task_model import EncoderModel
 from modelscope.outputs import FeatureExtractionOutput, OutputKeys
 from modelscope.utils.constant import Tasks
 
-__all__ = ['FeatureExtractionModel']
+__all__ = ['ModelForFeatureExtraction']
 
 
 @MODELS.register_module(
     Tasks.feature_extraction, module_name=TaskModels.feature_extraction)
-class FeatureExtractionModel(SingleBackboneTaskModelBase):
+class ModelForFeatureExtraction(EncoderModel):
+    task = Tasks.feature_extraction
 
-    def __init__(self, model_dir: str, *args, **kwargs):
-        """initialize the fill mask model from the `model_dir` path.
+    # There is no head for feature extraction method
+    head_type = None
 
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,
+                inputs_embeds=None,
+                labels=None,
+                output_attentions=None,
+                output_hidden_states=None,
+                return_dict=None,
+                **kwargs):
+        r"""
         Args:
-            model_dir (str): the model path.
+        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
+            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+            details.
+
+        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
+            1]``:
+
+            - 0 corresponds to a `sentence A` token,
+            - 1 corresponds to a `sentence B` token.
+
+        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
+            config.max_position_embeddings - 1]``.
+
+        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+            vectors than the model's internal embedding lookup matrix.
+        output_attentions (:obj:`bool`, `optional`):
+            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+            tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`):
+            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+            more detail.
+        return_dict (:obj:`bool`, `optional`):
+            Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
+            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Returns:
+            Returns `modelscope.outputs.AttentionTextClassificationModelOutput`
+
+        Examples:
+            >>> from modelscope.models import Model
+            >>> from modelscope.preprocessors import Preprocessor
+            >>> model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base')
+            >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base')
+            >>> print(model(**preprocessor(('This is a test', 'This is also a test'))))
         """
-        super().__init__(model_dir, *args, **kwargs)
-        if 'base_model_prefix' in kwargs:
-            self._base_model_prefix = kwargs['base_model_prefix']
 
-        self.build_backbone(self.backbone_cfg)
-
-    def forward(self, **input: Dict[str, Any]) -> FeatureExtractionOutput:
         # backbone do not need labels, only head need for loss compute
-        input.pop(OutputKeys.LABELS, None)
-        outputs = super().forward(input)
-        sequence_output = outputs.last_hidden_state
-        return FeatureExtractionOutput(text_embedding=sequence_output)
+        feature = self.extract_feature(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        return FeatureExtractionOutput(
+            text_embedding=feature.last_hidden_state)
diff --git a/modelscope/models/nlp/task_models/fill_mask.py b/modelscope/models/nlp/task_models/fill_mask.py
index 0f7d3345..b96e86c5 100644
--- a/modelscope/models/nlp/task_models/fill_mask.py
+++ b/modelscope/models/nlp/task_models/fill_mask.py
@@ -2,47 +2,73 @@
 from typing import Any, Dict
 
 import numpy as np
+import torch
 
-from modelscope.metainfo import TaskModels
+from modelscope.metainfo import Heads, TaskModels
 from modelscope.models.builder import MODELS
-from modelscope.models.nlp.bert import BertConfig
-from modelscope.models.nlp.task_models.task_model import \
-    SingleBackboneTaskModelBase
-from modelscope.outputs import OutputKeys
+from modelscope.models.nlp.task_models.task_model import EncoderModel
 from modelscope.utils.constant import Tasks
-from modelscope.utils.hub import parse_label_mapping
 
-__all__ = ['FillMaskModel']
+__all__ = ['ModelForFillMask']
 
 
 @MODELS.register_module(Tasks.fill_mask, module_name=TaskModels.fill_mask)
-class FillMaskModel(SingleBackboneTaskModelBase):
+class ModelForFillMask(EncoderModel):
+    task = Tasks.fill_mask
 
-    def __init__(self, model_dir: str, *args, **kwargs):
-        """initialize the fill mask model from the `model_dir` path.
+    # The default base head type is fill-mask for this head
+    head_type = Heads.fill_mask
 
-        Args:
-            model_dir (str): the model path.
-        """
-        super().__init__(model_dir, *args, **kwargs)
-        if 'base_model_prefix' in kwargs:
-            self._base_model_prefix = kwargs['base_model_prefix']
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+    _keys_to_ignore_on_load_missing = [
+        r'position_ids', r'predictions.decoder.bias'
+    ]
 
-        self.build_backbone(self.backbone_cfg)
-        self.build_head(self.head_cfg)
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,
+                inputs_embeds=None,
+                labels=None,
+                output_attentions=None,
+                output_hidden_states=None,
+                return_dict=None,
+                **kwargs):
+        outputs = super().forward(input_ids, attention_mask, token_type_ids,
+                                  position_ids, head_mask, inputs_embeds,
+                                  labels, output_attentions,
+                                  output_hidden_states, **kwargs)
 
-    def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]:
-
-        # backbone do not need labels, only head need for loss compute
-        labels = input.pop(OutputKeys.LABELS, None)
-
-        outputs = super().forward(input)
-        sequence_output = outputs.last_hidden_state
-        outputs = self.head.forward(sequence_output)
-
-        if labels is not None:
-            input[OutputKeys.LABELS] = labels
-            loss = self.compute_loss(outputs, labels)
-            outputs.update(loss)
-        outputs[OutputKeys.INPUT_IDS] = input[OutputKeys.INPUT_IDS]
+        outputs.input_ids = input_ids
         return outputs
+
+    def get_output_embeddings(self):
+        return self.head.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.head.cls.predictions.decoder = new_embeddings
+
+    def prepare_inputs_for_generation(self,
+                                      input_ids,
+                                      attention_mask=None,
+                                      **model_kwargs):
+        input_shape = input_ids.shape
+        effective_batch_size = input_shape[0]
+
+        #  add a dummy token
+        if self.config.pad_token_id is None:
+            raise ValueError('The PAD token should be defined for generation')
+        attention_shape0 = attention_mask.shape[0]
+        attention_mask = torch.cat(
+            [attention_mask,
+             attention_mask.new_zeros((attention_shape0, 1))],
+            dim=-1)
+        dummy_token = torch.full((effective_batch_size, 1),
+                                 self.config.pad_token_id,
+                                 dtype=torch.long,
+                                 device=input_ids.device)
+        input_ids = torch.cat([input_ids, dummy_token], dim=1)
+
+        return {'input_ids': input_ids, 'attention_mask': attention_mask}
diff --git a/modelscope/models/nlp/task_models/information_extraction.py b/modelscope/models/nlp/task_models/information_extraction.py
index 3a8380a6..0eb14277 100644
--- a/modelscope/models/nlp/task_models/information_extraction.py
+++ b/modelscope/models/nlp/task_models/information_extraction.py
@@ -3,14 +3,12 @@ from typing import Any, Dict
 
 import numpy as np
 
-from modelscope.metainfo import TaskModels
+from modelscope.metainfo import Heads, TaskModels
 from modelscope.models.builder import MODELS
-from modelscope.models.nlp.task_models.task_model import \
-    SingleBackboneTaskModelBase
-from modelscope.outputs import InformationExtractionOutput, OutputKeys
+from modelscope.models.nlp.task_models.task_model import EncoderModel
 from modelscope.utils.constant import Tasks
 
-__all__ = ['InformationExtractionModel']
+__all__ = ['ModelForInformationExtraction']
 
 
 @MODELS.register_module(
@@ -18,22 +16,8 @@ __all__ = ['InformationExtractionModel']
     module_name=TaskModels.information_extraction)
 @MODELS.register_module(
     Tasks.relation_extraction, module_name=TaskModels.information_extraction)
-class InformationExtractionModel(SingleBackboneTaskModelBase):
+class ModelForInformationExtraction(EncoderModel):
+    task = Tasks.information_extraction
 
-    def __init__(self, model_dir: str, *args, **kwargs):
-        """initialize the information extraction model from the `model_dir` path.
-
-        Args:
-            model_dir (str): the model path.
-        """
-        super().__init__(model_dir, *args, **kwargs)
-
-        self.build_backbone(self.backbone_cfg)
-        self.build_head(self.head_cfg)
-
-    def forward(self, **input: Dict[str, Any]) -> InformationExtractionOutput:
-        outputs = super().forward(input)
-        sequence_output = outputs.last_hidden_state
-        outputs = self.head.forward(sequence_output, input['text'],
-                                    input['offsets'])
-        return InformationExtractionOutput(spo_list=outputs)
+    # The default base head type is fill-mask for this head
+    head_type = Heads.information_extraction
diff --git a/modelscope/models/nlp/task_models/sequence_classification.py b/modelscope/models/nlp/task_models/sequence_classification.py
deleted file mode 100644
index 6c0c09a2..00000000
--- a/modelscope/models/nlp/task_models/sequence_classification.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from typing import Any, Dict
-
-import numpy as np
-
-from modelscope.metainfo import TaskModels
-from modelscope.models.builder import MODELS
-from modelscope.models.nlp.task_models.task_model import \
-    SingleBackboneTaskModelBase
-from modelscope.outputs import OutputKeys
-from modelscope.utils.constant import Tasks
-from modelscope.utils.hub import parse_label_mapping
-
-__all__ = ['SequenceClassificationModel']
-
-
-@MODELS.register_module(
-    Tasks.text_classification, module_name=TaskModels.text_classification)
-class SequenceClassificationModel(SingleBackboneTaskModelBase):
-
-    def __init__(self, model_dir: str, *args, **kwargs):
-        """initialize the sequence classification model from the `model_dir` path.
-
-        Args:
-            model_dir (str): the model path.
-        """
-        super().__init__(model_dir, *args, **kwargs)
-        if 'base_model_prefix' in kwargs:
-            self._base_model_prefix = kwargs['base_model_prefix']
-
-        # get the num_labels from label_mapping.json
-        self.id2label = {}
-        # get the num_labels
-        num_labels = kwargs.get('num_labels')
-        if num_labels is None:
-            label2id = parse_label_mapping(model_dir)
-            if label2id is not None and len(label2id) > 0:
-                num_labels = len(label2id)
-            self.id2label = {id: label for label, id in label2id.items()}
-        self.head_cfg['num_labels'] = num_labels
-
-        self.build_backbone(self.backbone_cfg)
-        self.build_head(self.head_cfg)
-
-    def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]:
-        # backbone do not need labels, only head need for loss compute
-        labels = input.pop(OutputKeys.LABELS, None)
-
-        outputs = super().forward(input)
-        pooled_output = outputs.pooler_output
-        outputs = self.head.forward(pooled_output)
-        if labels is not None:
-            input[OutputKeys.LABELS] = labels
-            loss = self.compute_loss(outputs, labels)
-            outputs.update(loss)
-        return outputs
diff --git a/modelscope/models/nlp/task_models/task_model.py b/modelscope/models/nlp/task_models/task_model.py
index 0c02f8d2..02b54896 100644
--- a/modelscope/models/nlp/task_models/task_model.py
+++ b/modelscope/models/nlp/task_models/task_model.py
@@ -3,21 +3,23 @@ import os.path
 import re
 from abc import ABC
 from collections import OrderedDict
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 import torch
 from torch import nn
 
 from modelscope.models.base import TorchModel
 from modelscope.models.builder import build_backbone, build_head
-from modelscope.utils.config import ConfigDict
-from modelscope.utils.constant import Fields, Tasks
+from modelscope.outputs import OutputKeys
+from modelscope.utils.checkpoint import load_task_model_checkpoint
+from modelscope.utils.config import Config, ConfigDict
+from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
 from modelscope.utils.file_utils import func_receive_dict_inputs
 from modelscope.utils.logger import get_logger
 
 logger = get_logger()
 
-__all__ = ['EncoderDecoderTaskModelBase', 'SingleBackboneTaskModelBase']
+__all__ = ['EncoderModel', 'SingleBackboneTaskModelBase']
 
 
 def _repr(modules, depth=1):
@@ -107,7 +109,8 @@ class BaseTaskModel(TorchModel, ABC):
 
         """
         # TODO Sharded ckpt
-        ckpt_file = os.path.join(model_local_dir, 'pytorch_model.bin')
+        ckpt_file = os.path.join(model_local_dir,
+                                 ModelFile.TORCH_MODEL_BIN_FILE)
         state_dict = torch.load(ckpt_file, map_location='cpu')
         if default_dtype is not None:
             torch.set_default_dtype(default_dtype)
@@ -199,7 +202,7 @@ class BaseTaskModel(TorchModel, ABC):
                 ]
 
         if _fast_init:
-            # retrieve unintialized modules and initialize
+            # retrieve uninitialized modules and initialize
             uninitialized_modules = self.retrieve_modules_from_names(
                 missing_keys,
                 prefix=prefix,
@@ -321,10 +324,10 @@ class BaseTaskModel(TorchModel, ABC):
                 f'Some weights of the model checkpoint were not used when'
                 f' initializing {self.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are'
                 f' initializing {self.__class__.__name__} from the checkpoint of a model trained on another task or'
-                ' with another architecture (e.g. initializing a BertForSequenceClassification model from a'
+                ' with another architecture (e.g. initializing a BertForTokenClassification model from a'
                 ' BertForPreTraining model).\n- This IS NOT expected if you are initializing'
                 f' {self.__class__.__name__} from the checkpoint of a model that you expect to be exactly identical'
-                ' (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).'
+                ' (initializing a BertForTokenClassification model from a BertForTokenClassification model).'
             )
         else:
             logger.info(
@@ -452,59 +455,241 @@ class SingleBackboneTaskModelBase(BaseTaskModel):
         return sequence_output, pooled_output
 
 
-class EncoderDecoderTaskModelBase(BaseTaskModel):
+class EncoderModel(TorchModel):
     """
-    This is the base class of encoder-decoder nlp task classes.
+    This is the base class of any encoder nlp task classes.
     """
-    # The encoder backbone prefix, default to "encoder"
-    _encoder_prefix = 'encoder'
-    # The decoder backbone prefix, default to "decoder"
-    _decoder_prefix = 'decoder'
-    # The key in cfg specifing the encoder type
-    _encoder_key_in_cfg = 'encoder_type'
-    # The key in cfg specifing the decoder type
-    _decoder_key_in_cfg = 'decoder_type'
+    # keys to ignore when load missing
+    _keys_to_ignore_on_load_missing = None
+    # keys to ignore when load unexpected
+    _keys_to_ignore_on_load_unexpected = None
+    # The encoder prefix defaults to "encoder"
+    base_model_prefix = 'encoder'
+    # The default backbone model type is None, should be bert/T5
+    base_model_type = None
+    # The head prefix defaults to "head"
+    head_prefix = 'head'
+    # The head type defaults as None
+    head_type = None
+    # override base model prefix by task model prefix
+    override_base_model_prefix = False
+    # override base model type by task model type
+    override_base_model_type = False
 
     def __init__(self, model_dir: str, *args, **kwargs):
         super().__init__(model_dir, *args, **kwargs)
+        self.config = ConfigDict(kwargs)
+        backbone_cfg = self.parse_encoder_cfg()
+        head_cfg = self.parse_head_cfg()
+        self.build_encoder(backbone_cfg)
+        if head_cfg.type is not None:
+            self.build_head(head_cfg)
 
-    def build_encoder(self):
-        encoder = build_backbone(
-            self.config,
-            type_name=self._encoder_key_in_cfg,
-            task_name=Tasks.backbone)
-        setattr(self, self._encoder_prefix, encoder)
-        return encoder
+    def __repr__(self):
+        # only log backbone and head name
+        depth = 1
+        return _repr(self, depth)
 
-    def build_decoder(self):
-        decoder = build_backbone(
-            self.config,
-            type_name=self._decoder_key_in_cfg,
-            task_name=Tasks.backbone)
-        setattr(self, self._decoder_prefix, decoder)
-        return decoder
+    def _get_transformer_config(self):
+        transformer_config_file = os.path.join(self.model_dir,
+                                               ModelFile.CONFIG)
+        transformer_config = None
+        if os.path.exists(transformer_config_file):
+            transformer_config = Config.from_file(transformer_config_file)
+        return transformer_config.copy()
+
+    def _use_transformer_config(self, cfg):
+        if 'model_type' not in cfg and 'type' not in cfg:
+            return True
+        else:
+            return False
+
+    def parse_encoder_cfg(self):
+        # get encoder from backbone-head configuration format
+        encoder_cfg = self.config.get('backbone', None)
+        if encoder_cfg is None:
+            encoder_cfg = self.config.copy()
+            if 'model_type' in encoder_cfg and 'type' not in encoder_cfg:
+                encoder_cfg.type = encoder_cfg.model_type
+            elif self._use_transformer_config(encoder_cfg):
+                encoder_cfg = self._get_transformer_config()
+                encoder_cfg.type = encoder_cfg.model_type
+
+        if 'type' not in encoder_cfg or self.override_base_model_type:
+            encoder_cfg.type = self.base_model_type
+        if encoder_cfg.type is None:
+            raise KeyError(
+                'Missing encoder type, please explicit define encoder type in configuration.json'
+            )
+        encoder_cfg.model_dir = self.model_dir
+        return encoder_cfg
+
+    def parse_head_cfg(self):
+        head_cfg = self.config.get('head', None)
+        if head_cfg is None:
+            head_cfg = self.config.copy()
+            if 'head_type' in head_cfg and 'type' not in head_cfg:
+                head_cfg.type = head_cfg.head_type
+            elif self._use_transformer_config(head_cfg):
+                head_cfg = self._get_transformer_config()
+                head_cfg.type = self.head_type
+        if 'type' not in head_cfg:
+            head_cfg.type = self.head_type
+        return head_cfg
+
+    def build_encoder(self, cfg):
+        backbone = build_backbone(cfg)
+        if 'prefix' in cfg:
+            self.base_model_prefix = cfg['prefix']
+        elif 'base_model_prefix' in cfg:
+            self.base_model_prefix = cfg['base_model_prefix']
+        elif hasattr(backbone, 'base_model_prefix') \
+                and not self.override_base_model_prefix:
+            self.base_model_prefix = backbone.base_model_prefix
+        setattr(self, self.base_model_prefix, backbone)
+
+    def build_head(self, cfg):
+        if cfg is None:
+            raise ValueError(
+                'Head config is missing, check if this was a backbone-only model'
+            )
+        head = build_head(cfg, task_name=self.group_key)
+        setattr(self, self.head_prefix, head)
 
     @property
-    def encoder_(self):
-        return getattr(self, self._encoder_prefix)
+    def encoder(self):
+        if 'encoder' != self.base_model_prefix:
+            return getattr(self, self.base_model_prefix)
+        return super().__getattr__('encoder')
 
     @property
-    def decoder_(self):
-        return getattr(self, self._decoder_prefix)
+    def head(self):
+        if 'head' != self.head_prefix:
+            return getattr(self, self.head_prefix)
+        return super().__getattr__('head')
 
-    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
-        if func_receive_dict_inputs(self.encoder_.forward):
-            encoder_outputs = self.encoder_.forward(input)
+    def extract_feature(self, **input: Dict[str, Any]) -> Dict[str, Any]:
+        """default forward method is the backbone-only forward"""
+        if func_receive_dict_inputs(self.encoder.forward):
+            outputs = self.encoder.forward(input)
         else:
-            encoder_outputs = self.encoder_.forward(**input)
-        decoder_inputs = self.project_decoder_inputs_and_mediate(
-            input, encoder_outputs)
-        if func_receive_dict_inputs(self.decoder_.forward):
-            outputs = self.decoder_.forward(decoder_inputs)
-        else:
-            outputs = self.decoder_.forward(**decoder_inputs)
-
+            outputs = self.encoder.forward(**input)
         return outputs
 
-    def project_decoder_inputs_and_mediate(self, input, encoder_outputs):
-        return {**input, **encoder_outputs}
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,
+                inputs_embeds=None,
+                labels=None,
+                output_attentions=None,
+                output_hidden_states=None,
+                return_dict=None,
+                *args,
+                **kwargs):
+        r"""
+        Args:
+        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
+            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+            details.
+
+        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
+            1]``:
+
+            - 0 corresponds to a `sentence A` token,
+            - 1 corresponds to a `sentence B` token.
+
+        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
+            config.max_position_embeddings - 1]``.
+
+        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+            vectors than the model's internal embedding lookup matrix.
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
+            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+         output_attentions (:obj:`bool`, `optional`):
+            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+            tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`):
+            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+            more detail.
+        return_dict (:obj:`bool`, `optional`):
+            Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
+        *args:
+            In Torch 1.11 onnx has a bug in the _slow_forward method, could only keep *args solving the problem
+        **kwargs:
+            Accept additional kwargs in the children class
+
+        Returns:
+            Returns `modelscope.outputs.ModelOutput`
+
+        Examples:
+            >>> from modelscope.models import Model
+            >>> from modelscope.preprocessors import Preprocessor
+            >>> model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base')
+            >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base')
+            >>> print(model(**preprocessor(('This is a test', 'This is also a test'))))
+        """
+
+        if OutputKeys.LABEL in kwargs and labels is None:
+            labels = kwargs.pop(OutputKeys.LABEL, None)
+        feature = self.extract_feature(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        outputs = self.head.forward(feature, attention_mask, labels, **kwargs)
+        return outputs
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        model_dir = kwargs.get('model_dir')
+        model = cls(**kwargs)
+        model_load_handler = load_task_model_checkpoint(
+            model_to_load=model, model_local_dir=model_dir, **kwargs)
+        return model_load_handler['model']
+
+    @classmethod
+    def from_pretrained(cls,
+                        model_name_or_path: str,
+                        revision: Optional[str] = DEFAULT_MODEL_REVISION,
+                        cfg_dict: Config = None,
+                        device: str = None,
+                        **kwargs):
+        task = kwargs.pop('task', None)
+        return super(TorchModel, cls).from_pretrained(
+            model_name_or_path=model_name_or_path,
+            revision=revision,
+            cfg_dict=cfg_dict,
+            devic=device,
+            task=task if task is not None else cls.task,
+            **kwargs)
diff --git a/modelscope/models/nlp/task_models/text_classification.py b/modelscope/models/nlp/task_models/text_classification.py
new file mode 100644
index 00000000..e7f311a6
--- /dev/null
+++ b/modelscope/models/nlp/task_models/text_classification.py
@@ -0,0 +1,51 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict
+
+import numpy as np
+
+from modelscope.metainfo import Heads, TaskModels
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.task_models.task_model import EncoderModel
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import parse_label_mapping
+
+__all__ = ['ModelForTextClassification']
+
+
+@MODELS.register_module(
+    Tasks.text_classification, module_name=TaskModels.text_classification)
+class ModelForTextClassification(EncoderModel):
+    task = Tasks.text_classification
+
+    # The default base head type is text-classification for this head
+    head_type = Heads.text_classification
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the sequence classification model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+        """
+        # get the num_labels from label_mapping.json
+        self.id2label = {}
+
+        # get the num_labels
+        num_labels = kwargs.get('num_labels')
+        if num_labels is None:
+            label2id = parse_label_mapping(model_dir)
+            if label2id is not None and len(label2id) > 0:
+                num_labels = len(label2id)
+            self.id2label = {id: label for label, id in label2id.items()}
+        kwargs['num_labels'] = num_labels
+        super().__init__(model_dir, *args, **kwargs)
+
+    def parse_head_cfg(self):
+        head_cfg = super().parse_head_cfg()
+        if hasattr(head_cfg, 'classifier_dropout'):
+            head_cfg['classifier_dropout'] = (
+                head_cfg.classifier_dropout if head_cfg['classifier_dropout']
+                is not None else head_cfg.hidden_dropout_prob)
+        else:
+            head_cfg['classifier_dropout'] = head_cfg.hidden_dropout_prob
+        head_cfg['num_labels'] = self.config.num_labels
+        return head_cfg
diff --git a/modelscope/models/nlp/task_models/text_generation.py b/modelscope/models/nlp/task_models/text_generation.py
index cd8e20cf..e09a25ff 100644
--- a/modelscope/models/nlp/task_models/text_generation.py
+++ b/modelscope/models/nlp/task_models/text_generation.py
@@ -12,12 +12,12 @@ from modelscope.outputs import (OutputKeys, TextGenerationModelOutput,
                                 TokenGeneratorOutput)
 from modelscope.utils.constant import Tasks
 
-__all__ = ['TaskModelForTextGeneration']
+__all__ = ['ModelForTextGeneration']
 
 
 @MODELS.register_module(
     Tasks.text_generation, module_name=TaskModels.text_generation)
-class TaskModelForTextGeneration(SingleBackboneTaskModelBase, PreTrainedModel):
+class ModelForTextGeneration(SingleBackboneTaskModelBase, PreTrainedModel):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         """initialize the text generation model from the `model_dir` path.
diff --git a/modelscope/models/nlp/task_models/text_ranking.py b/modelscope/models/nlp/task_models/text_ranking.py
new file mode 100644
index 00000000..e883d76c
--- /dev/null
+++ b/modelscope/models/nlp/task_models/text_ranking.py
@@ -0,0 +1,58 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict
+
+import numpy as np
+
+from modelscope.metainfo import Heads, TaskModels
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.task_models.task_model import EncoderModel
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import parse_label_mapping
+
+__all__ = ['ModelForTextRanking']
+
+
+@MODELS.register_module(
+    Tasks.text_ranking, module_name=TaskModels.text_ranking)
+class ModelForTextRanking(EncoderModel):
+    task = Tasks.text_ranking
+
+    # The default base head type is text-ranking for this head
+    head_type = Heads.text_ranking
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the sequence classification model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+        """
+        # get the num_labels from label_mapping.json
+        self.id2label = {}
+
+        # get the num_labels
+        num_labels = kwargs.get('num_labels')
+        if num_labels is None:
+            label2id = parse_label_mapping(model_dir)
+            if label2id is not None and len(label2id) > 0:
+                num_labels = len(label2id)
+                self.id2label = {id: label for label, id in label2id.items()}
+            elif label2id is None:
+                num_labels = 1
+        kwargs['num_labels'] = num_labels
+        super().__init__(model_dir, *args, **kwargs)
+
+    def parse_encoder_cfg(self):
+        encoder_cfg = super().parse_encoder_cfg()
+        encoder_cfg['add_pooling_layer'] = True
+        return encoder_cfg
+
+    def parse_head_cfg(self):
+        head_cfg = super().parse_head_cfg()
+        if hasattr(head_cfg, 'classifier_dropout'):
+            head_cfg['classifier_dropout'] = (
+                head_cfg.classifier_dropout if head_cfg['classifier_dropout']
+                is not None else head_cfg.hidden_dropout_prob)
+        else:
+            head_cfg['classifier_dropout'] = head_cfg.hidden_dropout_prob
+        head_cfg['num_labels'] = self.config.num_labels
+        return head_cfg
diff --git a/modelscope/models/nlp/task_models/token_classification.py b/modelscope/models/nlp/task_models/token_classification.py
index 40543dc8..aa84eaf0 100644
--- a/modelscope/models/nlp/task_models/token_classification.py
+++ b/modelscope/models/nlp/task_models/token_classification.py
@@ -3,16 +3,15 @@ from typing import Any, Dict
 
 import torch
 
-from modelscope.metainfo import Models, TaskModels
+from modelscope.metainfo import Heads, Models, TaskModels
 from modelscope.models.builder import MODELS
-from modelscope.models.nlp.task_models.task_model import \
-    SingleBackboneTaskModelBase
+from modelscope.models.nlp.task_models.task_model import EncoderModel
 from modelscope.outputs import (AttentionTokenClassificationModelOutput,
                                 OutputKeys)
 from modelscope.utils.constant import Tasks
 from modelscope.utils.hub import parse_label_mapping
 
-__all__ = ['TokenClassificationModel']
+__all__ = ['ModelForTokenClassification', 'ModelForTokenClassificationWithCRF']
 
 
 @MODELS.register_module(
@@ -22,17 +21,24 @@ __all__ = ['TokenClassificationModel']
 @MODELS.register_module(
     Tasks.named_entity_recognition,
     module_name=Models.token_classification_for_ner)
-class TokenClassificationModel(SingleBackboneTaskModelBase):
+class ModelForTokenClassification(EncoderModel):
+    task = Tasks.token_classification
+
+    # The default base head type is token-classification for this head
+    head_type = Heads.token_classification
+
+    # The default base model prefix for this task is encoder and ignore the base model prefix
+    base_model_prefix = 'encoder'
+    override_base_model_prefix = True
 
     def __init__(self, model_dir: str, *args, **kwargs):
-        """initialize the token classification model from the `model_dir` path.
+        """initialize the sequence classification model from the `model_dir` path.
 
         Args:
             model_dir (str): the model path.
         """
-        super().__init__(model_dir, *args, **kwargs)
-        if 'base_model_prefix' in kwargs:
-            self._base_model_prefix = kwargs['base_model_prefix']
+        # get the num_labels from label_mapping.json
+        self.id2label = {}
 
         # get the num_labels
         num_labels = kwargs.get('num_labels')
@@ -41,35 +47,82 @@ class TokenClassificationModel(SingleBackboneTaskModelBase):
             if label2id is not None and len(label2id) > 0:
                 num_labels = len(label2id)
             self.id2label = {id: label for label, id in label2id.items()}
-        self.head_cfg['num_labels'] = num_labels
+        kwargs['num_labels'] = num_labels
+        super().__init__(model_dir, *args, **kwargs)
 
-        self.build_backbone(self.backbone_cfg)
-        self.build_head(self.head_cfg)
+    def parse_head_cfg(self):
+        head_cfg = super().parse_head_cfg()
+        if hasattr(head_cfg, 'classifier_dropout'):
+            head_cfg['classifier_dropout'] = (
+                head_cfg.classifier_dropout if head_cfg['classifier_dropout']
+                is not None else head_cfg.hidden_dropout_prob)
+        else:
+            head_cfg['classifier_dropout'] = head_cfg.hidden_dropout_prob
+        head_cfg['num_labels'] = self.config.num_labels
+        return head_cfg
 
-    def forward(
-            self,
-            **input: Dict[str,
-                          Any]) -> AttentionTokenClassificationModelOutput:
-        labels = None
-        if OutputKeys.LABEL in input:
-            labels = input.pop(OutputKeys.LABEL)
-        elif OutputKeys.LABELS in input:
-            labels = input.pop(OutputKeys.LABELS)
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,
+                inputs_embeds=None,
+                labels=None,
+                output_attentions=None,
+                output_hidden_states=None,
+                return_dict=None,
+                offset_mapping=None,
+                label_mask=None,
+                **kwargs):
+        kwargs['offset_mapping'] = offset_mapping
+        kwargs['label_mask'] = label_mask
+        outputs = super().forward(input_ids, attention_mask, token_type_ids,
+                                  position_ids, head_mask, inputs_embeds,
+                                  labels, output_attentions,
+                                  output_hidden_states, **kwargs)
 
-        outputs = super().forward(input)
-        sequence_output = outputs[0]
-        logits = self.head.forward(sequence_output)
-        loss = None
-        if labels in input:
-            loss = self.compute_loss(outputs, labels)
+        outputs.offset_mapping = offset_mapping
+        outputs.label_mask = label_mask
+
+        return outputs
+
+
+@MODELS.register_module(Tasks.transformer_crf, module_name=Models.tcrf)
+@MODELS.register_module(Tasks.token_classification, module_name=Models.tcrf)
+@MODELS.register_module(
+    Tasks.token_classification, module_name=Models.tcrf_wseg)
+@MODELS.register_module(
+    Tasks.named_entity_recognition, module_name=Models.tcrf)
+@MODELS.register_module(Tasks.part_of_speech, module_name=Models.tcrf)
+@MODELS.register_module(Tasks.word_segmentation, module_name=Models.tcrf)
+@MODELS.register_module(Tasks.word_segmentation, module_name=Models.tcrf_wseg)
+class ModelForTokenClassificationWithCRF(ModelForTokenClassification):
+    head_type = Heads.transformer_crf
+    base_model_prefix = 'encoder'
+
+    def postprocess(self, inputs, **kwargs):
+        predicts = self.head.decode(inputs['logits'], inputs['label_mask'])
+        offset_mapping = inputs['offset_mapping']
+        mask = inputs['label_mask']
+
+        # revert predicts to original position with respect of label mask
+        masked_predict = torch.zeros_like(predicts)
+        for i in range(len(mask)):
+            masked_lengths = mask[i].sum(-1).long().cpu().item()
+            selected_predicts = torch.narrow(
+                predicts[i], 0, 0,
+                masked_lengths)  # index_select only move loc, not resize
+            mask_position = mask[i].bool()
+            masked_predict[i][mask_position] = selected_predicts
+        predicts = masked_predict
 
         return AttentionTokenClassificationModelOutput(
-            loss=loss,
-            logits=logits,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-            offset_mapping=input.get('offset_mapping'),
-            label_mask=input.get('label_mask'))
-
-    def extract_logits(self, outputs):
-        return outputs[OutputKeys.LOGITS].cpu().detach()
+            loss=None,
+            logits=None,
+            hidden_states=None,
+            attentions=None,
+            label_mask=mask,
+            offset_mapping=offset_mapping,
+            predictions=predicts,
+        )
diff --git a/modelscope/models/nlp/unite/modeling_unite.py b/modelscope/models/nlp/unite/modeling_unite.py
index 6969e0c7..deea737d 100644
--- a/modelscope/models/nlp/unite/modeling_unite.py
+++ b/modelscope/models/nlp/unite/modeling_unite.py
@@ -1,9 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 """PyTorch UniTE model."""
 
-import math
 import warnings
 from dataclasses import dataclass
+from math import ceil
 from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
@@ -279,8 +279,8 @@ class UniTEForTranslationEvaluation(TorchModel):
         pred = self.estimator(mix_states)
         return pred.squeeze(dim=-1)
 
-    def load_checkpoint(self, path: str):
-        state_dict = torch.load(path)
+    def load_checkpoint(self, path: str, device: torch.device):
+        state_dict = torch.load(path, map_location=device)
         self.load_state_dict(state_dict)
         logger.info('Loading checkpoint parameters from %s' % path)
         return
diff --git a/modelscope/models/nlp/use/user_satisfaction_estimation.py b/modelscope/models/nlp/use/user_satisfaction_estimation.py
index 9fe47b74..f006f544 100644
--- a/modelscope/models/nlp/use/user_satisfaction_estimation.py
+++ b/modelscope/models/nlp/use/user_satisfaction_estimation.py
@@ -66,7 +66,9 @@ class UserSatisfactionEstimation(TorchModel):
            input_ids (Tensor): the preprocessed dialogue input
         Returns:
            output (Dict[str, Any] or DialogueUserSatisfactionEstimationModelOutput): The results of user satisfaction.
-           Example: {'logits': tensor([[-2.1795,  1.1323,  1.8605]])}
+
+        Example:
+            >>> {'logits': tensor([[-2.1795,  1.1323,  1.8605]])}
         """
         logits = self.model(input_ids)
         return DialogueUserSatisfactionEstimationModelOutput(logits=logits)
diff --git a/modelscope/models/nlp/xlm_roberta/__init__.py b/modelscope/models/nlp/xlm_roberta/__init__.py
new file mode 100644
index 00000000..c012da59
--- /dev/null
+++ b/modelscope/models/nlp/xlm_roberta/__init__.py
@@ -0,0 +1,37 @@
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .configuration import XLMRobertaConfig
+    from .backbone import XLMRobertaModel
+else:
+    _import_structure = {
+        'configuration': ['XLMRobertaConfig'],
+        'backbone': ['XLMRobertaModel'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/nlp/xlm_roberta/backbone.py b/modelscope/models/nlp/xlm_roberta/backbone.py
new file mode 100644
index 00000000..d1a7cf16
--- /dev/null
+++ b/modelscope/models/nlp/xlm_roberta/backbone.py
@@ -0,0 +1,976 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch XLM-RoBERTa model."""
+
+import math
+
+import torch
+import torch.utils.checkpoint
+from packaging import version
+from torch import nn
+from transformers.activations import ACT2FN
+from transformers.modeling_utils import (PreTrainedModel,
+                                         apply_chunking_to_forward,
+                                         find_pruneable_heads_and_indices,
+                                         prune_linear_layer)
+
+from modelscope.metainfo import Models
+from modelscope.models import Model, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.outputs import AttentionBackboneModelOutput
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+from modelscope.utils.nlp.utils import parse_labels_in_order
+from .configuration import XLMRobertaConfig
+
+logger = get_logger()
+
+_CONFIG_FOR_DOC = 'XLMRobertaConfig'
+
+
+# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
+def create_position_ids_from_input_ids(input_ids,
+                                       padding_idx,
+                                       past_key_values_length=0):
+    """
+    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+    are ignored. This is modified from fairseq's `utils.make_positions`.
+
+    Args:
+        x: torch.Tensor x:
+
+    Returns: torch.Tensor
+    """
+    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+    mask = input_ids.ne(padding_idx).int()
+    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask)
+                           + past_key_values_length) * mask
+    return incremental_indices.long() + padding_idx
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->XLMRoberta
+class XLMRobertaEmbeddings(nn.Module):
+    """
+    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
+    """
+
+    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(
+            config.vocab_size,
+            config.hidden_size,
+            padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
+                                                config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
+                                                  config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.position_embedding_type = getattr(config,
+                                               'position_embedding_type',
+                                               'absolute')
+        self.register_buffer(
+            'position_ids',
+            torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            'token_type_ids',
+            torch.zeros(self.position_ids.size(), dtype=torch.long),
+            persistent=False)
+
+        # End copy
+        self.padding_idx = config.pad_token_id
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings,
+            config.hidden_size,
+            padding_idx=self.padding_idx)
+
+    def forward(self,
+                input_ids=None,
+                token_type_ids=None,
+                position_ids=None,
+                inputs_embeds=None,
+                past_key_values_length=0):
+        if position_ids is None:
+            if input_ids is not None:
+                # Create the position ids from the input token ids. Any padded tokens remain padded.
+                position_ids = create_position_ids_from_input_ids(
+                    input_ids, self.padding_idx, past_key_values_length)
+            else:
+                position_ids = self.create_position_ids_from_inputs_embeds(
+                    inputs_embeds)
+
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+
+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when its auto-generated, registered buffer helps users when tracing the model without
+        # passing token_type_ids, solves
+        # issue #5664
+        if token_type_ids is None:
+            if hasattr(self, 'token_type_ids'):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(
+                    input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(
+                    input_shape,
+                    dtype=torch.long,
+                    device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings
+        if self.position_embedding_type == 'absolute':
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
+        """
+        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+        Args:
+            inputs_embeds: torch.Tensor
+
+        Returns: torch.Tensor
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(
+            self.padding_idx + 1,
+            sequence_length + self.padding_idx + 1,
+            dtype=torch.long,
+            device=inputs_embeds.device)
+        return position_ids.unsqueeze(0).expand(input_shape)
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->XLMRoberta
+class XLMRobertaSelfAttention(nn.Module):
+
+    def __init__(self, config, position_embedding_type=None):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
+                config, 'embedding_size'):
+            raise ValueError(
+                f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention '
+                f'heads ({config.num_attention_heads})')
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size
+                                       / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = position_embedding_type or getattr(
+            config, 'position_embedding_type', 'absolute')
+        if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query':
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(
+                2 * config.max_position_embeddings - 1,
+                self.attention_head_size)
+
+        self.is_decoder = config.is_decoder
+
+    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads,
+                                       self.attention_head_size)
+        x = x.view(new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        mixed_query_layer = self.query(hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_layer = past_key_value[0]
+            value_layer = past_key_value[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            key_layer = self.transpose_for_scores(
+                self.key(encoder_hidden_states))
+            value_layer = self.transpose_for_scores(
+                self.value(encoder_hidden_states))
+            attention_mask = encoder_attention_mask
+        elif past_key_value is not None:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
+            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
+        else:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        use_cache = past_key_value is not None
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_layer, value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer,
+                                        key_layer.transpose(-1, -2))
+
+        if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query':
+            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
+            if use_cache:
+                position_ids_l = torch.tensor(
+                    key_length - 1,
+                    dtype=torch.long,
+                    device=hidden_states.device).view(-1, 1)
+            else:
+                position_ids_l = torch.arange(
+                    query_length,
+                    dtype=torch.long,
+                    device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(
+                key_length, dtype=torch.long,
+                device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+
+            positional_embedding = self.distance_embedding(
+                distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(
+                dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == 'relative_key':
+                relative_position_scores = torch.einsum(
+                    'bhld,lrd->bhlr', query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == 'relative_key_query':
+                relative_position_scores_query = torch.einsum(
+                    'bhld,lrd->bhlr', query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum(
+                    'bhrd,lrd->bhlr', key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(
+            self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in XLMRobertaModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (
+            self.all_head_size, )
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        outputs = (context_layer,
+                   attention_probs) if output_attentions else (context_layer, )
+
+        if self.is_decoder:
+            outputs = outputs + (past_key_value, )
+        return outputs
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfOutput
+class XLMRobertaSelfOutput(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor,
+                input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->XLMRoberta
+class XLMRobertaAttention(nn.Module):
+
+    def __init__(self, config, position_embedding_type=None):
+        super().__init__()
+        self.self = XLMRobertaSelfAttention(
+            config, position_embedding_type=position_embedding_type)
+        self.output = XLMRobertaSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads,
+            self.self.attention_head_size, self.pruned_heads)
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(
+            heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            past_key_value,
+            output_attentions,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,
+                   ) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaIntermediate
+class XLMRobertaIntermediate(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaOutput
+class XLMRobertaOutput(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor,
+                input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaLayer with Roberta->XLMRoberta
+class XLMRobertaLayer(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = XLMRobertaAttention(config)
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            if not self.is_decoder:
+                raise ValueError(
+                    f'{self} should be used as a decoder model if cross attention is added'
+                )
+            self.crossattention = XLMRobertaAttention(
+                config, position_embedding_type='absolute')
+        self.intermediate = XLMRobertaIntermediate(config)
+        self.output = XLMRobertaOutput(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:
+                                                  2] if past_key_value is not None else None
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            past_key_value=self_attn_past_key_value,
+        )
+        attention_output = self_attention_outputs[0]
+
+        # if decoder, the last output is tuple of self-attn cache
+        if self.is_decoder:
+            outputs = self_attention_outputs[1:-1]
+            present_key_value = self_attention_outputs[-1]
+        else:
+            outputs = self_attention_outputs[
+                1:]  # add self attentions if we output attention weights
+
+        cross_attn_present_key_value = None
+        if self.is_decoder and encoder_hidden_states is not None:
+            if not hasattr(self, 'crossattention'):
+                raise ValueError(
+                    f'If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers'
+                    ' by setting `config.add_cross_attention=True`')
+
+            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
+            cross_attn_past_key_value = past_key_value[
+                -2:] if past_key_value is not None else None
+            cross_attention_outputs = self.crossattention(
+                attention_output,
+                attention_mask,
+                head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                cross_attn_past_key_value,
+                output_attentions,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[
+                1:-1]  # add cross attentions if we output attention weights
+
+            # add cross-attn cache to positions 3,4 of present_key_value tuple
+            cross_attn_present_key_value = cross_attention_outputs[-1]
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        layer_output = apply_chunking_to_forward(self.feed_forward_chunk,
+                                                 self.chunk_size_feed_forward,
+                                                 self.seq_len_dim,
+                                                 attention_output)
+        outputs = (layer_output, ) + outputs
+
+        # if decoder, return the attn key/values as the last output
+        if self.is_decoder:
+            outputs = outputs + (present_key_value, )
+
+        return outputs
+
+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaEncoder with Roberta->XLMRoberta
+class XLMRobertaEncoder(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList(
+            [XLMRobertaLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = (
+        ) if output_attentions and self.config.add_cross_attention else None
+
+        next_decoder_cache = () if use_cache else None
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states, )
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+            past_key_value = past_key_values[
+                i] if past_key_values is not None else None
+
+            if self.gradient_checkpointing and self.training:
+
+                if use_cache:
+                    logger.warning(
+                        '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
+                    )
+                    use_cache = False
+
+                def create_custom_forward(module):
+
+                    def custom_forward(*inputs):
+                        return module(*inputs, past_key_value,
+                                      output_attentions)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(layer_module),
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                )
+            else:
+                layer_outputs = layer_module(
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    past_key_value,
+                    output_attentions,
+                )
+
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1], )
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (
+                    layer_outputs[1], )
+                if self.config.add_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (
+                        layer_outputs[2], )
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states, )
+
+        if not return_dict:
+            return tuple(v for v in [
+                hidden_states,
+                next_decoder_cache,
+                all_hidden_states,
+                all_self_attentions,
+                all_cross_attentions,
+            ] if v is not None)
+        return AttentionBackboneModelOutput(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaPooler
+class XLMRobertaPooler(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaPreTrainedModel with Roberta->XLMRoberta
+class XLMRobertaPreTrainedModel(TorchModel, PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = XLMRobertaConfig
+    base_model_prefix = 'roberta'
+    supports_gradient_checkpointing = True
+    _no_split_modules = []
+
+    # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(
+                mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(
+                mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, XLMRobertaEncoder):
+            module.gradient_checkpointing = value
+
+    def update_keys_to_ignore(self, config, del_keys_to_ignore):
+        """Remove some keys from ignore list"""
+        if not config.tie_word_embeddings:
+            # must make a new list, or the class variable gets modified!
+            self._keys_to_ignore_on_save = [
+                k for k in self._keys_to_ignore_on_save
+                if k not in del_keys_to_ignore
+            ]
+            self._keys_to_ignore_on_load_missing = [
+                k for k in self._keys_to_ignore_on_load_missing
+                if k not in del_keys_to_ignore
+            ]
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        """Instantiate the model.
+
+        Args:
+            kwargs: Input args.
+                    model_dir: The model dir used to load the checkpoint and the label information.
+                    num_labels: An optional arg to tell the model how many classes to initialize.
+                                    Method will call utils.parse_label_mapping if num_labels not supplied.
+                                    If num_labels is not found, the model will use the default setting (2 classes).
+
+        Returns:
+            The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained
+        """
+
+        model_dir = kwargs.pop('model_dir', None)
+        cfg = kwargs.pop('cfg', None)
+        model_args = parse_labels_in_order(model_dir, cfg, **kwargs)
+        if model_dir is None:
+            config = XLMRobertaConfig(**model_args)
+            model = cls(config)
+        else:
+            model = super(Model, cls).from_pretrained(
+                pretrained_model_name_or_path=model_dir, **model_args)
+        model.model_dir = model_dir
+        return model
+
+
+@MODELS.register_module(
+    group_key=Tasks.backbone, module_name=Models.xlm_roberta)
+# Copied from transformers.models.roberta.modeling_roberta.RobertaModel with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
+class XLMRobertaModel(XLMRobertaPreTrainedModel):
+    """
+
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
+    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
+    Kaiser and Illia Polosukhin.
+
+    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
+    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
+    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
+
+    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
+
+    """
+
+    _keys_to_ignore_on_load_missing = [r'position_ids']
+
+    # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->XLMRoberta
+    def __init__(self, config, add_pooling_layer=True):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = XLMRobertaEmbeddings(config)
+        self.encoder = XLMRobertaEncoder(config)
+
+        self.pooler = XLMRobertaPooler(config) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    # Copied from transformers.models.bert.modeling_bert.BertModel.forward
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+        input_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`BertTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
+            for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.FloatTensor` of shape `((batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask
+            values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the
+            inputs. Indices are selected in `[0, 1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position
+            embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers,
+        num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask
+            values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`torch.FloatTensor` of shape `((batch_size, sequence_length, hidden_size)`,
+        *optional*):
+            Optionally, instead of passing `input_ids` you can choose to
+            directly pass an embedded representation. This is useful if you want
+            more control over how to convert `input_ids` indices into associated
+            vectors than the model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention
+            layers. See `attentions` under returned tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See
+            `hidden_states` under returned tensors for more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a
+            plain tuple.
+        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors
+            of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else
+            self.config.output_hidden_states)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if self.config.is_decoder:
+            use_cache = use_cache if use_cache is not None else self.config.use_cache
+        else:
+            use_cache = False
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                'You cannot specify both input_ids and inputs_embeds at the same time'
+            )
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError(
+                'You have to specify either input_ids or inputs_embeds')
+
+        batch_size, seq_length = input_shape
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        # past_key_values_length
+        past_key_values_length = past_key_values[0][0].shape[
+            2] if past_key_values is not None else 0
+
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                ((batch_size, seq_length + past_key_values_length)),
+                device=device)
+
+        if token_type_ids is None:
+            if hasattr(self.embeddings, 'token_type_ids'):
+                buffered_token_type_ids = self.embeddings.token_type_ids[:, :
+                                                                         seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(
+                    batch_size, seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(
+                    input_shape, dtype=torch.long, device=device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
+            attention_mask, input_shape)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size(
+            )
+            encoder_hidden_shape = (encoder_batch_size,
+                                    encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(
+                    encoder_hidden_shape, device=device)
+            encoder_extended_attention_mask = self.invert_attention_mask(
+                encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask,
+                                       self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(
+            sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return AttentionBackboneModelOutput(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
diff --git a/modelscope/models/nlp/xlm_roberta/configuration.py b/modelscope/models/nlp/xlm_roberta/configuration.py
new file mode 100644
index 00000000..3a245945
--- /dev/null
+++ b/modelscope/models/nlp/xlm_roberta/configuration.py
@@ -0,0 +1,152 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" XLM-RoBERTa configuration"""
+from collections import OrderedDict
+from typing import Mapping
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.onnx import OnnxConfig
+
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class XLMRobertaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`XLMRobertaModel`] or a [`TFXLMRobertaModel`]. It
+    is used to instantiate a XLM-RoBERTa model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the XLMRoBERTa
+    [xlm-roberta-base](https://huggingface.co/xlm-roberta-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the XLM-RoBERTa model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`XLMRobertaModel`] or [`TFXLMRobertaModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`XLMRobertaModel`] or
+            [`TFXLMRobertaModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+
+    Examples:
+
+    ```python
+    >>> from transformers import XLMRobertaConfig, XLMRobertaModel
+
+    >>> # Initializing a XLM-RoBERTa xlm-roberta-base style configuration
+    >>> configuration = XLMRobertaConfig()
+
+    >>> # Initializing a model (with random weights) from the xlm-roberta-base style configuration
+    >>> model = XLMRobertaModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = 'xlm-roberta'
+
+    def __init__(self,
+                 vocab_size=30522,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act='gelu',
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.02,
+                 layer_norm_eps=1e-12,
+                 pad_token_id=1,
+                 bos_token_id=0,
+                 eos_token_id=2,
+                 position_embedding_type='absolute',
+                 use_cache=True,
+                 classifier_dropout=None,
+                 **kwargs):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+
+
+# Copied from transformers.models.roberta.configuration_roberta.RobertaOnnxConfig with Roberta->XLMRoberta
+class XLMRobertaOnnxConfig(OnnxConfig):
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == 'multiple-choice':
+            dynamic_axis = {0: 'batch', 1: 'choice', 2: 'sequence'}
+        else:
+            dynamic_axis = {0: 'batch', 1: 'sequence'}
+        return OrderedDict([
+            ('input_ids', dynamic_axis),
+            ('attention_mask', dynamic_axis),
+        ])
diff --git a/modelscope/msdatasets/audio/__init__.py b/modelscope/msdatasets/audio/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/msdatasets/audio/asr_dataset.py b/modelscope/msdatasets/audio/asr_dataset.py
new file mode 100644
index 00000000..c0696615
--- /dev/null
+++ b/modelscope/msdatasets/audio/asr_dataset.py
@@ -0,0 +1,48 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+
+from modelscope.msdatasets.ms_dataset import MsDataset
+
+
+class ASRDataset(MsDataset):
+    """ASR dataset for speech recognition.
+    support load dataset from msdataset hub or local data_dir (including wav.scp and text)
+    For more details, please refer to
+        https://github.com/alibaba-damo-academy/FunASR/blob/main/funasr/datasets/ms_dataset.py.
+    """
+
+    @classmethod
+    def load_core(cls, data_dir, data_set):
+        wav_file = os.path.join(data_dir, data_set, 'wav.scp')
+        text_file = os.path.join(data_dir, data_set, 'text')
+        with open(wav_file) as f:
+            wav_lines = f.readlines()
+        with open(text_file) as f:
+            text_lines = f.readlines()
+        data_list = []
+        for wav_line, text_line in zip(wav_lines, text_lines):
+            item = {}
+            item['Audio:FILE'] = wav_line.strip().split()[-1]
+            item['Text:LABEL'] = ' '.join(text_line.strip().split()[1:])
+            data_list.append(item)
+        return data_list
+
+    @classmethod
+    def load(cls,
+             dataset_name,
+             namespace='speech_asr',
+             train_set='train',
+             dev_set='validation'):
+        if os.path.exists(dataset_name):
+            data_dir = dataset_name
+            ds_dict = {}
+            ds_dict['train'] = cls.load_core(data_dir, train_set)
+            ds_dict['validation'] = cls.load_core(data_dir, dev_set)
+            ds_dict['raw_data_dir'] = data_dir
+            return ds_dict
+        else:
+            from modelscope.msdatasets import MsDataset
+            ds_dict = MsDataset.load(
+                dataset_name=dataset_name, namespace=namespace)
+            return ds_dict
diff --git a/modelscope/msdatasets/cv/object_detection/__init__.py b/modelscope/msdatasets/cv/object_detection/__init__.py
index 30af2d9b..403163e9 100644
--- a/modelscope/msdatasets/cv/object_detection/__init__.py
+++ b/modelscope/msdatasets/cv/object_detection/__init__.py
@@ -4,11 +4,11 @@ from typing import TYPE_CHECKING
 from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
-    from .easycv_detection import DetDataset, DetImagesMixDataset
+    from .detection_dataset import DetDataset, DetImagesMixDataset
 
 else:
     _import_structure = {
-        'easycv_detection': ['DetDataset', 'DetImagesMixDataset']
+        'detection_dataset': ['DetDataset', 'DetImagesMixDataset']
     }
 
     import sys
diff --git a/modelscope/msdatasets/dataset_cls/dataset.py b/modelscope/msdatasets/dataset_cls/dataset.py
index 49313e90..57ee8150 100644
--- a/modelscope/msdatasets/dataset_cls/dataset.py
+++ b/modelscope/msdatasets/dataset_cls/dataset.py
@@ -4,7 +4,6 @@ import copy
 import os
 
 import datasets
-import torchaudio
 from datasets import IterableDataset
 from PIL import Image
 
@@ -94,6 +93,7 @@ class NativeIterableDataset(IterableDataset):
                         if k.endswith('Image:FILE'):
                             ret[k + ':Object'] = Image.open(fp=ex_cache_path)
                         if k.endswith('Audio:FILE'):
+                            import torchaudio
                             waveform_and_rate = torchaudio.load(ex_cache_path)
                             ret[k + ':Object'] = waveform_and_rate
                 entity = ret
diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index dc0e1e48..9f34186c 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -6,7 +6,6 @@ from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional,
                     Sequence, Union)
 
 import numpy as np
-import torch
 from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
 from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES
 from datasets.utils.file_utils import is_relative_path
@@ -43,42 +42,6 @@ def format_list(para) -> List:
     return para
 
 
-class MsMapDataset(torch.utils.data.Dataset):
-
-    def __init__(self, dataset: Iterable, preprocessor_list, retained_columns,
-                 columns, to_tensor):
-        super(MsDataset).__init__()
-        self.dataset = dataset
-        self.preprocessor_list = preprocessor_list
-        self.to_tensor = to_tensor
-        self.retained_columns = retained_columns
-        self.columns = columns
-
-    def __len__(self):
-        return len(self.dataset)
-
-    def type_converter(self, x):
-        if self.to_tensor:
-            return torch.tensor(x)
-        else:
-            return x
-
-    def __getitem__(self, index):
-        item_dict = self.dataset[index]
-        res = {
-            k: self.type_converter(item_dict[k])
-            for k in self.columns
-            if (not self.to_tensor) or k in self.retained_columns
-        }
-        for preprocessor in self.preprocessor_list:
-            res.update({
-                k: self.type_converter(v)
-                for k, v in preprocessor(item_dict).items()
-                if (not self.to_tensor) or k in self.retained_columns
-            })
-        return res
-
-
 class MsDataset:
     """
     ModelScope Dataset (aka, MsDataset) is backed by a huggingface Dataset to
@@ -206,9 +169,9 @@ class MsDataset:
 
             Args:
                 dataset_name (str): Path or name of the dataset.
-                                    The form of `namespace/dataset_name` is also supported.
+                    The form of `namespace/dataset_name` is also supported.
                 namespace(str, optional): Namespace of the dataset. It should not be None if you load a remote dataset
-                from Hubs.modelscope,
+                    from Hubs.modelscope,
                 namespace (str, optional):
                     Namespace of the dataset. It should not be None if you load a remote dataset
                     from Hubs.modelscope,
@@ -303,6 +266,7 @@ class MsDataset:
         columns: Union[str, List[str]] = None,
         to_tensor: bool = True,
     ):
+        import torch
         preprocessor_list = preprocessors if isinstance(
             preprocessors, list) else [preprocessors]
 
@@ -332,6 +296,42 @@ class MsDataset:
                     continue
                 retained_columns.append(k)
 
+        class MsMapDataset(torch.utils.data.Dataset):
+
+            def __init__(self, dataset: Iterable, preprocessor_list,
+                         retained_columns, columns, to_tensor):
+                super(MsDataset).__init__()
+                self.dataset = dataset
+                self.preprocessor_list = preprocessor_list
+                self.to_tensor = to_tensor
+                self.retained_columns = retained_columns
+                self.columns = columns
+
+            def __len__(self):
+                return len(self.dataset)
+
+            def type_converter(self, x):
+                import torch
+                if self.to_tensor:
+                    return torch.tensor(x)
+                else:
+                    return x
+
+            def __getitem__(self, index):
+                item_dict = self.dataset[index]
+                res = {
+                    k: self.type_converter(item_dict[k])
+                    for k in self.columns
+                    if (not self.to_tensor) or k in self.retained_columns
+                }
+                for preprocessor in self.preprocessor_list:
+                    res.update({
+                        k: self.type_converter(v)
+                        for k, v in preprocessor(item_dict).items()
+                        if (not self.to_tensor) or k in self.retained_columns
+                    })
+                return res
+
         return MsMapDataset(self._hf_ds, preprocessor_list, retained_columns,
                             columns, to_tensor)
 
diff --git a/modelscope/msdatasets/task_datasets/__init__.py b/modelscope/msdatasets/task_datasets/__init__.py
index 3494c8da..167af6db 100644
--- a/modelscope/msdatasets/task_datasets/__init__.py
+++ b/modelscope/msdatasets/task_datasets/__init__.py
@@ -15,6 +15,7 @@ if TYPE_CHECKING:
     from .image_inpainting import ImageInpaintingDataset
     from .text_ranking_dataset import TextRankingDataset
     from .referring_video_object_segmentation import ReferringVideoObjectSegmentationDataset
+    from .bad_image_detecting import BadImageDetectingDataset
 
 else:
     _import_structure = {
@@ -35,6 +36,7 @@ else:
         ['ImagePortraitEnhancementDataset'],
         'referring_video_object_segmentation':
         ['ReferringVideoObjectSegmentationDataset'],
+        'bad_image_detecting': ['BadImageDetectingDataset'],
     }
     import sys
 
diff --git a/modelscope/msdatasets/task_datasets/bad_image_detecting/__init__.py b/modelscope/msdatasets/task_datasets/bad_image_detecting/__init__.py
new file mode 100644
index 00000000..e8439865
--- /dev/null
+++ b/modelscope/msdatasets/task_datasets/bad_image_detecting/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .bad_image_detecting_dataset import BadImageDetectingDataset
+
+else:
+    _import_structure = {
+        'bad_image_detecting_dataset': ['BadImageDetectingDataset'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/msdatasets/task_datasets/bad_image_detecting/bad_image_detecting_dataset.py b/modelscope/msdatasets/task_datasets/bad_image_detecting/bad_image_detecting_dataset.py
new file mode 100644
index 00000000..91ef5d13
--- /dev/null
+++ b/modelscope/msdatasets/task_datasets/bad_image_detecting/bad_image_detecting_dataset.py
@@ -0,0 +1,39 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import cv2
+import numpy as np
+
+from modelscope.metainfo import Models
+from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
+from modelscope.msdatasets.task_datasets.torch_base_dataset import \
+    TorchTaskDataset
+from modelscope.preprocessors import LoadImage
+from modelscope.preprocessors.cv.bad_image_preprocessor import \
+    BadImageDetectingPreprocessor
+from modelscope.utils.constant import Tasks
+
+
+@TASK_DATASETS.register_module(
+    Tasks.bad_image_detecting, module_name=Models.bad_image_detecting)
+class BadImageDetectingDataset(TorchTaskDataset):
+    """Paired image dataset for bad image detecting.
+    """
+
+    def __init__(self, dataset, opt):
+        self.dataset = dataset
+        self.opt = opt
+        self.preprocessor = BadImageDetectingPreprocessor()
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def __getitem__(self, index):
+
+        # Load input video paths.
+        item_dict = self.dataset[index]
+        iterm_label = item_dict['label']
+
+        img = LoadImage.convert_to_ndarray(input)
+        img = self.preprocessor(img)
+
+        return {'input': img['input'], 'target': iterm_label}
diff --git a/modelscope/msdatasets/task_datasets/image_quality_assessment_degradation/__init__.py b/modelscope/msdatasets/task_datasets/image_quality_assessment_degradation/__init__.py
new file mode 100644
index 00000000..07e66f2f
--- /dev/null
+++ b/modelscope/msdatasets/task_datasets/image_quality_assessment_degradation/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .image_quality_assessment_degradation_dataset import ImageQualityAssessmentDegradationDataset
+
+else:
+    _import_structure = {
+        'image_quality_assessment_degradation_dataset':
+        ['ImageQualityAssessmentDegradationDataset'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/msdatasets/task_datasets/image_quality_assessment_degradation/image_quality_assessment_degradation_dataset.py b/modelscope/msdatasets/task_datasets/image_quality_assessment_degradation/image_quality_assessment_degradation_dataset.py
new file mode 100644
index 00000000..75826065
--- /dev/null
+++ b/modelscope/msdatasets/task_datasets/image_quality_assessment_degradation/image_quality_assessment_degradation_dataset.py
@@ -0,0 +1,48 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import cv2
+import numpy as np
+from torchvision import transforms
+
+from modelscope.metainfo import Models
+from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
+from modelscope.msdatasets.task_datasets.torch_base_dataset import \
+    TorchTaskDataset
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+
+
+@TASK_DATASETS.register_module(
+    Tasks.image_quality_assessment_degradation,
+    module_name=Models.image_quality_assessment_degradation)
+class ImageQualityAssessmentDegradationDataset(TorchTaskDataset):
+    """Paired image dataset for image quality assessment degradation.
+    """
+
+    def __init__(self, dataset):
+        self.dataset = dataset
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def __getitem__(self, index):
+
+        # Load input video paths.
+        item_dict = self.dataset[index]
+        item_id = item_dict['image:FILE'].split('/')[-1].split('_')[0]
+        item_degree = item_dict['degree']
+        item_distortion_type = '%02d' % item_dict['degradation_category']
+
+        img = LoadImage.convert_to_img(item_dict['image:FILE'])
+        w, h = img.size
+        if h * w < 1280 * 720:
+            img = transforms.functional.resize(img, 720)
+        test_transforms = transforms.Compose([transforms.ToTensor()])
+        img = test_transforms(img)
+
+        return {
+            'input': img,
+            'item_id': item_id,
+            'target': item_degree,
+            'distortion_type': item_distortion_type
+        }
diff --git a/modelscope/msdatasets/task_datasets/image_quality_assmessment_mos/__init__.py b/modelscope/msdatasets/task_datasets/image_quality_assmessment_mos/__init__.py
new file mode 100644
index 00000000..d66f3a04
--- /dev/null
+++ b/modelscope/msdatasets/task_datasets/image_quality_assmessment_mos/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .image_quality_assessment_mos_dataset import ImageQualityAssessmentMosDataset
+
+else:
+    _import_structure = {
+        'image_quality_assessment_mos_dataset':
+        ['ImageQualityAssessmentMosDataset'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/msdatasets/task_datasets/image_quality_assmessment_mos/image_quality_assessment_mos_dataset.py b/modelscope/msdatasets/task_datasets/image_quality_assmessment_mos/image_quality_assessment_mos_dataset.py
new file mode 100644
index 00000000..3d8ed297
--- /dev/null
+++ b/modelscope/msdatasets/task_datasets/image_quality_assmessment_mos/image_quality_assessment_mos_dataset.py
@@ -0,0 +1,41 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import cv2
+import numpy as np
+
+from modelscope.metainfo import Models
+from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
+from modelscope.msdatasets.task_datasets.torch_base_dataset import \
+    TorchTaskDataset
+from modelscope.preprocessors.cv import ImageQualityAssessmentMosPreprocessor
+from modelscope.utils.constant import Tasks
+
+
+@TASK_DATASETS.register_module(
+    Tasks.image_quality_assessment_mos,
+    module_name=Models.image_quality_assessment_mos)
+class ImageQualityAssessmentMosDataset(TorchTaskDataset):
+    """Paired image dataset for image quality assessment mos.
+    """
+
+    def __init__(self,
+                 dataset,
+                 opt,
+                 preprocessor=ImageQualityAssessmentMosPreprocessor()):
+        self.preprocessor = preprocessor
+        self.dataset = dataset
+        self.opt = opt
+        self.scale = 0.2
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def __getitem__(self, index):
+
+        # Load input video paths.
+        item_dict = self.dataset[index]
+        iterm_mos = float(item_dict['mos']) * self.scale
+
+        img = self.preprocessor(item_dict['image:FILE'])
+
+        return {'input': img['input'].squeeze(0), 'target': iterm_mos}
diff --git a/modelscope/msdatasets/task_datasets/reds_image_deblurring_dataset.py b/modelscope/msdatasets/task_datasets/reds_image_deblurring_dataset.py
new file mode 100644
index 00000000..17b731bc
--- /dev/null
+++ b/modelscope/msdatasets/task_datasets/reds_image_deblurring_dataset.py
@@ -0,0 +1,60 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import cv2
+import numpy as np
+
+from modelscope.metainfo import Datasets
+from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
+from modelscope.msdatasets.task_datasets.sidd_image_denoising.data_utils import (
+    img2tensor, padding)
+from modelscope.msdatasets.task_datasets.sidd_image_denoising.transforms import (
+    augment, paired_random_crop)
+from modelscope.msdatasets.task_datasets.torch_base_dataset import \
+    TorchTaskDataset
+from modelscope.utils.constant import Tasks
+
+
+def default_loader(path):
+    return cv2.imread(path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 255.0
+
+
+@TASK_DATASETS.register_module(
+    Tasks.image_deblurring, module_name=Datasets.PairedDataset)
+class RedsImageDeblurringDataset(TorchTaskDataset):
+    """Paired image dataset for image restoration.
+    """
+
+    def __init__(self, dataset, opt, is_train):
+        self.dataset = dataset
+        self.opt = opt
+        self.is_train = is_train
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def __getitem__(self, index):
+        item_dict = self.dataset[index]
+        hq_path = item_dict['LQ Frame:FILE']
+        img_hq = default_loader(hq_path)
+        lq_path = item_dict['HQ Frame:FILE']
+        img_lq = default_loader(lq_path)
+
+        # augmentation for training
+        if self.is_train:
+            gt_size = self.opt.gt_size
+            # padding
+            img_hq, img_lq = padding(img_hq, img_lq, gt_size)
+
+            # random crop
+            img_hq, img_lq = paired_random_crop(
+                img_hq, img_lq, gt_size, scale=1)
+
+            # flip, rotation
+            img_hq, img_lq = augment([img_hq, img_lq], self.opt.use_flip,
+                                     self.opt.use_rot)
+
+        # BGR to RGB, HWC to CHW, numpy to tensor
+        img_hq, img_lq = img2tensor([img_hq, img_lq],
+                                    bgr2rgb=True,
+                                    float32=True)
+        return {'input': img_lq, 'target': img_hq}
diff --git a/modelscope/msdatasets/task_datasets/text_ranking_dataset.py b/modelscope/msdatasets/task_datasets/text_ranking_dataset.py
index 54276843..19f07110 100644
--- a/modelscope/msdatasets/task_datasets/text_ranking_dataset.py
+++ b/modelscope/msdatasets/task_datasets/text_ranking_dataset.py
@@ -17,6 +17,8 @@ from .torch_base_dataset import TorchTaskDataset
 
 @TASK_DATASETS.register_module(
     group_key=Tasks.text_ranking, module_name=Models.bert)
+@TASK_DATASETS.register_module(
+    group_key=Tasks.sentence_embedding, module_name=Models.bert)
 class TextRankingDataset(TorchTaskDataset):
 
     def __init__(self,
diff --git a/modelscope/ops/ailut/Ailut/__init__.py b/modelscope/ops/ailut/Ailut/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/ops/ailut/Ailut/csrc/__init__.py b/modelscope/ops/ailut/Ailut/csrc/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/ops/ailut/Ailut/csrc/ailut_transform.cpp b/modelscope/ops/ailut/Ailut/csrc/ailut_transform.cpp
new file mode 100644
index 00000000..6e82c369
--- /dev/null
+++ b/modelscope/ops/ailut/Ailut/csrc/ailut_transform.cpp
@@ -0,0 +1,220 @@
+#include <torch/extension.h>
+
+/* CUDA Forward Declarations */
+
+void LutTransformForwardCUDAKernelLauncher(
+    const torch::Tensor &input, const torch::Tensor &lut, torch::Tensor output);
+
+
+void LutTransformBackwardCUDAKernelLauncher(
+    const torch::Tensor &grad_output, const torch::Tensor &input,
+    const torch::Tensor &lut, torch::Tensor grad_inp, torch::Tensor grad_lut);
+
+
+void AiLutTransformForwardCUDAKernelLauncher(
+    const torch::Tensor &input, const torch::Tensor &lut,
+    const torch::Tensor &vertices, torch::Tensor output);
+
+
+void AiLutTransformBackwardCUDAKernelLauncher(
+    const torch::Tensor &grad_output, const torch::Tensor &input,
+    const torch::Tensor &lut, const torch::Tensor &vertices,
+    torch::Tensor grad_inp, torch::Tensor grad_lut, torch::Tensor grad_ver);
+
+
+void lut_transform_cuda_forward(
+    const torch::Tensor &input,
+    const torch::Tensor &lut,
+    torch::Tensor output) {
+
+    LutTransformForwardCUDAKernelLauncher(input, lut, output);
+}
+
+
+void lut_transform_cuda_backward(
+    const torch::Tensor &grad_output,
+    const torch::Tensor &input,
+    const torch::Tensor &lut,
+    torch::Tensor grad_inp,
+    torch::Tensor grad_lut) {
+
+    LutTransformBackwardCUDAKernelLauncher(
+        grad_output, input, lut, grad_inp, grad_lut);
+}
+
+
+void ailut_transform_cuda_forward(
+    const torch::Tensor &input,
+    const torch::Tensor &lut,
+    const torch::Tensor &vertices,
+    torch::Tensor output) {
+
+    AiLutTransformForwardCUDAKernelLauncher(input, lut, vertices, output);
+}
+
+
+void ailut_transform_cuda_backward(
+    const torch::Tensor &grad_output,
+    const torch::Tensor &input,
+    const torch::Tensor &lut,
+    const torch::Tensor &vertices,
+    torch::Tensor grad_inp,
+    torch::Tensor grad_lut,
+    torch::Tensor grad_ver) {
+
+    AiLutTransformBackwardCUDAKernelLauncher(
+        grad_output, input, lut, vertices, grad_inp, grad_lut, grad_ver);
+}
+
+
+void lut_transform_cpu_forward(
+    const torch::Tensor &input,
+    const torch::Tensor &lut,
+    torch::Tensor output);
+
+
+void lut_transform_cpu_backward(
+    const torch::Tensor &grad_output,
+    const torch::Tensor &input,
+    const torch::Tensor &lut,
+    torch::Tensor grad_inp,
+    torch::Tensor grad_lut);
+
+
+void ailut_transform_cpu_forward(
+    const torch::Tensor &input,
+    const torch::Tensor &lut,
+    const torch::Tensor &vertices,
+    torch::Tensor output);
+
+
+void ailut_transform_cpu_backward(
+    const torch::Tensor &grad_output,
+    const torch::Tensor &input,
+    const torch::Tensor &lut,
+    const torch::Tensor &vertices,
+    torch::Tensor grad_inp,
+    torch::Tensor grad_lut,
+    torch::Tensor grad_ver);
+
+
+/* C++ Interfaces */
+
+#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+
+
+void lut_transform_forward(
+    const torch::Tensor &input,
+    const torch::Tensor &lut,
+    torch::Tensor output) {
+
+    if (input.device().is_cuda()) {
+        CHECK_INPUT(input);
+        CHECK_INPUT(lut);
+        CHECK_INPUT(output);
+
+        lut_transform_cuda_forward(input, lut, output);
+    } else {
+        CHECK_CONTIGUOUS(input);
+        CHECK_CONTIGUOUS(lut);
+        CHECK_CONTIGUOUS(output);
+
+        lut_transform_cpu_forward(input, lut, output);
+    }
+}
+
+
+void lut_transform_backward(
+    const torch::Tensor &grad_output,
+    const torch::Tensor &input,
+    const torch::Tensor &lut,
+    torch::Tensor grad_inp,
+    torch::Tensor grad_lut) {
+
+    if (input.device().is_cuda()) {
+        CHECK_INPUT(grad_output);
+        CHECK_INPUT(input);
+        CHECK_INPUT(lut);
+        CHECK_INPUT(grad_inp);
+        CHECK_INPUT(grad_lut);
+
+        lut_transform_cuda_backward(grad_output, input, lut, grad_inp, grad_lut);
+    } else {
+        CHECK_CONTIGUOUS(grad_output);
+        CHECK_CONTIGUOUS(input);
+        CHECK_CONTIGUOUS(lut);
+        CHECK_CONTIGUOUS(grad_inp);
+        CHECK_CONTIGUOUS(grad_lut);
+
+        lut_transform_cpu_backward(grad_output, input, lut, grad_inp, grad_lut);
+    }
+}
+
+
+void ailut_transform_forward(
+    const torch::Tensor &input,
+    const torch::Tensor &lut,
+    const torch::Tensor &vertices,
+    torch::Tensor output) {
+
+    if (input.device().is_cuda()) {
+        CHECK_INPUT(input);
+        CHECK_INPUT(lut);
+        CHECK_INPUT(vertices);
+        CHECK_INPUT(output);
+
+        ailut_transform_cuda_forward(input, lut, vertices, output);
+    } else {
+        CHECK_CONTIGUOUS(input);
+        CHECK_CONTIGUOUS(lut);
+        CHECK_CONTIGUOUS(vertices);
+        CHECK_CONTIGUOUS(output);
+
+        ailut_transform_cpu_forward(input, lut, vertices, output);
+    }
+}
+
+
+void ailut_transform_backward(
+    const torch::Tensor &grad_output,
+    const torch::Tensor &input,
+    const torch::Tensor &lut,
+    const torch::Tensor &vertices,
+    torch::Tensor grad_inp,
+    torch::Tensor grad_lut,
+    torch::Tensor grad_ver) {
+
+    if (input.device().is_cuda()) {
+        CHECK_INPUT(grad_output);
+        CHECK_INPUT(input);
+        CHECK_INPUT(lut);
+        CHECK_INPUT(vertices);
+        CHECK_INPUT(grad_inp);
+        CHECK_INPUT(grad_lut);
+        CHECK_INPUT(grad_ver);
+
+        ailut_transform_cuda_backward(grad_output, input, lut, vertices, grad_inp, grad_lut, grad_ver);
+    } else {
+        CHECK_CONTIGUOUS(grad_output);
+        CHECK_CONTIGUOUS(input);
+        CHECK_CONTIGUOUS(lut);
+        CHECK_CONTIGUOUS(vertices);
+        CHECK_CONTIGUOUS(grad_inp);
+        CHECK_CONTIGUOUS(grad_lut);
+        CHECK_CONTIGUOUS(grad_ver);
+
+        ailut_transform_cpu_backward(grad_output, input, lut, vertices, grad_inp, grad_lut, grad_ver);
+    }
+}
+
+
+/* Interfaces Binding */
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("lut_cforward", &lut_transform_forward, "Lut-Transform forward");
+  m.def("lut_cbackward", &lut_transform_backward, "Lut-Transform backward");
+  m.def("ailut_cforward", &ailut_transform_forward, "AiLut-Transform forward");
+  m.def("ailut_cbackward", &ailut_transform_backward, "AiLut-Transform backward");
+}
diff --git a/modelscope/ops/ailut/Ailut/csrc/ailut_transform_cpu.cpp b/modelscope/ops/ailut/Ailut/csrc/ailut_transform_cpu.cpp
new file mode 100644
index 00000000..bfedc0f0
--- /dev/null
+++ b/modelscope/ops/ailut/Ailut/csrc/ailut_transform_cpu.cpp
@@ -0,0 +1,595 @@
+#include <torch/extension.h>
+
+#include <ATen/ATen.h>
+
+
+/* std::clamp is only available since c++17 */
+template <typename scalar_t>
+inline constexpr const scalar_t& clamp(
+    const scalar_t& v, const scalar_t& lo, const scalar_t& hi)
+{
+    return (v < lo) ? lo : ((v > hi) ? hi : v);
+}
+
+
+/* binary search on a sorted array to find and clamp the lower bound */
+template <typename scalar_t>
+inline int32_t lower_bound(
+        const scalar_t *data_ss,
+        int32_t start,
+        int32_t end,
+        scalar_t val) {
+
+    const int32_t ori_start = start;
+    const int32_t upper_bound = end - start - 2;
+    while (start < end) {
+        int64_t mid = start + ((end - start) >> 1);
+        if (!(data_ss[mid] >= val)) {
+            start = mid + 1;
+        }
+        else {
+            end = mid;
+        }
+    }
+    return clamp(start - ori_start - 1, 0, upper_bound);
+}
+
+
+template <typename scalar_t>
+void lut_transform_3d_cpu_forward_impl(
+        const int n,
+        const scalar_t* __restrict__ data_inp,
+        const scalar_t* __restrict__ data_lut,
+        const int height,
+        const int width,
+        const int stride_lut,
+        const int num_channels,
+        scalar_t* __restrict__ data_col) {
+
+    const scalar_t size_bin = 1.0 / (stride_lut - 1);
+
+    for (int index = 0; index < n; ++index) {
+
+        /* retrieve rgb value of the pixel */
+        const scalar_t r = data_inp[index];
+        const scalar_t g = data_inp[index + height * width];
+        const scalar_t b = data_inp[index + height * width * 2];
+
+        /* retrieve index of the interpolation verticess */
+        const int32_t rid = clamp((int32_t)floor(r * (stride_lut - 1)), 0, stride_lut - 2);
+        const int32_t gid = clamp((int32_t)floor(g * (stride_lut - 1)), 0, stride_lut - 2);
+        const int32_t bid = clamp((int32_t)floor(b * (stride_lut - 1)), 0, stride_lut - 2);
+
+        /* utility variables for indexing */
+        const int stride_lut_2 = stride_lut * stride_lut;
+        const int stride_lut_3 = stride_lut_2 * stride_lut;
+
+        /* retrieve the interpolation verticess (number of 8 in case of trilinear interpolation) */
+        const int id000 = (rid    ) + stride_lut * (gid    ) + stride_lut_2 * (bid    );
+        const int id100 = (rid + 1) + stride_lut * (gid    ) + stride_lut_2 * (bid    );
+        const int id010 = (rid    ) + stride_lut * (gid + 1) + stride_lut_2 * (bid    );
+        const int id110 = (rid + 1) + stride_lut * (gid + 1) + stride_lut_2 * (bid    );
+        const int id001 = (rid    ) + stride_lut * (gid    ) + stride_lut_2 * (bid + 1);
+        const int id101 = (rid + 1) + stride_lut * (gid    ) + stride_lut_2 * (bid + 1);
+        const int id011 = (rid    ) + stride_lut * (gid + 1) + stride_lut_2 * (bid + 1);
+        const int id111 = (rid + 1) + stride_lut * (gid + 1) + stride_lut_2 * (bid + 1);
+
+        /* compute interpolation weights */
+        const scalar_t rd = (r - size_bin * rid) / size_bin;
+        const scalar_t gd = (g - size_bin * gid) / size_bin;
+        const scalar_t bd = (b - size_bin * bid) / size_bin;
+
+        const scalar_t w000 = (1 - rd) * (1 - gd) * (1 - bd);
+        const scalar_t w100 = (    rd) * (1 - gd) * (1 - bd);
+        const scalar_t w010 = (1 - rd) * (    gd) * (1 - bd);
+        const scalar_t w110 = (    rd) * (    gd) * (1 - bd);
+        const scalar_t w001 = (1 - rd) * (1 - gd) * (    bd);
+        const scalar_t w101 = (    rd) * (1 - gd) * (    bd);
+        const scalar_t w011 = (1 - rd) * (    gd) * (    bd);
+        const scalar_t w111 = (    rd) * (    gd) * (    bd);
+
+        /* Execute the interpolation */
+        for (int i = 0; i < num_channels; ++i) {
+            data_col[index + height * width * i] =
+                w000 * data_lut[id000 + stride_lut_3 * i] + w100 * data_lut[id100 + stride_lut_3 * i] +
+                w010 * data_lut[id010 + stride_lut_3 * i] + w110 * data_lut[id110 + stride_lut_3 * i] +
+                w001 * data_lut[id001 + stride_lut_3 * i] + w101 * data_lut[id101 + stride_lut_3 * i] +
+                w011 * data_lut[id011 + stride_lut_3 * i] + w111 * data_lut[id111 + stride_lut_3 * i];
+        }
+    }
+}
+
+
+template <typename scalar_t>
+void lut_transform_3d_cpu_backward_impl(
+        const int n,
+        const scalar_t* __restrict__ grad_output,
+        const scalar_t* __restrict__ data_inp,
+        const scalar_t* __restrict__ data_lut,
+        const int height,
+        const int width,
+        const int stride_lut,
+        const int num_channels,
+        scalar_t* __restrict__ grad_inp,
+        scalar_t* __restrict__ grad_lut) {
+
+    const scalar_t size_bin = 1.0 / (stride_lut - 1);
+
+    for (int index = 0; index < n; ++index) {
+
+        /* retrieve rgb value of the pixel */
+        const scalar_t r = data_inp[index];
+        const scalar_t g = data_inp[index + height * width];
+        const scalar_t b = data_inp[index + height * width * 2];
+
+        /* retrieve index of the interpolation verticess */
+        const int32_t rid = clamp((int32_t)floor(r * (stride_lut - 1)), 0, stride_lut - 2);
+        const int32_t gid = clamp((int32_t)floor(g * (stride_lut - 1)), 0, stride_lut - 2);
+        const int32_t bid = clamp((int32_t)floor(b * (stride_lut - 1)), 0, stride_lut - 2);
+
+        /* utility variables for indexing */
+        const int stride_lut_2 = stride_lut * stride_lut;
+        const int stride_lut_3 = stride_lut_2 * stride_lut;
+
+        /* retrieve the interpolation verticess (number of 8 in case of trilinear interpolation) */
+        const int id000 = (rid    ) + stride_lut * (gid    ) + stride_lut_2 * (bid    );
+        const int id100 = (rid + 1) + stride_lut * (gid    ) + stride_lut_2 * (bid    );
+        const int id010 = (rid    ) + stride_lut * (gid + 1) + stride_lut_2 * (bid    );
+        const int id110 = (rid + 1) + stride_lut * (gid + 1) + stride_lut_2 * (bid    );
+        const int id001 = (rid    ) + stride_lut * (gid    ) + stride_lut_2 * (bid + 1);
+        const int id101 = (rid + 1) + stride_lut * (gid    ) + stride_lut_2 * (bid + 1);
+        const int id011 = (rid    ) + stride_lut * (gid + 1) + stride_lut_2 * (bid + 1);
+        const int id111 = (rid + 1) + stride_lut * (gid + 1) + stride_lut_2 * (bid + 1);
+
+        /* compute interpolation weights */
+        const scalar_t rd = (r - size_bin * rid) / size_bin;
+        const scalar_t gd = (g - size_bin * gid) / size_bin;
+        const scalar_t bd = (b - size_bin * bid) / size_bin;
+
+        const scalar_t w000 = (1 - rd) * (1 - gd) * (1 - bd);
+        const scalar_t w100 = (    rd) * (1 - gd) * (1 - bd);
+        const scalar_t w010 = (1 - rd) * (    gd) * (1 - bd);
+        const scalar_t w110 = (    rd) * (    gd) * (1 - bd);
+        const scalar_t w001 = (1 - rd) * (1 - gd) * (    bd);
+        const scalar_t w101 = (    rd) * (1 - gd) * (    bd);
+        const scalar_t w011 = (1 - rd) * (    gd) * (    bd);
+        const scalar_t w111 = (    rd) * (    gd) * (    bd);
+
+        /* derivatives: w to rd */
+        const scalar_t w000_rd = - (1 - gd) * (1 - bd);
+        const scalar_t w100_rd =   (1 - gd) * (1 - bd);
+        const scalar_t w010_rd = - (    gd) * (1 - bd);
+        const scalar_t w110_rd =   (    gd) * (1 - bd);
+        const scalar_t w001_rd = - (1 - gd) * (    bd);
+        const scalar_t w101_rd =   (1 - gd) * (    bd);
+        const scalar_t w011_rd = - (    gd) * (    bd);
+        const scalar_t w111_rd =   (    gd) * (    bd);
+
+        /* derivatives: w to gd */
+        const scalar_t w000_gd = - (1 - rd) * (1 - bd);
+        const scalar_t w100_gd = - (    rd) * (1 - bd);
+        const scalar_t w010_gd =   (1 - rd) * (1 - bd);
+        const scalar_t w110_gd =   (    rd) * (1 - bd);
+        const scalar_t w001_gd = - (1 - rd) * (    bd);
+        const scalar_t w101_gd = - (    rd) * (    bd);
+        const scalar_t w011_gd =   (1 - rd) * (    bd);
+        const scalar_t w111_gd =   (    rd) * (    bd);
+
+        /* derivatives: w to bd */
+        const scalar_t w000_bd = - (1 - rd) * (1 - gd);
+        const scalar_t w100_bd = - (    rd) * (1 - gd);
+        const scalar_t w010_bd = - (1 - rd) * (    gd);
+        const scalar_t w110_bd = - (    rd) * (    gd);
+        const scalar_t w001_bd =   (1 - rd) * (1 - gd);
+        const scalar_t w101_bd =   (    rd) * (1 - gd);
+        const scalar_t w011_bd =   (1 - rd) * (    gd);
+        const scalar_t w111_bd =   (    rd) * (    gd);
+
+        for (int i = 0; i < num_channels; ++i) {
+            scalar_t grad_o_ = grad_output[index + width * height * i];
+
+            /* compute gradient of lut */
+            grad_lut[id000 + stride_lut_3 * i] += grad_o_ * w000;
+            grad_lut[id100 + stride_lut_3 * i] += grad_o_ * w100;
+            grad_lut[id010 + stride_lut_3 * i] += grad_o_ * w010;
+            grad_lut[id110 + stride_lut_3 * i] += grad_o_ * w110;
+            grad_lut[id001 + stride_lut_3 * i] += grad_o_ * w001;
+            grad_lut[id101 + stride_lut_3 * i] += grad_o_ * w101;
+            grad_lut[id011 + stride_lut_3 * i] += grad_o_ * w011;
+            grad_lut[id111 + stride_lut_3 * i] += grad_o_ * w111;
+
+            /* compute gradient of vertices */
+            scalar_t grad_d = 0;
+            const scalar_t lut000 = data_lut[id000 + stride_lut_3 * i];
+            const scalar_t lut100 = data_lut[id100 + stride_lut_3 * i];
+            const scalar_t lut010 = data_lut[id010 + stride_lut_3 * i];
+            const scalar_t lut110 = data_lut[id110 + stride_lut_3 * i];
+            const scalar_t lut001 = data_lut[id001 + stride_lut_3 * i];
+            const scalar_t lut101 = data_lut[id101 + stride_lut_3 * i];
+            const scalar_t lut011 = data_lut[id011 + stride_lut_3 * i];
+            const scalar_t lut111 = data_lut[id111 + stride_lut_3 * i];
+            grad_d = grad_o_ *
+                (w000_rd * lut000 + w100_rd * lut100 + w010_rd * lut010 + w110_rd * lut110 +
+                 w001_rd * lut001 + w101_rd * lut101 + w011_rd * lut011 + w111_rd * lut111);
+            // r
+            grad_inp[index] += grad_d * 1 / size_bin;
+
+            grad_d = grad_o_ *
+                (w000_gd * lut000 + w100_gd * lut100 + w010_gd * lut010 + w110_gd * lut110 +
+                 w001_gd * lut001 + w101_gd * lut101 + w011_gd * lut011 + w111_gd * lut111);
+            // g
+            grad_inp[index + height * width] += grad_d * 1 / size_bin;
+
+            grad_d = grad_o_ *
+                (w000_bd * lut000 + w100_bd * lut100 + w010_bd * lut010 + w110_bd * lut110 +
+                 w001_bd * lut001 + w101_bd * lut101 + w011_bd * lut011 + w111_bd * lut111);
+            // b
+            grad_inp[index + height * width * 2] += grad_d * 1 / size_bin;
+        }
+    }
+}
+
+
+template <typename scalar_t>
+void ailut_transform_3d_cpu_forward_impl(
+        const int n,
+        const scalar_t* __restrict__ data_inp,
+        const scalar_t* __restrict__ data_lut,
+        const scalar_t* __restrict__ data_anc,
+        const int height,
+        const int width,
+        const int stride_lut,
+        const int num_channels,
+        scalar_t* __restrict__ data_col) {
+
+    const static scalar_t eps = 1e-10;
+
+    for (int index = 0; index < n; ++index) {
+
+        /* retrieve rgb value of the pixel */
+        const scalar_t r = data_inp[index];
+        const scalar_t g = data_inp[index + height * width];
+        const scalar_t b = data_inp[index + height * width * 2];
+
+        /* retrieve index of the interpolation verticess */
+        const int32_t rid = lower_bound(data_anc, 0, stride_lut, r);
+        const int32_t gid = lower_bound(data_anc, stride_lut, stride_lut * 2, g);
+        const int32_t bid = lower_bound(data_anc, stride_lut * 2, stride_lut * 3, b);
+
+        /* utility variables for indexing */
+        const int stride_lut_2 = stride_lut * stride_lut;
+        const int stride_lut_3 = stride_lut_2 * stride_lut;
+
+        /* retrieve the interpolation verticess (number of 8 in case of trilinear interpolation) */
+        const int id000 = (rid    ) + stride_lut * (gid    ) + stride_lut_2 * (bid    );
+        const int id100 = (rid + 1) + stride_lut * (gid    ) + stride_lut_2 * (bid    );
+        const int id010 = (rid    ) + stride_lut * (gid + 1) + stride_lut_2 * (bid    );
+        const int id110 = (rid + 1) + stride_lut * (gid + 1) + stride_lut_2 * (bid    );
+        const int id001 = (rid    ) + stride_lut * (gid    ) + stride_lut_2 * (bid + 1);
+        const int id101 = (rid + 1) + stride_lut * (gid    ) + stride_lut_2 * (bid + 1);
+        const int id011 = (rid    ) + stride_lut * (gid + 1) + stride_lut_2 * (bid + 1);
+        const int id111 = (rid + 1) + stride_lut * (gid + 1) + stride_lut_2 * (bid + 1);
+
+        /* compute interpolation weights */
+        const scalar_t r0 = data_anc[rid];
+        const scalar_t r1 = data_anc[rid + 1];
+        const scalar_t g0 = data_anc[gid + stride_lut];
+        const scalar_t g1 = data_anc[gid + stride_lut + 1];
+        const scalar_t b0 = data_anc[bid + stride_lut * 2];
+        const scalar_t b1 = data_anc[bid + stride_lut * 2 + 1];
+
+        const scalar_t rd = (r - r0) / (r1 - r0 + eps);
+        const scalar_t gd = (g - g0) / (g1 - g0 + eps);
+        const scalar_t bd = (b - b0) / (b1 - b0 + eps);
+
+        const scalar_t w000 = (1 - rd) * (1 - gd) * (1 - bd);
+        const scalar_t w100 = (    rd) * (1 - gd) * (1 - bd);
+        const scalar_t w010 = (1 - rd) * (    gd) * (1 - bd);
+        const scalar_t w110 = (    rd) * (    gd) * (1 - bd);
+        const scalar_t w001 = (1 - rd) * (1 - gd) * (    bd);
+        const scalar_t w101 = (    rd) * (1 - gd) * (    bd);
+        const scalar_t w011 = (1 - rd) * (    gd) * (    bd);
+        const scalar_t w111 = (    rd) * (    gd) * (    bd);
+
+        /* Execute the interpolation */
+        for (int i = 0; i < num_channels; ++i) {
+            data_col[index + height * width * i] =
+                w000 * data_lut[id000 + stride_lut_3 * i] + w100 * data_lut[id100 + stride_lut_3 * i] +
+                w010 * data_lut[id010 + stride_lut_3 * i] + w110 * data_lut[id110 + stride_lut_3 * i] +
+                w001 * data_lut[id001 + stride_lut_3 * i] + w101 * data_lut[id101 + stride_lut_3 * i] +
+                w011 * data_lut[id011 + stride_lut_3 * i] + w111 * data_lut[id111 + stride_lut_3 * i];
+        }
+    }
+}
+
+
+template <typename scalar_t>
+void ailut_transform_3d_cpu_backward_impl(
+        const int n,
+        const scalar_t* __restrict__ grad_output,
+        const scalar_t* __restrict__ data_inp,
+        const scalar_t* __restrict__ data_lut,
+        const scalar_t* __restrict__ data_anc,
+        const int height,
+        const int width,
+        const int stride_lut,
+        const int num_channels,
+        scalar_t* __restrict__ grad_inp,
+        scalar_t* __restrict__ grad_lut,
+        scalar_t* __restrict__ grad_ver) {
+
+    const static scalar_t eps = 1e-10;
+
+    for (int index = 0; index < n; ++index) {
+
+        /* retrieve rgb value of the pixel */
+        const scalar_t r = data_inp[index];
+        const scalar_t g = data_inp[index + height * width];
+        const scalar_t b = data_inp[index + height * width * 2];
+
+        /* retrieve index of the interpolation verticess */
+        const int32_t rid = lower_bound(data_anc, 0, stride_lut, r);
+        const int32_t gid = lower_bound(data_anc, stride_lut, stride_lut * 2, g);
+        const int32_t bid = lower_bound(data_anc, stride_lut * 2, stride_lut * 3, b);
+
+        /* utility variables for indexing */
+        const int stride_lut_2 = stride_lut * stride_lut;
+        const int stride_lut_3 = stride_lut_2 * stride_lut;
+
+        /* retrieve the interpolation verticess (number of 8 in case of trilinear interpolation) */
+        const int id000 = (rid    ) + stride_lut * (gid    ) + stride_lut_2 * (bid    );
+        const int id100 = (rid + 1) + stride_lut * (gid    ) + stride_lut_2 * (bid    );
+        const int id010 = (rid    ) + stride_lut * (gid + 1) + stride_lut_2 * (bid    );
+        const int id110 = (rid + 1) + stride_lut * (gid + 1) + stride_lut_2 * (bid    );
+        const int id001 = (rid    ) + stride_lut * (gid    ) + stride_lut_2 * (bid + 1);
+        const int id101 = (rid + 1) + stride_lut * (gid    ) + stride_lut_2 * (bid + 1);
+        const int id011 = (rid    ) + stride_lut * (gid + 1) + stride_lut_2 * (bid + 1);
+        const int id111 = (rid + 1) + stride_lut * (gid + 1) + stride_lut_2 * (bid + 1);
+
+        /* compute interpolation weights */
+        const scalar_t r0 = data_anc[rid];
+        const scalar_t r1 = data_anc[rid + 1];
+        const scalar_t g0 = data_anc[gid + stride_lut];
+        const scalar_t g1 = data_anc[gid + stride_lut + 1];
+        const scalar_t b0 = data_anc[bid + stride_lut * 2];
+        const scalar_t b1 = data_anc[bid + stride_lut * 2 + 1];
+
+        const scalar_t rd = (r - r0) / (r1 - r0 + eps);
+        const scalar_t gd = (g - g0) / (g1 - g0 + eps);
+        const scalar_t bd = (b - b0) / (b1 - b0 + eps);
+
+        const scalar_t w000 = (1 - rd) * (1 - gd) * (1 - bd);
+        const scalar_t w100 = (    rd) * (1 - gd) * (1 - bd);
+        const scalar_t w010 = (1 - rd) * (    gd) * (1 - bd);
+        const scalar_t w110 = (    rd) * (    gd) * (1 - bd);
+        const scalar_t w001 = (1 - rd) * (1 - gd) * (    bd);
+        const scalar_t w101 = (    rd) * (1 - gd) * (    bd);
+        const scalar_t w011 = (1 - rd) * (    gd) * (    bd);
+        const scalar_t w111 = (    rd) * (    gd) * (    bd);
+
+        /* derivatives: rd to r/r0/r1 */
+        const scalar_t rd_r  =          1 / (r1 - r0 + eps);
+        const scalar_t rd_r0 = - (1 - rd) / (r1 - r0 + eps);
+        const scalar_t rd_r1 = - (    rd) / (r1 - r0 + eps);
+        /* derivatives: gd to g/g0/g1 */
+        const scalar_t gd_g  =          1 / (g1 - g0 + eps);
+        const scalar_t gd_g0 = - (1 - gd) / (g1 - g0 + eps);
+        const scalar_t gd_g1 = - (    gd) / (g1 - g0 + eps);
+        /* derivatives: bd to b/b0/b1 */
+        const scalar_t bd_b =           1 / (b1 - b0 + eps);
+        const scalar_t bd_b0 = - (1 - bd) / (b1 - b0 + eps);
+        const scalar_t bd_b1 = - (    bd) / (b1 - b0 + eps);
+
+        /* derivatives: w to rd */
+        const scalar_t w000_rd = - (1 - gd) * (1 - bd);
+        const scalar_t w100_rd =   (1 - gd) * (1 - bd);
+        const scalar_t w010_rd = - (    gd) * (1 - bd);
+        const scalar_t w110_rd =   (    gd) * (1 - bd);
+        const scalar_t w001_rd = - (1 - gd) * (    bd);
+        const scalar_t w101_rd =   (1 - gd) * (    bd);
+        const scalar_t w011_rd = - (    gd) * (    bd);
+        const scalar_t w111_rd =   (    gd) * (    bd);
+
+        /* derivatives: w to gd */
+        const scalar_t w000_gd = - (1 - rd) * (1 - bd);
+        const scalar_t w100_gd = - (    rd) * (1 - bd);
+        const scalar_t w010_gd =   (1 - rd) * (1 - bd);
+        const scalar_t w110_gd =   (    rd) * (1 - bd);
+        const scalar_t w001_gd = - (1 - rd) * (    bd);
+        const scalar_t w101_gd = - (    rd) * (    bd);
+        const scalar_t w011_gd =   (1 - rd) * (    bd);
+        const scalar_t w111_gd =   (    rd) * (    bd);
+
+        /* derivatives: w to bd */
+        const scalar_t w000_bd = - (1 - rd) * (1 - gd);
+        const scalar_t w100_bd = - (    rd) * (1 - gd);
+        const scalar_t w010_bd = - (1 - rd) * (    gd);
+        const scalar_t w110_bd = - (    rd) * (    gd);
+        const scalar_t w001_bd =   (1 - rd) * (1 - gd);
+        const scalar_t w101_bd =   (    rd) * (1 - gd);
+        const scalar_t w011_bd =   (1 - rd) * (    gd);
+        const scalar_t w111_bd =   (    rd) * (    gd);
+
+        for (int i = 0; i < num_channels; ++i) {
+            scalar_t grad_o_ = grad_output[index + width * height * i];
+
+            /* compute gradient of lut */
+            grad_lut[id000 + stride_lut_3 * i] += grad_o_ * w000;
+            grad_lut[id100 + stride_lut_3 * i] += grad_o_ * w100;
+            grad_lut[id010 + stride_lut_3 * i] += grad_o_ * w010;
+            grad_lut[id110 + stride_lut_3 * i] += grad_o_ * w110;
+            grad_lut[id001 + stride_lut_3 * i] += grad_o_ * w001;
+            grad_lut[id101 + stride_lut_3 * i] += grad_o_ * w101;
+            grad_lut[id011 + stride_lut_3 * i] += grad_o_ * w011;
+            grad_lut[id111 + stride_lut_3 * i] += grad_o_ * w111;
+
+            /* compute gradient of vertices */
+            scalar_t grad_d = 0;
+            const scalar_t lut000 = data_lut[id000 + stride_lut_3 * i];
+            const scalar_t lut100 = data_lut[id100 + stride_lut_3 * i];
+            const scalar_t lut010 = data_lut[id010 + stride_lut_3 * i];
+            const scalar_t lut110 = data_lut[id110 + stride_lut_3 * i];
+            const scalar_t lut001 = data_lut[id001 + stride_lut_3 * i];
+            const scalar_t lut101 = data_lut[id101 + stride_lut_3 * i];
+            const scalar_t lut011 = data_lut[id011 + stride_lut_3 * i];
+            const scalar_t lut111 = data_lut[id111 + stride_lut_3 * i];
+            grad_d = grad_o_ *
+                (w000_rd * lut000 + w100_rd * lut100 + w010_rd * lut010 + w110_rd * lut110 +
+                 w001_rd * lut001 + w101_rd * lut101 + w011_rd * lut011 + w111_rd * lut111);
+            // r0/r1
+            grad_ver[rid    ] += grad_d * rd_r0;
+            grad_ver[rid + 1] += grad_d * rd_r1;
+            // r
+            grad_inp[index] += grad_d * rd_r;
+
+            grad_d = grad_o_ *
+                (w000_gd * lut000 + w100_gd * lut100 + w010_gd * lut010 + w110_gd * lut110 +
+                 w001_gd * lut001 + w101_gd * lut101 + w011_gd * lut011 + w111_gd * lut111);
+            // g0/g1
+            grad_ver[stride_lut + gid    ] += grad_d * gd_g0;
+            grad_ver[stride_lut + gid + 1] += grad_d * gd_g1;
+            // g
+            grad_inp[index + height * width] += grad_d * gd_g;
+
+            grad_d = grad_o_ *
+                (w000_bd * lut000 + w100_bd * lut100 + w010_bd * lut010 + w110_bd * lut110 +
+                 w001_bd * lut001 + w101_bd * lut101 + w011_bd * lut011 + w111_bd * lut111);
+            // b0/b1
+            grad_ver[stride_lut * 2 + bid    ] += grad_d * bd_b0;
+            grad_ver[stride_lut * 2 + bid + 1] += grad_d * bd_b1;
+            // b
+            grad_inp[index + height * width * 2] += grad_d * bd_b;
+        }
+    }
+}
+
+
+void lut_transform_cpu_forward(
+    const torch::Tensor &input,
+    const torch::Tensor &lut,
+    torch::Tensor output) {
+
+    /* retrieve some meta-information of the input tensors */
+    int batch_size = input.size(0);
+    int height     = input.size(2);
+    int width      = input.size(3);
+
+    int num_channels = lut.size(1);
+    int stride_lut   = lut.size(2);
+
+    int num_kernels = height * width;
+
+    for (int elt = 0; elt < batch_size; ++elt) {
+        AT_DISPATCH_FLOATING_TYPES(
+            input.scalar_type(), "lut_transform_cpu_forward", ([&] {
+                const scalar_t *data_inp = input[elt].data_ptr<scalar_t>();
+                const scalar_t *data_lut = lut[elt].data_ptr<scalar_t>();
+                scalar_t *data_col = output[elt].data_ptr<scalar_t>();
+
+                lut_transform_3d_cpu_forward_impl(
+                    num_kernels, data_inp, data_lut,
+                    height, width, stride_lut, num_channels,
+                    data_col);
+            }));
+    }
+}
+
+
+void lut_transform_cpu_backward(
+    const torch::Tensor &grad_output, const torch::Tensor &input,
+    const torch::Tensor &lut, torch::Tensor grad_inp, torch::Tensor grad_lut) {
+
+    /* retrieve some meta-information of the input tensors */
+    int batch_size = input.size(0);
+    int height     = input.size(2);
+    int width      = input.size(3);
+
+    int num_channels = lut.size(1);
+    int stride_lut   = lut.size(2);
+
+    int num_kernels = height * width;
+
+    for (int elt = 0; elt < batch_size; ++elt) {
+        AT_DISPATCH_FLOATING_TYPES(
+            input.scalar_type(), "lut_transform_cpu_backward", ([&] {
+                const scalar_t *grad_out = grad_output[elt].data_ptr<scalar_t>();
+                const scalar_t *data_inp = input[elt].data_ptr<scalar_t>();
+                const scalar_t *data_lut = lut[elt].data_ptr<scalar_t>();
+                scalar_t *grad_inp_  = grad_inp[elt].data_ptr<scalar_t>();
+                scalar_t *grad_lut_ = grad_lut[elt].data_ptr<scalar_t>();
+
+                lut_transform_3d_cpu_backward_impl(
+                    num_kernels, grad_out, data_inp, data_lut,
+                    height, width, stride_lut, num_channels,
+                    grad_inp_, grad_lut_);
+            }));
+    }
+}
+
+
+void ailut_transform_cpu_forward(
+    const torch::Tensor &input,
+    const torch::Tensor &lut,
+    const torch::Tensor &vertices,
+    torch::Tensor output) {
+
+    /* retrieve some meta-information of the input tensors */
+    int batch_size = input.size(0);
+    int height     = input.size(2);
+    int width      = input.size(3);
+
+    int num_channels = lut.size(1);
+    int stride_lut   = lut.size(2);
+
+    int num_kernels = height * width;
+
+    for (int elt = 0; elt < batch_size; ++elt) {
+        AT_DISPATCH_FLOATING_TYPES(
+            input.scalar_type(), "ailut_transform_cpu_forward", ([&] {
+                const scalar_t *data_inp = input[elt].data_ptr<scalar_t>();
+                const scalar_t *data_lut = lut[elt].data_ptr<scalar_t>();
+                const scalar_t *data_anc = vertices[elt].data_ptr<scalar_t>();
+                scalar_t *data_col = output[elt].data_ptr<scalar_t>();
+
+                ailut_transform_3d_cpu_forward_impl(
+                    num_kernels, data_inp, data_lut, data_anc,
+                    height, width, stride_lut, num_channels,
+                    data_col);
+            }));
+    }
+}
+
+
+void ailut_transform_cpu_backward(
+    const torch::Tensor &grad_output, const torch::Tensor &input,
+    const torch::Tensor &lut, const torch::Tensor &vertices,
+    torch::Tensor grad_inp, torch::Tensor grad_lut, torch::Tensor grad_ver) {
+
+    /* retrieve some meta-information of the input tensors */
+    int batch_size = input.size(0);
+    int height     = input.size(2);
+    int width      = input.size(3);
+
+    int num_channels = lut.size(1);
+    int stride_lut   = lut.size(2);
+
+    int num_kernels = height * width;
+
+    for (int elt = 0; elt < batch_size; ++elt) {
+        AT_DISPATCH_FLOATING_TYPES(
+            input.scalar_type(), "ailut_transform_cpu_backward", ([&] {
+                const scalar_t *grad_out = grad_output[elt].data_ptr<scalar_t>();
+                const scalar_t *data_inp = input[elt].data_ptr<scalar_t>();
+                const scalar_t *data_lut = lut[elt].data_ptr<scalar_t>();
+                const scalar_t *data_anc = vertices[elt].data_ptr<scalar_t>();
+                scalar_t *grad_inp_  = grad_inp[elt].data_ptr<scalar_t>();
+                scalar_t *grad_lut_ = grad_lut[elt].data_ptr<scalar_t>();
+                scalar_t *grad_ver_ = grad_ver[elt].data_ptr<scalar_t>();
+
+                ailut_transform_3d_cpu_backward_impl(
+                    num_kernels, grad_out, data_inp, data_lut, data_anc,
+                    height, width, stride_lut, num_channels,
+                    grad_inp_, grad_lut_, grad_ver_);
+            }));
+    }
+}
diff --git a/modelscope/ops/ailut/Ailut/csrc/ailut_transform_cuda.cu b/modelscope/ops/ailut/Ailut/csrc/ailut_transform_cuda.cu
new file mode 100644
index 00000000..5b7196fb
--- /dev/null
+++ b/modelscope/ops/ailut/Ailut/csrc/ailut_transform_cuda.cu
@@ -0,0 +1,699 @@
+#include <torch/extension.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+#else
+__device__ double atomicAdd(double* address, double val)
+{
+    unsigned long long int* address_as_ull = (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed,
+                __double_as_longlong(val + __longlong_as_double(assumed)));
+    } while (assumed != old);
+    return __longlong_as_double(old);
+}
+#endif
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                                \
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+         i += blockDim.x * gridDim.x)
+
+#define THREADS_PER_BLOCK 512
+
+inline int GET_BLOCKS(const int N) {
+  int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+  int max_block_num = at::cuda::getCurrentDeviceProperties()->maxGridSize[0];
+  return min(optimal_block_num, max_block_num);
+}
+
+
+/* std::clamp is only available since c++17 */
+template <typename scalar_t>
+inline __device__ constexpr const scalar_t& clamp(
+    const scalar_t& v, const scalar_t& lo, const scalar_t& hi)
+{
+    return (v < lo) ? lo : ((v > hi) ? hi : v);
+}
+
+
+/* binary search on a sorted array to find and clamp the lower bound */
+template <typename scalar_t>
+inline __device__ int32_t lower_bound(
+        const scalar_t *data_ss,
+        int32_t start,
+        int32_t end,
+        scalar_t val) {
+
+    const int32_t ori_start = start;
+    const int32_t upper_bound = end - start - 2;
+    while (start < end) {
+        int64_t mid = start + ((end - start) >> 1);
+        if (!(data_ss[mid] >= val)) {
+            start = mid + 1;
+        }
+        else {
+            end = mid;
+        }
+    }
+    return clamp(start - ori_start - 1, 0, upper_bound);
+}
+
+
+void ailut_transform_sanity_check(
+    const torch::Tensor input, const torch::Tensor lut,
+    const torch::Tensor vertices, torch::Tensor output) {
+
+    TORCH_CHECK((input.ndimension() == 4),
+                "4D input tensor (b, 3, h, w) expected, but got: ",
+                input.ndimension());
+    TORCH_CHECK((input.size(1) == 3),
+                "3-channel image expected, but got: ",
+                input.size(1));
+    TORCH_CHECK((lut.ndimension() == (input.size(1) + 2)),
+                (input.size(1) + 2),
+                "D lut tensor (b, m, d[, d, [d]]) expected, but got: ",
+                lut.ndimension());
+    TORCH_CHECK((vertices.ndimension() == 3),
+                "3D vertices tensor (b, 3, d) expected, but got: ",
+                vertices.ndimension());
+    TORCH_CHECK((vertices.size(1) == input.size(1)),
+                "vertices.size(1) should be the same as input.size(1), but got: ",
+                vertices.size(1), "for vertices, and",
+                input.size(1), "for input");
+    TORCH_CHECK((vertices.size(2) == lut.size(2)),
+                "the length of vertices list should be the same as the number ",
+                "of bins in the 3D lookup table, but got: ", vertices.size(2),
+                " (", lut.size(2), " expected)");
+    TORCH_CHECK((input.size(0) == lut.size(0) && input.size(0) == vertices.size(0)),
+                "input, lut and vertices should have identical batch size, but got: ",
+                "input (", input.size(0), "), lut (", input.size(1),"), vertices (",
+                input.size(2), ")");
+}
+
+
+template <typename scalar_t>
+__launch_bounds__(THREADS_PER_BLOCK)
+__global__ void lut_transform_3d_cuda_forward_kernel(
+        const int n,
+        const scalar_t* __restrict__ data_inp,
+        const scalar_t* __restrict__ data_lut,
+        const int height,
+        const int width,
+        const int stride_lut,
+        const int num_channels,
+        scalar_t* __restrict__ data_col) {
+
+    const scalar_t size_bin = 1.0 / (stride_lut - 1);
+
+    CUDA_1D_KERNEL_LOOP(index, n) {
+
+        /* retrieve rgb value of the pixel */
+        const scalar_t r = data_inp[index];
+        const scalar_t g = data_inp[index + height * width];
+        const scalar_t b = data_inp[index + height * width * 2];
+
+        /* retrieve index of the interpolation verticess */
+        const int32_t rid = clamp((int32_t)floor(r * (stride_lut - 1)), 0, stride_lut - 2);
+        const int32_t gid = clamp((int32_t)floor(g * (stride_lut - 1)), 0, stride_lut - 2);
+        const int32_t bid = clamp((int32_t)floor(b * (stride_lut - 1)), 0, stride_lut - 2);
+
+        /* utility variables for indexing */
+        const int stride_lut_2 = stride_lut * stride_lut;
+        const int stride_lut_3 = stride_lut_2 * stride_lut;
+
+        /* retrieve the interpolation verticess (number of 8 in case of trilinear interpolation) */
+        const int id000 = (rid    ) + stride_lut * (gid    ) + stride_lut_2 * (bid    );
+        const int id100 = (rid + 1) + stride_lut * (gid    ) + stride_lut_2 * (bid    );
+        const int id010 = (rid    ) + stride_lut * (gid + 1) + stride_lut_2 * (bid    );
+        const int id110 = (rid + 1) + stride_lut * (gid + 1) + stride_lut_2 * (bid    );
+        const int id001 = (rid    ) + stride_lut * (gid    ) + stride_lut_2 * (bid + 1);
+        const int id101 = (rid + 1) + stride_lut * (gid    ) + stride_lut_2 * (bid + 1);
+        const int id011 = (rid    ) + stride_lut * (gid + 1) + stride_lut_2 * (bid + 1);
+        const int id111 = (rid + 1) + stride_lut * (gid + 1) + stride_lut_2 * (bid + 1);
+
+        /* compute interpolation weights */
+        const scalar_t rd = (r - size_bin * rid) / size_bin;
+        const scalar_t gd = (g - size_bin * gid) / size_bin;
+        const scalar_t bd = (b - size_bin * bid) / size_bin;
+
+        const scalar_t w000 = (1 - rd) * (1 - gd) * (1 - bd);
+        const scalar_t w100 = (    rd) * (1 - gd) * (1 - bd);
+        const scalar_t w010 = (1 - rd) * (    gd) * (1 - bd);
+        const scalar_t w110 = (    rd) * (    gd) * (1 - bd);
+        const scalar_t w001 = (1 - rd) * (1 - gd) * (    bd);
+        const scalar_t w101 = (    rd) * (1 - gd) * (    bd);
+        const scalar_t w011 = (1 - rd) * (    gd) * (    bd);
+        const scalar_t w111 = (    rd) * (    gd) * (    bd);
+
+        /* Execute the interpolation */
+        for (int i = 0; i < num_channels; ++i) {
+            data_col[index + height * width * i] =
+                w000 * data_lut[id000 + stride_lut_3 * i] + w100 * data_lut[id100 + stride_lut_3 * i] +
+                w010 * data_lut[id010 + stride_lut_3 * i] + w110 * data_lut[id110 + stride_lut_3 * i] +
+                w001 * data_lut[id001 + stride_lut_3 * i] + w101 * data_lut[id101 + stride_lut_3 * i] +
+                w011 * data_lut[id011 + stride_lut_3 * i] + w111 * data_lut[id111 + stride_lut_3 * i];
+        }
+    }
+}
+
+
+
+template <typename scalar_t>
+__launch_bounds__(THREADS_PER_BLOCK)
+__global__ void lut_transform_3d_cuda_backward_kernel(
+        const int n,
+        const scalar_t* __restrict__ grad_output,
+        const scalar_t* __restrict__ data_inp,
+        const scalar_t* __restrict__ data_lut,
+        const int height,
+        const int width,
+        const int stride_lut,
+        const int num_channels,
+        scalar_t* __restrict__ grad_inp,
+        scalar_t* __restrict__ grad_lut) {
+
+    const scalar_t size_bin = 1.0 / (stride_lut - 1);
+
+    CUDA_1D_KERNEL_LOOP(index, n) {
+
+        /* retrieve rgb value of the pixel */
+        const scalar_t r = data_inp[index];
+        const scalar_t g = data_inp[index + height * width];
+        const scalar_t b = data_inp[index + height * width * 2];
+
+        /* retrieve index of the interpolation verticess */
+        const int32_t rid = clamp((int32_t)floor(r * (stride_lut - 1)), 0, stride_lut - 2);
+        const int32_t gid = clamp((int32_t)floor(g * (stride_lut - 1)), 0, stride_lut - 2);
+        const int32_t bid = clamp((int32_t)floor(b * (stride_lut - 1)), 0, stride_lut - 2);
+
+        /* utility variables for indexing */
+        const int stride_lut_2 = stride_lut * stride_lut;
+        const int stride_lut_3 = stride_lut_2 * stride_lut;
+
+        /* retrieve the interpolation verticess (number of 8 in case of trilinear interpolation) */
+        const int id000 = (rid    ) + stride_lut * (gid    ) + stride_lut_2 * (bid    );
+        const int id100 = (rid + 1) + stride_lut * (gid    ) + stride_lut_2 * (bid    );
+        const int id010 = (rid    ) + stride_lut * (gid + 1) + stride_lut_2 * (bid    );
+        const int id110 = (rid + 1) + stride_lut * (gid + 1) + stride_lut_2 * (bid    );
+        const int id001 = (rid    ) + stride_lut * (gid    ) + stride_lut_2 * (bid + 1);
+        const int id101 = (rid + 1) + stride_lut * (gid    ) + stride_lut_2 * (bid + 1);
+        const int id011 = (rid    ) + stride_lut * (gid + 1) + stride_lut_2 * (bid + 1);
+        const int id111 = (rid + 1) + stride_lut * (gid + 1) + stride_lut_2 * (bid + 1);
+
+        /* compute interpolation weights */
+        const scalar_t rd = (r - size_bin * rid) / size_bin;
+        const scalar_t gd = (g - size_bin * gid) / size_bin;
+        const scalar_t bd = (b - size_bin * bid) / size_bin;
+
+        const scalar_t w000 = (1 - rd) * (1 - gd) * (1 - bd);
+        const scalar_t w100 = (    rd) * (1 - gd) * (1 - bd);
+        const scalar_t w010 = (1 - rd) * (    gd) * (1 - bd);
+        const scalar_t w110 = (    rd) * (    gd) * (1 - bd);
+        const scalar_t w001 = (1 - rd) * (1 - gd) * (    bd);
+        const scalar_t w101 = (    rd) * (1 - gd) * (    bd);
+        const scalar_t w011 = (1 - rd) * (    gd) * (    bd);
+        const scalar_t w111 = (    rd) * (    gd) * (    bd);
+
+        /* derivatives: w to rd */
+        const scalar_t w000_rd = - (1 - gd) * (1 - bd);
+        const scalar_t w100_rd =   (1 - gd) * (1 - bd);
+        const scalar_t w010_rd = - (    gd) * (1 - bd);
+        const scalar_t w110_rd =   (    gd) * (1 - bd);
+        const scalar_t w001_rd = - (1 - gd) * (    bd);
+        const scalar_t w101_rd =   (1 - gd) * (    bd);
+        const scalar_t w011_rd = - (    gd) * (    bd);
+        const scalar_t w111_rd =   (    gd) * (    bd);
+
+        /* derivatives: w to gd */
+        const scalar_t w000_gd = - (1 - rd) * (1 - bd);
+        const scalar_t w100_gd = - (    rd) * (1 - bd);
+        const scalar_t w010_gd =   (1 - rd) * (1 - bd);
+        const scalar_t w110_gd =   (    rd) * (1 - bd);
+        const scalar_t w001_gd = - (1 - rd) * (    bd);
+        const scalar_t w101_gd = - (    rd) * (    bd);
+        const scalar_t w011_gd =   (1 - rd) * (    bd);
+        const scalar_t w111_gd =   (    rd) * (    bd);
+
+        /* derivatives: w to bd */
+        const scalar_t w000_bd = - (1 - rd) * (1 - gd);
+        const scalar_t w100_bd = - (    rd) * (1 - gd);
+        const scalar_t w010_bd = - (1 - rd) * (    gd);
+        const scalar_t w110_bd = - (    rd) * (    gd);
+        const scalar_t w001_bd =   (1 - rd) * (1 - gd);
+        const scalar_t w101_bd =   (    rd) * (1 - gd);
+        const scalar_t w011_bd =   (1 - rd) * (    gd);
+        const scalar_t w111_bd =   (    rd) * (    gd);
+
+        for (int i = 0; i < num_channels; ++i) {
+            scalar_t grad_o_ = grad_output[index + width * height * i];
+
+            /* compute gradient of lut */
+            atomicAdd(grad_lut + id000 + stride_lut_3 * i, grad_o_ * w000);
+            atomicAdd(grad_lut + id100 + stride_lut_3 * i, grad_o_ * w100);
+            atomicAdd(grad_lut + id010 + stride_lut_3 * i, grad_o_ * w010);
+            atomicAdd(grad_lut + id110 + stride_lut_3 * i, grad_o_ * w110);
+            atomicAdd(grad_lut + id001 + stride_lut_3 * i, grad_o_ * w001);
+            atomicAdd(grad_lut + id101 + stride_lut_3 * i, grad_o_ * w101);
+            atomicAdd(grad_lut + id011 + stride_lut_3 * i, grad_o_ * w011);
+            atomicAdd(grad_lut + id111 + stride_lut_3 * i, grad_o_ * w111);
+
+            /* compute gradient of vertices */
+            scalar_t grad_d = 0;
+            const scalar_t lut000 = data_lut[id000 + stride_lut_3 * i];
+            const scalar_t lut100 = data_lut[id100 + stride_lut_3 * i];
+            const scalar_t lut010 = data_lut[id010 + stride_lut_3 * i];
+            const scalar_t lut110 = data_lut[id110 + stride_lut_3 * i];
+            const scalar_t lut001 = data_lut[id001 + stride_lut_3 * i];
+            const scalar_t lut101 = data_lut[id101 + stride_lut_3 * i];
+            const scalar_t lut011 = data_lut[id011 + stride_lut_3 * i];
+            const scalar_t lut111 = data_lut[id111 + stride_lut_3 * i];
+            grad_d = grad_o_ *
+                (w000_rd * lut000 + w100_rd * lut100 + w010_rd * lut010 + w110_rd * lut110 +
+                 w001_rd * lut001 + w101_rd * lut101 + w011_rd * lut011 + w111_rd * lut111);
+            // r
+            atomicAdd(grad_inp + index, grad_d * 1 / size_bin);
+
+            grad_d = grad_o_ *
+                (w000_gd * lut000 + w100_gd * lut100 + w010_gd * lut010 + w110_gd * lut110 +
+                 w001_gd * lut001 + w101_gd * lut101 + w011_gd * lut011 + w111_gd * lut111);
+            // g
+            atomicAdd(grad_inp + index + height * width, grad_d * 1 / size_bin);
+
+            grad_d = grad_o_ *
+                (w000_bd * lut000 + w100_bd * lut100 + w010_bd * lut010 + w110_bd * lut110 +
+                 w001_bd * lut001 + w101_bd * lut101 + w011_bd * lut011 + w111_bd * lut111);
+            // b
+            atomicAdd(grad_inp + index + height * width * 2, grad_d * 1 / size_bin);
+        }
+    }
+}
+
+
+template <typename scalar_t>
+__launch_bounds__(THREADS_PER_BLOCK)
+__global__ void ailut_transform_3d_cuda_forward_kernel(
+        const int n,
+        const scalar_t* __restrict__ data_inp,
+        const scalar_t* __restrict__ data_lut,
+        const scalar_t* __restrict__ data_anc,
+        const int height,
+        const int width,
+        const int stride_lut,
+        const int num_channels,
+        scalar_t* __restrict__ data_col) {
+
+    const static scalar_t eps = 1e-10;
+
+    CUDA_1D_KERNEL_LOOP(index, n) {
+
+        /* retrieve rgb value of the pixel */
+        const scalar_t r = data_inp[index];
+        const scalar_t g = data_inp[index + height * width];
+        const scalar_t b = data_inp[index + height * width * 2];
+
+        /* retrieve index of the interpolation verticess */
+        const int32_t rid = lower_bound(data_anc, 0, stride_lut, r);
+        const int32_t gid = lower_bound(data_anc, stride_lut, stride_lut * 2, g);
+        const int32_t bid = lower_bound(data_anc, stride_lut * 2, stride_lut * 3, b);
+
+        /* utility variables for indexing */
+        const int stride_lut_2 = stride_lut * stride_lut;
+        const int stride_lut_3 = stride_lut_2 * stride_lut;
+
+        /* retrieve the interpolation verticess (number of 8 in case of trilinear interpolation) */
+        const int id000 = (rid    ) + stride_lut * (gid    ) + stride_lut_2 * (bid    );
+        const int id100 = (rid + 1) + stride_lut * (gid    ) + stride_lut_2 * (bid    );
+        const int id010 = (rid    ) + stride_lut * (gid + 1) + stride_lut_2 * (bid    );
+        const int id110 = (rid + 1) + stride_lut * (gid + 1) + stride_lut_2 * (bid    );
+        const int id001 = (rid    ) + stride_lut * (gid    ) + stride_lut_2 * (bid + 1);
+        const int id101 = (rid + 1) + stride_lut * (gid    ) + stride_lut_2 * (bid + 1);
+        const int id011 = (rid    ) + stride_lut * (gid + 1) + stride_lut_2 * (bid + 1);
+        const int id111 = (rid + 1) + stride_lut * (gid + 1) + stride_lut_2 * (bid + 1);
+
+        /* compute interpolation weights */
+        const scalar_t r0 = data_anc[rid];
+        const scalar_t r1 = data_anc[rid + 1];
+        const scalar_t g0 = data_anc[gid + stride_lut];
+        const scalar_t g1 = data_anc[gid + stride_lut + 1];
+        const scalar_t b0 = data_anc[bid + stride_lut * 2];
+        const scalar_t b1 = data_anc[bid + stride_lut * 2 + 1];
+
+        const scalar_t rd = (r - r0) / (r1 - r0 + eps);
+        const scalar_t gd = (g - g0) / (g1 - g0 + eps);
+        const scalar_t bd = (b - b0) / (b1 - b0 + eps);
+
+        const scalar_t w000 = (1 - rd) * (1 - gd) * (1 - bd);
+        const scalar_t w100 = (    rd) * (1 - gd) * (1 - bd);
+        const scalar_t w010 = (1 - rd) * (    gd) * (1 - bd);
+        const scalar_t w110 = (    rd) * (    gd) * (1 - bd);
+        const scalar_t w001 = (1 - rd) * (1 - gd) * (    bd);
+        const scalar_t w101 = (    rd) * (1 - gd) * (    bd);
+        const scalar_t w011 = (1 - rd) * (    gd) * (    bd);
+        const scalar_t w111 = (    rd) * (    gd) * (    bd);
+
+        /* Execute the interpolation */
+        for (int i = 0; i < num_channels; ++i) {
+            data_col[index + height * width * i] =
+                w000 * data_lut[id000 + stride_lut_3 * i] + w100 * data_lut[id100 + stride_lut_3 * i] +
+                w010 * data_lut[id010 + stride_lut_3 * i] + w110 * data_lut[id110 + stride_lut_3 * i] +
+                w001 * data_lut[id001 + stride_lut_3 * i] + w101 * data_lut[id101 + stride_lut_3 * i] +
+                w011 * data_lut[id011 + stride_lut_3 * i] + w111 * data_lut[id111 + stride_lut_3 * i];
+        }
+    }
+}
+
+
+template <typename scalar_t>
+__launch_bounds__(THREADS_PER_BLOCK)
+__global__ void ailut_transform_3d_cuda_backward_kernel(
+        const int n,
+        const scalar_t* __restrict__ grad_output,
+        const scalar_t* __restrict__ data_inp,
+        const scalar_t* __restrict__ data_lut,
+        const scalar_t* __restrict__ data_anc,
+        const int height,
+        const int width,
+        const int stride_lut,
+        const int num_channels,
+        scalar_t* __restrict__ grad_inp,
+        scalar_t* __restrict__ grad_lut,
+        scalar_t* __restrict__ grad_ver) {
+
+    const static scalar_t eps = 1e-10;
+
+    CUDA_1D_KERNEL_LOOP(index, n) {
+
+        /* retrieve rgb value of the pixel */
+        const scalar_t r = data_inp[index];
+        const scalar_t g = data_inp[index + height * width];
+        const scalar_t b = data_inp[index + height * width * 2];
+
+        /* retrieve index of the interpolation verticess */
+        const int32_t rid = lower_bound(data_anc, 0, stride_lut, r);
+        const int32_t gid = lower_bound(data_anc, stride_lut, stride_lut * 2, g);
+        const int32_t bid = lower_bound(data_anc, stride_lut * 2, stride_lut * 3, b);
+
+        /* utility variables for indexing */
+        const int stride_lut_2 = stride_lut * stride_lut;
+        const int stride_lut_3 = stride_lut_2 * stride_lut;
+
+        /* retrieve the interpolation verticess (number of 8 in case of trilinear interpolation) */
+        const int id000 = (rid    ) + stride_lut * (gid    ) + stride_lut_2 * (bid    );
+        const int id100 = (rid + 1) + stride_lut * (gid    ) + stride_lut_2 * (bid    );
+        const int id010 = (rid    ) + stride_lut * (gid + 1) + stride_lut_2 * (bid    );
+        const int id110 = (rid + 1) + stride_lut * (gid + 1) + stride_lut_2 * (bid    );
+        const int id001 = (rid    ) + stride_lut * (gid    ) + stride_lut_2 * (bid + 1);
+        const int id101 = (rid + 1) + stride_lut * (gid    ) + stride_lut_2 * (bid + 1);
+        const int id011 = (rid    ) + stride_lut * (gid + 1) + stride_lut_2 * (bid + 1);
+        const int id111 = (rid + 1) + stride_lut * (gid + 1) + stride_lut_2 * (bid + 1);
+
+        /* compute interpolation weights */
+        const scalar_t r0 = data_anc[rid];
+        const scalar_t r1 = data_anc[rid + 1];
+        const scalar_t g0 = data_anc[gid + stride_lut];
+        const scalar_t g1 = data_anc[gid + stride_lut + 1];
+        const scalar_t b0 = data_anc[bid + stride_lut * 2];
+        const scalar_t b1 = data_anc[bid + stride_lut * 2 + 1];
+
+        const scalar_t rd = (r - r0) / (r1 - r0 + eps);
+        const scalar_t gd = (g - g0) / (g1 - g0 + eps);
+        const scalar_t bd = (b - b0) / (b1 - b0 + eps);
+
+        const scalar_t w000 = (1 - rd) * (1 - gd) * (1 - bd);
+        const scalar_t w100 = (    rd) * (1 - gd) * (1 - bd);
+        const scalar_t w010 = (1 - rd) * (    gd) * (1 - bd);
+        const scalar_t w110 = (    rd) * (    gd) * (1 - bd);
+        const scalar_t w001 = (1 - rd) * (1 - gd) * (    bd);
+        const scalar_t w101 = (    rd) * (1 - gd) * (    bd);
+        const scalar_t w011 = (1 - rd) * (    gd) * (    bd);
+        const scalar_t w111 = (    rd) * (    gd) * (    bd);
+
+        /* derivatives: rd to r/r0/r1 */
+        const scalar_t rd_r  =          1 / (r1 - r0 + eps);
+        const scalar_t rd_r0 = - (1 - rd) / (r1 - r0 + eps);
+        const scalar_t rd_r1 = - (    rd) / (r1 - r0 + eps);
+        /* derivatives: gd to g/g0/g1 */
+        const scalar_t gd_g  =          1 / (g1 - g0 + eps);
+        const scalar_t gd_g0 = - (1 - gd) / (g1 - g0 + eps);
+        const scalar_t gd_g1 = - (    gd) / (g1 - g0 + eps);
+        /* derivatives: bd to b/b0/b1 */
+        const scalar_t bd_b =           1 / (b1 - b0 + eps);
+        const scalar_t bd_b0 = - (1 - bd) / (b1 - b0 + eps);
+        const scalar_t bd_b1 = - (    bd) / (b1 - b0 + eps);
+
+        /* derivatives: w to rd */
+        const scalar_t w000_rd = - (1 - gd) * (1 - bd);
+        const scalar_t w100_rd =   (1 - gd) * (1 - bd);
+        const scalar_t w010_rd = - (    gd) * (1 - bd);
+        const scalar_t w110_rd =   (    gd) * (1 - bd);
+        const scalar_t w001_rd = - (1 - gd) * (    bd);
+        const scalar_t w101_rd =   (1 - gd) * (    bd);
+        const scalar_t w011_rd = - (    gd) * (    bd);
+        const scalar_t w111_rd =   (    gd) * (    bd);
+
+        /* derivatives: w to gd */
+        const scalar_t w000_gd = - (1 - rd) * (1 - bd);
+        const scalar_t w100_gd = - (    rd) * (1 - bd);
+        const scalar_t w010_gd =   (1 - rd) * (1 - bd);
+        const scalar_t w110_gd =   (    rd) * (1 - bd);
+        const scalar_t w001_gd = - (1 - rd) * (    bd);
+        const scalar_t w101_gd = - (    rd) * (    bd);
+        const scalar_t w011_gd =   (1 - rd) * (    bd);
+        const scalar_t w111_gd =   (    rd) * (    bd);
+
+        /* derivatives: w to bd */
+        const scalar_t w000_bd = - (1 - rd) * (1 - gd);
+        const scalar_t w100_bd = - (    rd) * (1 - gd);
+        const scalar_t w010_bd = - (1 - rd) * (    gd);
+        const scalar_t w110_bd = - (    rd) * (    gd);
+        const scalar_t w001_bd =   (1 - rd) * (1 - gd);
+        const scalar_t w101_bd =   (    rd) * (1 - gd);
+        const scalar_t w011_bd =   (1 - rd) * (    gd);
+        const scalar_t w111_bd =   (    rd) * (    gd);
+
+        for (int i = 0; i < num_channels; ++i) {
+            scalar_t grad_o_ = grad_output[index + width * height * i];
+
+            /* compute gradient of lut */
+            atomicAdd(grad_lut + id000 + stride_lut_3 * i, grad_o_ * w000);
+            atomicAdd(grad_lut + id100 + stride_lut_3 * i, grad_o_ * w100);
+            atomicAdd(grad_lut + id010 + stride_lut_3 * i, grad_o_ * w010);
+            atomicAdd(grad_lut + id110 + stride_lut_3 * i, grad_o_ * w110);
+            atomicAdd(grad_lut + id001 + stride_lut_3 * i, grad_o_ * w001);
+            atomicAdd(grad_lut + id101 + stride_lut_3 * i, grad_o_ * w101);
+            atomicAdd(grad_lut + id011 + stride_lut_3 * i, grad_o_ * w011);
+            atomicAdd(grad_lut + id111 + stride_lut_3 * i, grad_o_ * w111);
+
+            /* compute gradient of vertices */
+            scalar_t grad_d = 0;
+            const scalar_t lut000 = data_lut[id000 + stride_lut_3 * i];
+            const scalar_t lut100 = data_lut[id100 + stride_lut_3 * i];
+            const scalar_t lut010 = data_lut[id010 + stride_lut_3 * i];
+            const scalar_t lut110 = data_lut[id110 + stride_lut_3 * i];
+            const scalar_t lut001 = data_lut[id001 + stride_lut_3 * i];
+            const scalar_t lut101 = data_lut[id101 + stride_lut_3 * i];
+            const scalar_t lut011 = data_lut[id011 + stride_lut_3 * i];
+            const scalar_t lut111 = data_lut[id111 + stride_lut_3 * i];
+            grad_d = grad_o_ *
+                (w000_rd * lut000 + w100_rd * lut100 + w010_rd * lut010 + w110_rd * lut110 +
+                 w001_rd * lut001 + w101_rd * lut101 + w011_rd * lut011 + w111_rd * lut111);
+            // r0/r1
+            atomicAdd(grad_ver + rid,     grad_d * rd_r0);
+            atomicAdd(grad_ver + rid + 1, grad_d * rd_r1);
+            // r
+            atomicAdd(grad_inp + index, grad_d * rd_r);
+
+            grad_d = grad_o_ *
+                (w000_gd * lut000 + w100_gd * lut100 + w010_gd * lut010 + w110_gd * lut110 +
+                 w001_gd * lut001 + w101_gd * lut101 + w011_gd * lut011 + w111_gd * lut111);
+            // g0/g1
+            atomicAdd(grad_ver + stride_lut + gid,     grad_d * gd_g0);
+            atomicAdd(grad_ver + stride_lut + gid + 1, grad_d * gd_g1);
+            // g
+            atomicAdd(grad_inp + index + height * width, grad_d * gd_g);
+
+            grad_d = grad_o_ *
+                (w000_bd * lut000 + w100_bd * lut100 + w010_bd * lut010 + w110_bd * lut110 +
+                 w001_bd * lut001 + w101_bd * lut101 + w011_bd * lut011 + w111_bd * lut111);
+            // b0/b1
+            atomicAdd(grad_ver + stride_lut * 2 + bid,     grad_d * bd_b0);
+            atomicAdd(grad_ver + stride_lut * 2 + bid + 1, grad_d * bd_b1);
+            // b
+            atomicAdd(grad_inp + index + height * width * 2, grad_d * bd_b);
+        }
+    }
+}
+
+
+void LutTransformForwardCUDAKernelLauncher(
+    const torch::Tensor &input, const torch::Tensor &lut, torch::Tensor output) {
+
+    c10::cuda::CUDAGuard device_guard(input.device());
+
+    /* retrieve some meta-information of the input tensors */
+    int batch_size = input.size(0);
+    int height     = input.size(2);
+    int width      = input.size(3);
+
+    int num_channels = lut.size(1);
+    int stride_lut   = lut.size(2);
+
+    int num_kernels = height * width;
+    for (int elt = 0; elt < batch_size; ++elt) {
+
+        /* launch the CUDA kernel */
+        AT_DISPATCH_FLOATING_TYPES(
+            input.scalar_type(), "lut_transform_cuda_forward", ([&] {
+                const scalar_t *data_inp = input[elt].data_ptr<scalar_t>();
+                const scalar_t *data_lut = lut[elt].data_ptr<scalar_t>();
+                scalar_t *data_col = output[elt].data_ptr<scalar_t>();
+
+                lut_transform_3d_cuda_forward_kernel<<<GET_BLOCKS(num_kernels),
+                                                    THREADS_PER_BLOCK, 0,
+                                                    at::cuda::getCurrentCUDAStream()>>>(
+                    num_kernels, data_inp, data_lut,
+                    height, width, stride_lut, num_channels,
+                    data_col);
+            }));
+
+        AT_CUDA_CHECK(cudaGetLastError());
+    }
+}
+
+
+
+void LutTransformBackwardCUDAKernelLauncher(
+    const torch::Tensor &grad_output, const torch::Tensor &input,
+    const torch::Tensor &lut, torch::Tensor grad_inp, torch::Tensor grad_lut) {
+
+    c10::cuda::CUDAGuard device_guard(grad_output.device());
+
+    /* retrieve some meta-information of the input tensors */
+    int batch_size = input.size(0);
+    int height     = input.size(2);
+    int width      = input.size(3);
+
+    int num_channels = lut.size(1);
+    int stride_lut   = lut.size(2);
+
+    int num_kernels = height * width;
+    for (int elt = 0; elt < batch_size; ++elt) {
+
+        /* launch the CUDA kernel */
+        AT_DISPATCH_FLOATING_TYPES(
+            input.scalar_type(), "lut_transform_cuda_backward", ([&] {
+                const scalar_t *grad_out = grad_output[elt].data_ptr<scalar_t>();
+                const scalar_t *data_inp = input[elt].data_ptr<scalar_t>();
+                const scalar_t *data_lut = lut[elt].data_ptr<scalar_t>();
+                scalar_t *grad_inp_  = grad_inp[elt].data_ptr<scalar_t>();
+                scalar_t *grad_lut_ = grad_lut[elt].data_ptr<scalar_t>();
+
+                lut_transform_3d_cuda_backward_kernel<<<GET_BLOCKS(num_kernels),
+                                                    THREADS_PER_BLOCK, 0,
+                                                    at::cuda::getCurrentCUDAStream()>>>(
+                    num_kernels, grad_out, data_inp, data_lut,
+                    height, width, stride_lut, num_channels,
+                    grad_inp_, grad_lut_);
+            }));
+
+        AT_CUDA_CHECK(cudaGetLastError());
+    }
+}
+
+
+void AiLutTransformForwardCUDAKernelLauncher(
+    const torch::Tensor &input, const torch::Tensor &lut,
+    const torch::Tensor &vertices, torch::Tensor output) {
+
+    /* tensor check
+       input: (b,3,h,w); lut: (b,m,d,d,d), vertices: (b,3,d), output: (b,m,h,w)
+     */
+    ailut_transform_sanity_check(input, lut, vertices, output);
+
+    c10::cuda::CUDAGuard device_guard(input.device());
+
+    /* retrieve some meta-information of the input tensors */
+    int batch_size = input.size(0);
+    int height     = input.size(2);
+    int width      = input.size(3);
+
+    int num_channels = lut.size(1);
+    int stride_lut   = lut.size(2);
+
+    int num_kernels = height * width;
+    for (int elt = 0; elt < batch_size; ++elt) {
+
+        /* launch the CUDA kernel */
+        AT_DISPATCH_FLOATING_TYPES(
+            input.scalar_type(), "ailut_transform_cuda_forward", ([&] {
+                const scalar_t *data_inp = input[elt].data_ptr<scalar_t>();
+                const scalar_t *data_lut = lut[elt].data_ptr<scalar_t>();
+                const scalar_t *data_anc = vertices[elt].data_ptr<scalar_t>();
+                scalar_t *data_col = output[elt].data_ptr<scalar_t>();
+
+                ailut_transform_3d_cuda_forward_kernel<<<GET_BLOCKS(num_kernels),
+                                                    THREADS_PER_BLOCK, 0,
+                                                    at::cuda::getCurrentCUDAStream()>>>(
+                    num_kernels, data_inp, data_lut, data_anc,
+                    height, width, stride_lut, num_channels,
+                    data_col);
+            }));
+
+        AT_CUDA_CHECK(cudaGetLastError());
+    }
+}
+
+
+void AiLutTransformBackwardCUDAKernelLauncher(
+    const torch::Tensor &grad_output, const torch::Tensor &input,
+    const torch::Tensor &lut, const torch::Tensor &vertices,
+    torch::Tensor grad_inp, torch::Tensor grad_lut, torch::Tensor grad_ver) {
+
+    /* tensor check
+       input: (b,3,h,w); lut: (b,m,d,d,d), vertices: (b,3,d), output: (b,m,h,w)
+     */
+    ailut_transform_sanity_check(grad_inp, grad_lut, grad_ver, grad_output);
+
+    c10::cuda::CUDAGuard device_guard(grad_output.device());
+
+    /* retrieve some meta-information of the input tensors */
+    int batch_size = input.size(0);
+    int height     = input.size(2);
+    int width      = input.size(3);
+
+    int num_channels = lut.size(1);
+    int stride_lut   = lut.size(2);
+
+    int num_kernels = height * width;
+    for (int elt = 0; elt < batch_size; ++elt) {
+
+        /* launch the CUDA kernel */
+        AT_DISPATCH_FLOATING_TYPES(
+            input.scalar_type(), "ailut_transform_cuda_backward", ([&] {
+                const scalar_t *grad_out = grad_output[elt].data_ptr<scalar_t>();
+                const scalar_t *data_inp = input[elt].data_ptr<scalar_t>();
+                const scalar_t *data_lut = lut[elt].data_ptr<scalar_t>();
+                const scalar_t *data_anc = vertices[elt].data_ptr<scalar_t>();
+                scalar_t *grad_inp_  = grad_inp[elt].data_ptr<scalar_t>();
+                scalar_t *grad_lut_ = grad_lut[elt].data_ptr<scalar_t>();
+                scalar_t *grad_ver_ = grad_ver[elt].data_ptr<scalar_t>();
+
+                ailut_transform_3d_cuda_backward_kernel<<<GET_BLOCKS(num_kernels),
+                                                    THREADS_PER_BLOCK, 0,
+                                                    at::cuda::getCurrentCUDAStream()>>>(
+                    num_kernels, grad_out, data_inp, data_lut, data_anc,
+                    height, width, stride_lut, num_channels,
+                    grad_inp_, grad_lut_, grad_ver_);
+            }));
+
+        AT_CUDA_CHECK(cudaGetLastError());
+    }
+}
diff --git a/modelscope/ops/ailut/__init__.py b/modelscope/ops/ailut/__init__.py
new file mode 100644
index 00000000..8ea630dd
--- /dev/null
+++ b/modelscope/ops/ailut/__init__.py
@@ -0,0 +1,3 @@
+from .pyinterfaces import ailut_transform, lut_transform
+
+__all__ = ['ailut_transform', 'lut_transform']
diff --git a/modelscope/ops/ailut/pyinterfaces.py b/modelscope/ops/ailut/pyinterfaces.py
new file mode 100644
index 00000000..e9299cf8
--- /dev/null
+++ b/modelscope/ops/ailut/pyinterfaces.py
@@ -0,0 +1,130 @@
+import os.path as osp
+from typing import Tuple
+
+import torch
+from torch.cuda.amp import custom_bwd, custom_fwd
+from torch.utils.cpp_extension import load
+
+try:
+    from cudaops_ailut import (ailut_cbackward, ailut_cforward, lut_cbackward,
+                               lut_cforward)
+except ImportError:
+    CUR_DIR = osp.abspath(osp.dirname(__file__))
+    cudaops_ailut = load(
+        name='cudaops_ailut',
+        sources=[
+            osp.join(CUR_DIR, 'Ailut', 'csrc/ailut_transform.cpp'),
+            osp.join(CUR_DIR, 'Ailut', 'csrc/ailut_transform_cpu.cpp'),
+            osp.join(CUR_DIR, 'Ailut', 'csrc/ailut_transform_cuda.cu')
+        ],
+        verbose=True)
+    from cudaops_ailut import (ailut_cbackward, ailut_cforward, lut_cbackward,
+                               lut_cforward)
+
+
+class LUTTransformFunction(torch.autograd.Function):
+
+    @staticmethod
+    @custom_fwd(cast_inputs=torch.float32)
+    def forward(ctx, img: torch.Tensor, lut: torch.Tensor) -> torch.Tensor:
+
+        img = img.contiguous()
+        lut = lut.contiguous()
+
+        assert img.ndimension() == 4, \
+            'only support 2D image with batch and channel dimensions (4D tensor)'
+        assert lut.ndimension() in [5], \
+            'only support 3D lookup table with batch dimension (5D tensor)'
+
+        output = img.new_zeros(
+            (img.size(0), lut.size(1), img.size(2), img.size(3)))
+        lut_cforward(img, lut, output)
+
+        ctx.save_for_backward(img, lut)
+
+        return output
+
+    @staticmethod
+    @custom_bwd
+    def backward(ctx, grad_output: torch.Tensor) -> Tuple[torch.Tensor]:
+        grad_output = grad_output.contiguous()
+
+        img, lut = ctx.saved_tensors
+
+        grad_img = torch.zeros_like(img)
+        grad_lut = torch.zeros_like(lut)
+
+        lut_cbackward(grad_output, img, lut, grad_img, grad_lut)
+
+        return grad_img, grad_lut
+
+
+class AiLUTTransformFunction(torch.autograd.Function):
+
+    @staticmethod
+    @custom_fwd(cast_inputs=torch.float32)
+    def forward(ctx, img: torch.Tensor, lut: torch.Tensor,
+                vertices: torch.tensor) -> torch.Tensor:
+
+        img = img.contiguous()
+        lut = lut.contiguous()
+        vertices = vertices.contiguous()
+
+        assert img.ndimension() == 4, \
+            'only support 2D image with batch and channel dimensions (4D tensor)'
+        assert lut.ndimension() in [5], \
+            'only support 3D lookup table with batch dimension (5D tensor)'
+        assert vertices.ndimension() == 3, \
+            'only support 1D vertices list with batch and channel dimensions (3D tensor)'
+
+        output = img.new_zeros(
+            (img.size(0), lut.size(1), img.size(2), img.size(3)))
+        ailut_cforward(img, lut, vertices, output)
+
+        ctx.save_for_backward(img, lut, vertices)
+
+        return output
+
+    @staticmethod
+    @custom_bwd
+    def backward(ctx, grad_output: torch.Tensor) -> Tuple[torch.Tensor]:
+
+        grad_output = grad_output.contiguous()
+
+        img, lut, vertices = ctx.saved_tensors
+
+        grad_img = torch.zeros_like(img)
+        grad_lut = torch.zeros_like(lut)
+        grad_ver = torch.zeros_like(vertices)
+
+        ailut_cbackward(grad_output, img, lut, vertices, grad_img, grad_lut,
+                        grad_ver)
+
+        return grad_img, grad_lut, grad_ver
+
+
+def ailut_transform(img: torch.Tensor, lut: torch.Tensor,
+                    vertices: torch.Tensor) -> torch.Tensor:
+    r"""Adaptive Interval 3D Lookup Table Transform (AiLUT-Transform).
+
+    Args:
+        img (torch.Tensor): input image of shape (b, 3, h, w).
+        lut (torch.Tensor): output values of the 3D LUT, shape (b, 3, d, d, d).
+        vertices (torch.Tensor): sampling coordinates along each dimension of
+            the 3D LUT, shape (b, 3, d).
+    Returns:
+        torch.Tensor: transformed image of shape (b, 3, h, w).
+    """
+    return AiLUTTransformFunction.apply(img, lut, vertices)
+
+
+def lut_transform(img: torch.Tensor, lut: torch.Tensor) -> torch.Tensor:
+    r"""Standard 3D Lookup Table Transform.
+
+    Args:
+        img (torch.Tensor): input image of shape (b, 3, h, w).
+        lut (torch.Tensor): output values of the 3D LUT, shape (b, 3, d, d, d).
+    Returns:
+        torch.Tensor: transformed image of shape (b, 3, h, w).
+    """
+    return LUTTransformFunction.apply(img, lut)
diff --git a/modelscope/outputs/nlp_outputs.py b/modelscope/outputs/nlp_outputs.py
index e285d40f..8ed8c94a 100644
--- a/modelscope/outputs/nlp_outputs.py
+++ b/modelscope/outputs/nlp_outputs.py
@@ -172,7 +172,7 @@ class FeatureExtractionOutput(ModelOutputBase):
 
 @dataclass
 class FillMaskModelOutput(ModelOutputBase):
-    """The output class for text classification models.
+    """The output class for fill mask models.
 
     Args:
         logits (`Tensor`): The logits output of the model.
@@ -415,3 +415,18 @@ class DialogueUserSatisfactionEstimationModelOutput(ModelOutputBase):
         logits (`Tensor`): The logits output of the model.
     """
     logits: Tensor = None
+
+
+@dataclass
+class SentencEmbeddingModelOutput(ModelOutputBase):
+    """The output class for text classification models.
+
+    Args:
+        query_embs (`Tensor`, *optional*): The tensor of the query embeddings.
+        doc_embs (`Tensor`, *optional*) Then tensor of the doc embeddings.
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*): Sentence Embedding modeling loss.
+    """
+
+    query_embeddings: Tensor = None
+    doc_embeddings: Tensor = None
+    loss: Tensor = None
diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py
index dec5084f..d8217aa0 100644
--- a/modelscope/outputs/outputs.py
+++ b/modelscope/outputs/outputs.py
@@ -147,6 +147,27 @@ TASK_OUTPUTS = {
     Tasks.card_detection:
     [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS],
 
+    # content check result for single sample
+    #   {
+    #       "scores": [0.9] # non sexy probability
+    #   }
+    Tasks.content_check: [OutputKeys.SCORES],
+
+    # image driving perception result for single sample
+    #   {
+    #       "boxes": [
+    #           [x1, y1, x2, y2],
+    #           [x1, y1, x2, y2],
+    #           [x1, y1, x2, y2],
+    #           [x1, y1, x2, y2],
+    #       ],
+    #       "masks": [
+    #            [np.array], # with fixed shape(h=720, w=1280, 3) containing only 0, 1
+    #            [np.array], # with fixed shape(h=720, w=1280, 3) containing only 0, 1
+    #       ]
+    #   }
+    Tasks.image_driving_perception: [OutputKeys.BOXES, OutputKeys.MASKS],
+
     # facial expression recognition result for single sample
     #   {
     #       "scores": [0.9]
@@ -154,6 +175,13 @@ TASK_OUTPUTS = {
     #   }
     Tasks.face_liveness: [OutputKeys.SCORES, OutputKeys.BOXES],
 
+    # face quality assessment for single sample
+    #   {
+    #       "scores": [0.9]
+    #       "boxes": [x1, y1, x2, y2]
+    #   }
+    Tasks.face_quality_assessment: [OutputKeys.SCORES, OutputKeys.BOXES],
+
     # facial expression recognition result for single sample
     #   {
     #       "scores": [0.9, 0.1, 0.02, 0.02, 0.02, 0.02, 0.02],
@@ -173,15 +201,6 @@ TASK_OUTPUTS = {
         OutputKeys.KEYPOINTS
     ],
 
-    # facial landmark confidence result for single sample
-    #   {
-    #       "output_img": np.array with shape(h, w, 3) (output_img = aligned_img)
-    #       "scores": [0.85]
-    #       "keypoints": [x1, y1, x2, y2, x3, y3, x4, y4]
-    #       "boxes": [x1, y1, x2, y2]
-    #   }
-    Tasks.facial_landmark_confidence:
-    [OutputKeys.SCORES, OutputKeys.KEYPOINTS, OutputKeys.BOXES],
     # face attribute recognition result for single sample
     #   {
     #       "scores": [[0.9, 0.1], [0.92, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]
@@ -195,13 +214,6 @@ TASK_OUTPUTS = {
     #   }
     Tasks.face_recognition: [OutputKeys.IMG_EMBEDDING],
 
-    # face recognition ood result for single sample
-    #   {
-    #       "img_embedding": np.array with shape [1, D],
-    #       "ood_score ": [0.95]
-    #   }
-    Tasks.face_recognition_ood: [OutputKeys.IMG_EMBEDDING, OutputKeys.SCORES],
-
     # human detection result for single sample
     #   {
     #       "scores": [0.9, 0.1, 0.05, 0.05]
@@ -243,6 +255,8 @@ TASK_OUTPUTS = {
     [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
     Tasks.domain_specific_object_detection:
     [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
+    Tasks.open_vocabulary_detection:
+    [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
 
     # video object detection result for single sample
     #   {
@@ -270,6 +284,12 @@ TASK_OUTPUTS = {
     Tasks.video_object_detection:
     [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
 
+    # 3d object detection result for single sample
+    # {
+    #   "output_img": np.array with shape(h, w, 3)
+    # }
+    Tasks.object_detection_3d: [OutputKeys.OUTPUT_IMG],
+
     # instance segmentation result for single sample
     #   {
     #       "scores": [0.9, 0.1, 0.05, 0.05],
@@ -281,6 +301,36 @@ TASK_OUTPUTS = {
     Tasks.image_segmentation:
     [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.MASKS],
 
+    # video panoptic segmentation result for single sample
+    #         "scores": [[0.8, 0.25, 0.05, 0.05], [0.9, 0.1, 0.05, 0.05]]
+    #         "labels": [["person", "traffic light", "car", "bus"],
+    #                     ["person", "traffic light", "car", "bus"]]
+    #       "masks": [ #array containing only 0, 1
+    #           [np.array, np.array, np.array, np.array],
+    #           [np.array, np.array, np.array, np.array],
+    #       ]
+    #       "boxes":
+    #          [
+    #              [
+    #                [x1, y1, x2, y2],
+    #                [x1, y1, x2, y2],
+    #                [x1, y1, x2, y2],
+    #                [x1, y1, x2, y2],
+    #              ],
+    #              [
+    #                [x1, y1, x2, y2],
+    #                [x1, y1, x2, y2],
+    #                [x1, y1, x2, y2],
+    #                [x1, y1, x2, y2],
+    #               ]
+    #           ],
+    #       "uuid": [[0, 1, 2, 3],[0, 1, 2, 3]]
+    #   }
+    Tasks.video_panoptic_segmentation: [
+        OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.MASKS,
+        OutputKeys.BOXES, OutputKeys.UUID
+    ],
+
     # semantic segmentation result for single sample
     #   {
     #       "masks": [np.array # 2D array with shape [height, width]]
@@ -294,6 +344,11 @@ TASK_OUTPUTS = {
     #                 , shape(h, w) for crowd counting
     # }
     Tasks.portrait_matting: [OutputKeys.OUTPUT_IMG],
+    Tasks.universal_matting: [OutputKeys.OUTPUT_IMG],
+
+    # image_quality_assessment_mos result for a single image is a score in range [0, 1]
+    # {0.5}
+    Tasks.image_quality_assessment_mos: [OutputKeys.SCORE],
 
     # image editing task result for a single image
     # {"output_img": np.array with shape (h, w, 3)}
@@ -305,6 +360,7 @@ TASK_OUTPUTS = {
     Tasks.image_portrait_enhancement: [OutputKeys.OUTPUT_IMG],
     Tasks.crowd_counting: [OutputKeys.SCORES, OutputKeys.OUTPUT_IMG],
     Tasks.image_inpainting: [OutputKeys.OUTPUT_IMG],
+    Tasks.image_paintbyexample: [OutputKeys.OUTPUT_IMG],
 
     # image generation task result for a single image
     # {"output_img": np.array with shape (h, w, 3)}
@@ -318,6 +374,17 @@ TASK_OUTPUTS = {
     # {"output_video": "path_to_rendered_video"}
     Tasks.video_frame_interpolation: [OutputKeys.OUTPUT_VIDEO],
     Tasks.video_super_resolution: [OutputKeys.OUTPUT_VIDEO],
+    Tasks.video_deinterlace: [OutputKeys.OUTPUT_VIDEO],
+    Tasks.nerf_recon_acc: [OutputKeys.OUTPUT_VIDEO],
+    Tasks.video_colorization: [OutputKeys.OUTPUT_VIDEO],
+
+    # image quality assessment degradation result for single image
+    # {
+    #       "scores": [0.885272, 0.014790631, 0.014558001]
+    #       "labels": ['噪声强度', '模糊程度', '压缩强度'],
+    # }
+    Tasks.image_quality_assessment_degradation:
+    [OutputKeys.SCORES, OutputKeys.LABELS],
 
     # live category recognition result for single video
     # {
@@ -350,8 +417,9 @@ TASK_OUTPUTS = {
     #               [x1, y1, x2, y2],
     #             ]
     # }
-    Tasks.body_2d_keypoints:
-    [OutputKeys.KEYPOINTS, OutputKeys.SCORES, OutputKeys.BOXES],
+    Tasks.body_2d_keypoints: [
+        OutputKeys.KEYPOINTS, OutputKeys.SCORES, OutputKeys.BOXES
+    ],
 
     # 3D human body keypoints detection result for single sample
     # {
@@ -372,6 +440,21 @@ TASK_OUTPUTS = {
         OutputKeys.KEYPOINTS, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO
     ],
 
+    # 3D face reconstruction result for single sample
+    # {
+    #     "output": {
+    #         "vertices": np.array with shape(n, 3),
+    #         "faces": np.array with shape(n, 3),
+    #         "faces_uv": np.array with shape(n, 3),
+    #         "faces_normal": np.array with shape(n, 3),
+    #         "colors": np.array with shape(n, 3),
+    #         "UVs": np.array with shape(n, 2),
+    #         "normals": np.array with shape(n, 3),
+    #         "texture_map": np.array with shape(h, w, 3),
+    #     }
+    # }
+    Tasks.face_reconstruction: [OutputKeys.OUTPUT],
+
     # 2D hand keypoints result for single sample
     # {
     #     "keypoints": [
@@ -403,14 +486,26 @@ TASK_OUTPUTS = {
     # video multi object tracking result for single video
     # {
     #   "boxes": [
-    #               [frame_num, obj_id, x1, y1, x2, y2],
-    #               [frame_num, obj_id, x1, y1, x2, y2],
-    #               [frame_num, obj_id, x1, y1, x2, y2],
+    #               [
+    #                   [x1, y1, x2, y2],
+    #                   [x1, y1, x2, y2],
+    #                   ...
+    #               ],
+    #               [
+    #                   [x1, y1, x2, y2],
+    #                   [x1, y1, x2, y2],
+    #                   ...
+    #               ],
+    #               [
+    #                   [x1, y1, x2, y2]
+    #                   ...
+    #               ]
     #             ],
+    #   "labels": [[obj_id0, obj_id1, ...], [obj_id1, obj_id2, ...], [obj_id3, ...]],
     #   "timestamps": ["hh:mm:ss", "hh:mm:ss", "hh:mm:ss"]
     # }
     Tasks.video_multi_object_tracking: [
-        OutputKeys.BOXES, OutputKeys.TIMESTAMPS
+        OutputKeys.BOXES, OutputKeys.LABELS, OutputKeys.TIMESTAMPS
     ],
 
     # live category recognition result for single video
@@ -534,8 +629,9 @@ TASK_OUTPUTS = {
     # video human matting result for a single video
     #   {
     #       "masks": [np.array # 2D array with shape [height, width]]
+    #       "output_video": "path_to_matting_video"
     #   }
-    Tasks.video_human_matting: [OutputKeys.MASKS],
+    Tasks.video_human_matting: [OutputKeys.MASKS, OutputKeys.OUTPUT_VIDEO],
 
     # ============ nlp tasks ===================
 
@@ -622,6 +718,12 @@ TASK_OUTPUTS = {
     # }
     Tasks.text_generation: [OutputKeys.TEXT],
 
+    # fid dialogue result for single sample
+    # {
+    #   "text": "My name is Mike"
+    # }
+    Tasks.fid_dialogue: [OutputKeys.TEXT],
+
     # summarization result for single sample
     # {
     #   "text": "this is the text generated by a model."
@@ -743,6 +845,18 @@ TASK_OUTPUTS = {
     # punctuation result for single sample
     # { "text": "你好，明天！"}
     Tasks.punctuation: [OutputKeys.TEXT],
+    # language model result for single sample
+    # { "text": " hel@@ lo 大 家 好 呀 </s>
+    #               p( hel@@ | <s> ) = 0.00057767 [ -7.45650959 ]
+    #               p( lo | hel@@ ) = 0.99832278 [ -0.00167861 ]
+    #               p( 大 | lo ) = 0.49116334 [ -0.71097857 ]
+    #               p( 家 | 大 ) = 0.99691027 [ -0.00309453 ]
+    #               p( 好 | 家 ) = 0.97999156 [ -0.02021134 ]
+    #               p( 呀 | 好 ) = 0.00461205 [ -5.37908363 ]
+    #               p( </s> | 呀 ) = 0.01524554 [ -4.18346834 ]
+    #           logprob= -17.755 ppl= 12.6345
+    # "}
+    Tasks.language_model: [OutputKeys.TEXT],
 
     # audio processed for single file in PCM format
     # {
@@ -934,6 +1048,29 @@ TASK_OUTPUTS = {
     #       "masks": [np.array # 3D array with shape [frame_num, height, width]]
     #   }
     Tasks.video_object_segmentation: [OutputKeys.MASKS],
+
+    # motion generation result for a single input
+    #   {
+    #       "keypoints": [np.array # 3D array with shape [frame_num, joint_num, 3]]
+    #       "output_video": "path_to_rendered_video"
+    #   }
+    Tasks.motion_generation: [OutputKeys.KEYPOINTS, OutputKeys.OUTPUT_VIDEO],
+
+    # bad image detecting for a single input
+    #   {
+    #       "scores": [0.8, 0.1, 0.1]
+    #       "labels": ["正常", "花屏", "绿屏"],
+    Tasks.bad_image_detecting: [OutputKeys.SCORES, OutputKeys.LABELS],
+
+    # vision efficient tuning result for single sample
+    #   {
+    #       "scores": [0.9, 0.1, 0.05, 0.05]
+    #       "labels": ["dog", "horse", "cow", "cat"],
+    #   }
+    Tasks.vision_efficient_tuning: [OutputKeys.SCORES, OutputKeys.LABELS],
+    Tasks.document_grounded_dialog_generate: [OutputKeys.TEXT],
+    Tasks.document_grounded_dialog_rerank: [OutputKeys.OUTPUT],
+    Tasks.document_grounded_dialog_retrieval: [OutputKeys.OUTPUT],
 }
 
 
diff --git a/modelscope/pipeline_inputs.py b/modelscope/pipeline_inputs.py
index eab76cb3..7dc134d5 100644
--- a/modelscope/pipeline_inputs.py
+++ b/modelscope/pipeline_inputs.py
@@ -68,6 +68,8 @@ TASK_INPUTS = {
     InputType.IMAGE,
     Tasks.face_recognition:
     InputType.IMAGE,
+    Tasks.face_reconstruction:
+    InputType.IMAGE,
     Tasks.human_detection:
     InputType.IMAGE,
     Tasks.face_image_generation:
@@ -84,6 +86,14 @@ TASK_INPUTS = {
     InputType.IMAGE,
     Tasks.image_fewshot_detection:
     InputType.IMAGE,
+    Tasks.open_vocabulary_detection: {
+        'img': InputType.IMAGE,
+        'category_names': InputType.TEXT
+    },
+    Tasks.image_driving_perception:
+    InputType.IMAGE,
+    Tasks.vision_efficient_tuning:
+    InputType.IMAGE,
 
     # image editing task result for a single image
     Tasks.skin_retouching:
@@ -104,10 +114,17 @@ TASK_INPUTS = {
         'img': InputType.IMAGE,
         'mask': InputType.IMAGE,
     },
+    Tasks.image_paintbyexample: {
+        'img': InputType.IMAGE,
+        'mask': InputType.IMAGE,
+        'reference': InputType.IMAGE,
+    },
     Tasks.image_skychange: {
         'sky_image': InputType.IMAGE,
         'scene_image': InputType.IMAGE,
     },
+    Tasks.video_colorization:
+    InputType.VIDEO,
 
     # image generation task result for a single image
     Tasks.image_to_image_generation:
@@ -148,6 +165,8 @@ TASK_INPUTS = {
     InputType.IMAGE,
     Tasks.movie_scene_segmentation:
     InputType.VIDEO,
+    Tasks.bad_image_detecting:
+    InputType.IMAGE,
 
     # ============ nlp tasks ===================
     Tasks.text_classification: [
@@ -184,6 +203,12 @@ TASK_INPUTS = {
     Tasks.text_ranking: (InputType.TEXT, InputType.TEXT),
     Tasks.text_generation:
     InputType.TEXT,
+    Tasks.fid_dialogue: {
+        'history': InputType.TEXT,
+        'knowledge': InputType.TEXT,
+        'bot_profile': InputType.TEXT,
+        'user_profile': InputType.TEXT,
+    },
     Tasks.fill_mask:
     InputType.TEXT,
     Tasks.task_oriented_conversation: {
@@ -209,6 +234,19 @@ TASK_INPUTS = {
         'text': InputType.TEXT,
         'database': InputType.TEXT
     },
+    Tasks.document_grounded_dialog_generate: {
+        'query': InputType.LIST,
+        'context': InputType.LIST,
+        'label': InputType.LIST,
+    },
+    Tasks.document_grounded_dialog_rerank: {
+        'dataset': InputType.LIST
+    },
+    Tasks.document_grounded_dialog_retrieval: {
+        'query': InputType.LIST,
+        'positive': InputType.LIST,
+        'negative': InputType.LIST
+    },
 
     # ============ audio tasks ===================
     Tasks.auto_speech_recognition:
diff --git a/modelscope/pipelines/audio/__init__.py b/modelscope/pipelines/audio/__init__.py
index 3ad32b3d..c38c9762 100644
--- a/modelscope/pipelines/audio/__init__.py
+++ b/modelscope/pipelines/audio/__init__.py
@@ -11,6 +11,7 @@ if TYPE_CHECKING:
     from .linear_aec_pipeline import LinearAECPipeline
     from .text_to_speech_pipeline import TextToSpeechSambertHifiganPipeline
     from .inverse_text_processing_pipeline import InverseTextProcessingPipeline
+    from .speaker_verification_pipeline import SpeakerVerificationPipeline
 else:
     _import_structure = {
         'ans_pipeline': ['ANSPipeline'],
@@ -19,7 +20,8 @@ else:
         'kws_kwsbp_pipeline': ['KeyWordSpottingKwsbpPipeline'],
         'linear_aec_pipeline': ['LinearAECPipeline'],
         'text_to_speech_pipeline': ['TextToSpeechSambertHifiganPipeline'],
-        'itn_inference_pipeline': ['InverseTextProcessingPipeline']
+        'inverse_text_processing_pipeline': ['InverseTextProcessingPipeline'],
+        'speaker_verification_pipeline': ['SpeakerVerificationPipeline']
     }
 
     import sys
diff --git a/modelscope/pipelines/audio/asr_inference_pipeline.py b/modelscope/pipelines/audio/asr_inference_pipeline.py
index 33732cd2..f0288f27 100644
--- a/modelscope/pipelines/audio/asr_inference_pipeline.py
+++ b/modelscope/pipelines/audio/asr_inference_pipeline.py
@@ -1,7 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
-from typing import Any, Dict, List, Sequence, Tuple, Union
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
 
+import json
 import yaml
 
 from modelscope.metainfo import Pipelines
@@ -13,7 +14,8 @@ from modelscope.preprocessors import WavToScp
 from modelscope.utils.audio.audio_utils import (extract_pcm_from_wav,
                                                 generate_scp_from_url,
                                                 load_bytes_from_url)
-from modelscope.utils.constant import Frameworks, Tasks
+from modelscope.utils.constant import Frameworks, ModelFile, Tasks
+from modelscope.utils.hub import snapshot_download
 from modelscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -25,68 +27,189 @@ __all__ = ['AutomaticSpeechRecognitionPipeline']
     Tasks.auto_speech_recognition, module_name=Pipelines.asr_inference)
 class AutomaticSpeechRecognitionPipeline(Pipeline):
     """ASR Inference Pipeline
+    Example:
+
+    >>> from modelscope.pipelines import pipeline
+    >>> from modelscope.utils.constant import Tasks
+
+    >>> inference_pipeline = pipeline(
+    >>>     task=Tasks.auto_speech_recognition,
+    >>>     model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
+
+    >>> rec_result = inference_pipeline(
+    >>>     audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+    >>> print(rec_result)
+
     """
 
     def __init__(self,
                  model: Union[Model, str] = None,
                  preprocessor: WavToScp = None,
+                 vad_model: Optional[Union[Model, str]] = None,
+                 vad_model_revision: Optional[str] = None,
+                 punc_model: Optional[Union[Model, str]] = None,
+                 punc_model_revision: Optional[str] = None,
+                 lm_model: Optional[Union[Model, str]] = None,
+                 lm_model_revision: Optional[str] = None,
                  **kwargs):
-        """use `model` and `preprocessor` to create an asr pipeline for prediction
+        """
+        Use `model` and `preprocessor` to create an asr pipeline for prediction
+        Args:
+            model ('Model' or 'str'):
+                The pipeline handles three types of model:
+
+                - A model instance
+                - A model local dir
+                - A model id in the model hub
+            preprocessor:
+                (list of) Preprocessor object
+            vad_model (Optional: 'Model' or 'str'):
+                voice activity detection model from model hub or local
+                example: 'damo/speech_fsmn_vad_zh-cn-16k-common-pytorch'
+            punc_model (Optional: 'Model' or 'str'):
+                punctuation model from model hub or local
+                example: 'damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch'
+            lm_model (Optional: 'Model' or 'str'):
+                language model from model hub or local
+                example: 'damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch'
+            output_dir('str'):
+                output dir path
+            batch_size('int'):
+                the batch size for inference
+            ngpu('int'):
+                the number of gpus, 0 indicates CPU mode
+            beam_size('int'):
+                beam size for decoding
+            ctc_weight('float'):
+                CTC weight in joint decoding
+            lm_weight('float'):
+                lm weight
+            decoding_ind('int', defaults to 0):
+                decoding ind
+            decoding_mode('str', defaults to 'model1'):
+                decoding mode
+            vad_model_file('str'):
+                vad model file
+            vad_infer_config('str'):
+                VAD infer configuration
+            vad_cmvn_file('str'):
+                global CMVN file
+            punc_model_file('str'):
+                punc model file
+            punc_infer_config('str'):
+                punc infer config
+            param_dict('dict'):
+                extra kwargs
         """
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        self.vad_model = None
+        self.punc_model = None
+        self.lm_model = None
+        if vad_model is not None:
+            if os.path.exists(vad_model):
+                self.vad_model = vad_model
+            else:
+                self.vad_model = snapshot_download(
+                    vad_model, revision=vad_model_revision)
+        if punc_model is not None:
+            if os.path.exists(punc_model):
+                self.punc_model = punc_model
+            else:
+                self.punc_model = snapshot_download(
+                    punc_model, revision=punc_model_revision)
+        if lm_model is not None:
+            if os.path.exists(lm_model):
+                self.lm_model = lm_model
+            else:
+                self.lm_model = snapshot_download(
+                    lm_model, revision=lm_model_revision)
         self.model_cfg = self.model.forward()
 
-        self.output_dir = None
-        if 'output_dir' in kwargs:
-            self.output_dir = kwargs['output_dir']
         self.cmd = self.get_cmd(kwargs)
         if self.cmd['code_base'] == 'funasr':
             from funasr.bin import asr_inference_launch
             self.funasr_infer_modelscope = asr_inference_launch.inference_launch(
                 mode=self.cmd['mode'],
-                batch_size=self.cmd['batch_size'],
                 maxlenratio=self.cmd['maxlenratio'],
                 minlenratio=self.cmd['minlenratio'],
+                batch_size=self.cmd['batch_size'],
                 beam_size=self.cmd['beam_size'],
                 ngpu=self.cmd['ngpu'],
-                num_workers=self.cmd['num_workers'],
                 ctc_weight=self.cmd['ctc_weight'],
                 lm_weight=self.cmd['lm_weight'],
                 penalty=self.cmd['penalty'],
                 log_level=self.cmd['log_level'],
-                cmvn_file=self.cmd['cmvn_file'],
                 asr_train_config=self.cmd['asr_train_config'],
                 asr_model_file=self.cmd['asr_model_file'],
+                cmvn_file=self.cmd['cmvn_file'],
                 lm_file=self.cmd['lm_file'],
-                lm_train_config=self.cmd['lm_train_config'],
-                frontend_conf=self.cmd['frontend_conf'],
+                token_type=self.cmd['token_type'],
+                key_file=self.cmd['key_file'],
+                word_lm_train_config=self.cmd['word_lm_train_config'],
+                bpemodel=self.cmd['bpemodel'],
+                allow_variable_data_keys=self.cmd['allow_variable_data_keys'],
+                output_dir=self.cmd['output_dir'],
+                dtype=self.cmd['dtype'],
+                seed=self.cmd['seed'],
+                ngram_weight=self.cmd['ngram_weight'],
+                nbest=self.cmd['nbest'],
+                num_workers=self.cmd['num_workers'],
+                vad_infer_config=self.cmd['vad_infer_config'],
+                vad_model_file=self.cmd['vad_model_file'],
+                vad_cmvn_file=self.cmd['vad_cmvn_file'],
+                punc_model_file=self.cmd['punc_model_file'],
+                punc_infer_config=self.cmd['punc_infer_config'],
+                outputs_dict=self.cmd['outputs_dict'],
+                param_dict=self.cmd['param_dict'],
                 token_num_relax=self.cmd['token_num_relax'],
                 decoding_ind=self.cmd['decoding_ind'],
                 decoding_mode=self.cmd['decoding_mode'],
-                vad_model_file=self.cmd['vad_model_name'],
-                vad_infer_config=self.cmd['vad_model_config'],
-                vad_cmvn_file=self.cmd['vad_mvn_file'],
-                punc_model_file=self.cmd['punc_model_name'],
-                punc_infer_config=self.cmd['punc_model_config'],
-                output_dir=self.output_dir)
+            )
 
     def __call__(self,
                  audio_in: Union[str, bytes],
                  audio_fs: int = None,
                  recog_type: str = None,
                  audio_format: str = None,
-                 output_dir: str = None) -> Dict[str, Any]:
+                 output_dir: str = None,
+                 param_dict: dict = None) -> Dict[str, Any]:
         from funasr.utils import asr_utils
+        """
+        Decoding the input audios
+        Args:
+            audio_in('str' or 'bytes'):
+                - A string containing a local path to a wav file
+                - A string containing a local path to a scp
+                - A string containing a wav url
+                - A bytes input
+            audio_fs('int'):
+                frequency of sample
+            recog_type('str'):
+                recog type
+            audio_format('str'):
+                audio format
+            output_dir('str'):
+                output dir
+            param_dict('dict'):
+                extra kwargs
+        Return:
+            A dictionary of result or a list of dictionary of result.
+
+            The dictionary contain the following keys:
+            - **text** ('str') --The asr result.
+        """
 
         # code base
         code_base = self.cmd['code_base']
         self.recog_type = recog_type
         self.audio_format = audio_format
-        self.audio_fs = audio_fs
+        self.audio_fs = None
         checking_audio_fs = None
         self.raw_inputs = None
         if output_dir is not None:
             self.cmd['output_dir'] = output_dir
+        self.cmd['param_dict'] = param_dict
+
         if code_base == 'funasr':
             if isinstance(audio_in, str):
                 # for funasr code, generate wav.scp from url or local path
@@ -129,6 +252,10 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
                 self.audio_in, self.audio_format)
             if checking_audio_fs is not None:
                 self.audio_fs = checking_audio_fs
+        if audio_fs is not None:
+            self.cmd['fs']['audio_fs'] = audio_fs
+        else:
+            self.cmd['fs']['audio_fs'] = self.audio_fs
 
         output = self.preprocessor.forward(self.model_cfg, self.recog_type,
                                            self.audio_format, self.audio_in,
@@ -142,14 +269,42 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
             self.preprocessor = WavToScp()
 
         outputs = self.preprocessor.config_checking(self.model_cfg)
-
         # generate asr inference command
         cmd = {
-            'output_dir': None,
-            'model_type': outputs['model_type'],
-            'ngpu': 1,  # 0: only CPU, ngpu>=1: gpu number if cuda is available
+            'maxlenratio': 0.0,
+            'minlenratio': 0.0,
+            'batch_size': 1,
+            'beam_size': 1,
+            'ngpu': 1,
+            'ctc_weight': 0.0,
+            'lm_weight': 0.0,
+            'penalty': 0.0,
             'log_level': 'ERROR',
+            'asr_train_config': None,
             'asr_model_file': outputs['am_model_path'],
+            'cmvn_file': None,
+            'lm_train_config': None,
+            'lm_file': None,
+            'token_type': None,
+            'key_file': None,
+            'word_lm_train_config': None,
+            'bpemodel': None,
+            'allow_variable_data_keys': False,
+            'output_dir': None,
+            'dtype': 'float32',
+            'seed': 0,
+            'ngram_weight': 0.9,
+            'nbest': 1,
+            'num_workers': 0,
+            'vad_infer_config': None,
+            'vad_model_file': None,
+            'vad_cmvn_file': None,
+            'time_stamp_writer': True,
+            'punc_infer_config': None,
+            'punc_model_file': None,
+            'outputs_dict': True,
+            'param_dict': None,
+            'model_type': outputs['model_type'],
             'idx_text': '',
             'sampled_ids': 'seq2seq/sampled_ids',
             'sampled_lengths': 'seq2seq/sampled_lengths',
@@ -157,7 +312,8 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
             'code_base': outputs['code_base'],
             'mode': outputs['mode'],
             'fs': {
-                'model_fs': 16000
+                'model_fs': None,
+                'audio_fs': None
             }
         }
 
@@ -166,13 +322,18 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
             token_num_relax = None
             decoding_ind = None
             decoding_mode = None
+            if os.path.exists(outputs['am_model_config']):
+                config_file = open(
+                    outputs['am_model_config'], encoding='utf-8')
+                root = yaml.full_load(config_file)
+                config_file.close()
+                if 'frontend_conf' in root:
+                    frontend_conf = root['frontend_conf']
             if os.path.exists(outputs['asr_model_config']):
                 config_file = open(
                     outputs['asr_model_config'], encoding='utf-8')
                 root = yaml.full_load(config_file)
                 config_file.close()
-                if 'frontend_conf' in root:
-                    frontend_conf = root['frontend_conf']
                 if 'token_num_relax' in root:
                     token_num_relax = root['token_num_relax']
                 if 'decoding_ind' in root:
@@ -204,60 +365,51 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
             cmd['token_num_relax'] = token_num_relax
             cmd['decoding_ind'] = decoding_ind
             cmd['decoding_mode'] = decoding_mode
-            cmd['num_workers'] = 0
             if outputs.__contains__('mvn_file'):
                 cmd['cmvn_file'] = outputs['mvn_file']
-            else:
-                cmd['cmvn_file'] = None
             if outputs.__contains__('vad_model_name'):
-                cmd['vad_model_name'] = outputs['vad_model_name']
-            else:
-                cmd['vad_model_name'] = None
+                cmd['vad_model_file'] = outputs['vad_model_name']
             if outputs.__contains__('vad_model_config'):
-                cmd['vad_model_config'] = outputs['vad_model_config']
-            else:
-                cmd['vad_model_config'] = None
+                cmd['vad_infer_config'] = outputs['vad_model_config']
             if outputs.__contains__('vad_mvn_file'):
-                cmd['vad_mvn_file'] = outputs['vad_mvn_file']
-            else:
-                cmd['vad_mvn_file'] = None
+                cmd['vad_cmvn_file'] = outputs['vad_mvn_file']
             if outputs.__contains__('punc_model_name'):
-                cmd['punc_model_name'] = outputs['punc_model_name']
-            else:
-                cmd['punc_model_name'] = None
+                cmd['punc_model_file'] = outputs['punc_model_name']
             if outputs.__contains__('punc_model_config'):
-                cmd['punc_model_config'] = outputs['punc_model_config']
-            else:
-                cmd['punc_model_config'] = None
-            if 'batch_size' in extra_args:
-                cmd['batch_size'] = extra_args['batch_size']
-            if 'mode' in extra_args:
-                cmd['mode'] = extra_args['mode']
-            if 'ngpu' in extra_args:
-                cmd['ngpu'] = extra_args['ngpu']
-            if 'beam_size' in extra_args:
-                cmd['beam_size'] = extra_args['beam_size']
-            if 'decoding_ind' in extra_args:
-                cmd['decoding_ind'] = extra_args['decoding_ind']
-            if 'decoding_mode' in extra_args:
-                cmd['decoding_mode'] = extra_args['decoding_mode']
-            if 'vad_model_file' in extra_args:
-                cmd['vad_model_name'] = extra_args['vad_model_file']
-            if 'vad_infer_config' in extra_args:
-                cmd['vad_model_config'] = extra_args['vad_infer_config']
-            if 'vad_cmvn_file' in extra_args:
-                cmd['vad_mvn_file'] = extra_args['vad_cmvn_file']
-            if 'punc_model_file' in extra_args:
-                cmd['punc_model_name'] = extra_args['punc_model_file']
-            if 'punc_infer_config' in extra_args:
-                cmd['punc_model_config'] = extra_args['punc_infer_config']
+                cmd['punc_infer_config'] = outputs['punc_model_config']
+            self.load_vad_model(cmd)
+            self.load_punc_model(cmd)
+            self.load_lm_model(cmd)
+
+            user_args_dict = [
+                'output_dir',
+                'batch_size',
+                'mode',
+                'ngpu',
+                'beam_size',
+                'ctc_weight',
+                'lm_weight',
+                'decoding_ind',
+                'decoding_mode',
+                'vad_model_file',
+                'vad_infer_config',
+                'vad_cmvn_file',
+                'punc_model_file',
+                'punc_infer_config',
+                'param_dict',
+            ]
+
+            for user_args in user_args_dict:
+                if user_args in extra_args and extra_args[
+                        user_args] is not None:
+                    cmd[user_args] = extra_args[user_args]
 
         elif self.framework == Frameworks.tf:
             cmd['fs']['model_fs'] = outputs['model_config']['fs']
             cmd['hop_length'] = outputs['model_config']['hop_length']
             cmd['feature_dims'] = outputs['model_config']['feature_dims']
             cmd['predictions_file'] = 'text'
-            cmd['mvn_file'] = outputs['am_mvn_file']
+            cmd['cmvn_file'] = outputs['am_mvn_file']
             cmd['vocab_file'] = outputs['vocab_file']
             if 'idx_text' in outputs:
                 cmd['idx_text'] = outputs['idx_text']
@@ -271,6 +423,53 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
 
         return cmd
 
+    def load_vad_model(self, cmd):
+        if self.vad_model is not None:
+            logger.info('loading vad model from {0} ...'.format(
+                self.vad_model))
+            config_path = os.path.join(self.vad_model, ModelFile.CONFIGURATION)
+            model_cfg = json.loads(open(config_path).read())
+            model_dir = os.path.dirname(config_path)
+            cmd['vad_model_file'] = os.path.join(
+                model_dir,
+                model_cfg['model']['model_config']['vad_model_name'])
+            cmd['vad_infer_config'] = os.path.join(
+                model_dir,
+                model_cfg['model']['model_config']['vad_model_config'])
+            cmd['vad_cmvn_file'] = os.path.join(
+                model_dir, model_cfg['model']['model_config']['vad_mvn_file'])
+            if 'vad' not in cmd['mode']:
+                cmd['mode'] = cmd['mode'] + '_vad'
+
+    def load_punc_model(self, cmd):
+        if self.punc_model is not None:
+            logger.info('loading punctuation model from {0} ...'.format(
+                self.punc_model))
+            config_path = os.path.join(self.punc_model,
+                                       ModelFile.CONFIGURATION)
+            model_cfg = json.loads(open(config_path).read())
+            model_dir = os.path.dirname(config_path)
+            cmd['punc_model_file'] = os.path.join(
+                model_dir, model_cfg['model']['punc_model_name'])
+            cmd['punc_infer_config'] = os.path.join(
+                model_dir,
+                model_cfg['model']['punc_model_config']['punc_config'])
+            if 'punc' not in cmd['mode']:
+                cmd['mode'] = cmd['mode'] + '_punc'
+
+    def load_lm_model(self, cmd):
+        if self.lm_model is not None:
+            logger.info('loading language model from {0} ...'.format(
+                self.lm_model))
+            config_path = os.path.join(self.lm_model, ModelFile.CONFIGURATION)
+            model_cfg = json.loads(open(config_path).read())
+            model_dir = os.path.dirname(config_path)
+            cmd['lm_file'] = os.path.join(
+                model_dir, model_cfg['model']['model_config']['lm_model_name'])
+            cmd['lm_train_config'] = os.path.join(
+                model_dir,
+                model_cfg['model']['model_config']['lm_model_config'])
+
     def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
         """Decoding
         """
@@ -298,7 +497,6 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
 
         # generate asr inference command
         self.cmd['name_and_type'] = data_cmd
-        self.cmd['fs']['audio_fs'] = inputs['audio_fs']
         self.cmd['raw_inputs'] = self.raw_inputs
         self.cmd['audio_in'] = self.audio_in
 
@@ -318,9 +516,12 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
         # single wav or pcm task
         if inputs['recog_type'] == 'wav':
             if 'asr_result' in inputs and len(inputs['asr_result']) > 0:
-                text = inputs['asr_result'][0]['value']
-                if len(text) > 0:
-                    rst[OutputKeys.TEXT] = text
+                for key, value in inputs['asr_result'][0].items():
+                    if key == 'value':
+                        if len(value) > 0:
+                            rst[OutputKeys.TEXT] = value
+                    elif key != 'key':
+                        rst[key] = value
 
         # run with datasets, and audio format is waveform or kaldi_ark or tfrecord
         elif inputs['recog_type'] != 'wav':
@@ -379,32 +580,10 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
             asr_result = self.funasr_infer_modelscope(
                 data_path_and_name_and_type=cmd['name_and_type'],
                 raw_inputs=cmd['raw_inputs'],
-                output_dir_v2=cmd['output_dir'])
+                output_dir_v2=cmd['output_dir'],
+                fs=cmd['fs'],
+                param_dict=cmd['param_dict'])
 
-        elif self.framework == Frameworks.torch:
-            from easyasr import asr_inference_paraformer_espnet
-
-            if hasattr(asr_inference_paraformer_espnet, 'set_parameters'):
-                asr_inference_paraformer_espnet.set_parameters(
-                    sample_rate=cmd['fs'])
-                asr_inference_paraformer_espnet.set_parameters(
-                    language=cmd['lang'])
-
-            asr_result = asr_inference_paraformer_espnet.asr_inference(
-                batch_size=cmd['batch_size'],
-                maxlenratio=cmd['maxlenratio'],
-                minlenratio=cmd['minlenratio'],
-                beam_size=cmd['beam_size'],
-                ngpu=cmd['ngpu'],
-                ctc_weight=cmd['ctc_weight'],
-                lm_weight=cmd['lm_weight'],
-                penalty=cmd['penalty'],
-                log_level=cmd['log_level'],
-                name_and_type=cmd['name_and_type'],
-                audio_lists=cmd['audio_in'],
-                asr_train_config=cmd['asr_train_config'],
-                asr_model_file=cmd['asr_model_file'],
-                frontend_conf=cmd['frontend_conf'])
         elif self.framework == Frameworks.tf:
             from easyasr import asr_inference_paraformer_tf
             if hasattr(asr_inference_paraformer_tf, 'set_parameters'):
@@ -421,7 +600,7 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
                 idx_text_file=cmd['idx_text'],
                 asr_model_file=cmd['asr_model_file'],
                 vocab_file=cmd['vocab_file'],
-                am_mvn_file=cmd['mvn_file'],
+                am_mvn_file=cmd['cmvn_file'],
                 predictions_file=cmd['predictions_file'],
                 fs=cmd['fs'],
                 hop_length=cmd['hop_length'],
diff --git a/modelscope/pipelines/audio/inverse_text_processing_pipeline.py b/modelscope/pipelines/audio/inverse_text_processing_pipeline.py
index f5282691..4c9ce710 100644
--- a/modelscope/pipelines/audio/inverse_text_processing_pipeline.py
+++ b/modelscope/pipelines/audio/inverse_text_processing_pipeline.py
@@ -28,7 +28,9 @@ class InverseTextProcessingPipeline(Pipeline):
         model (BartForTextErrorCorrection): A model instance, or a model local dir, or a model id in the model hub.
         kwargs (dict, `optional`):
             Extra kwargs passed into the preprocessor's constructor.
-    Example:
+
+    Examples:
+
     >>> from modelscope.pipelines import pipeline
     >>> pipeline_itn = pipeline(
     >>>    task=Tasks.inverse_text_processing, model='damo/speech_inverse_text_processing_fun-text-processing-itn-id')
diff --git a/modelscope/pipelines/audio/kws_farfield_pipeline.py b/modelscope/pipelines/audio/kws_farfield_pipeline.py
index e2f618fa..5bfc31e9 100644
--- a/modelscope/pipelines/audio/kws_farfield_pipeline.py
+++ b/modelscope/pipelines/audio/kws_farfield_pipeline.py
@@ -40,6 +40,10 @@ class KWSFarfieldPipeline(Pipeline):
         self.model.eval()
         frame_size = self.INPUT_CHANNELS * self.SAMPLE_WIDTH
         self._nframe = self.model.size_in // frame_size
+        if 'keyword_map' in kwargs:
+            self._keyword_map = kwargs['keyword_map']
+        else:
+            self._keyword_map = {}
 
     def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]:
         if isinstance(inputs, bytes):
@@ -85,6 +89,10 @@ class KWSFarfieldPipeline(Pipeline):
                 fout.writeframes(result['pcm'])
             if 'kws' in result:
                 result['kws']['offset'] += start_index / self.SAMPLE_RATE
+                result['kws']['type'] = 'wakeup'
+                keyword = result['kws']['keyword']
+                if keyword in self._keyword_map:
+                    result['kws']['keyword'] = self._keyword_map[keyword]
                 kws_list.append(result['kws'])
 
     def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
diff --git a/modelscope/pipelines/audio/linear_aec_pipeline.py b/modelscope/pipelines/audio/linear_aec_pipeline.py
index e1e75ddb..f07b77a5 100644
--- a/modelscope/pipelines/audio/linear_aec_pipeline.py
+++ b/modelscope/pipelines/audio/linear_aec_pipeline.py
@@ -122,10 +122,7 @@ class LinearAECPipeline(Pipeline):
                 'base' the base audio to mask.
 
         Returns:
-            dict:
-                {
-                    'output_pcm': generated audio array
-                }
+            output_pcm: generated audio array
         """
         output_data = self._process(inputs['feature'], inputs['base'])
         output_data = output_data.astype(np.int16).tobytes()
@@ -135,17 +132,12 @@ class LinearAECPipeline(Pipeline):
         r"""The post process. Will save audio to file, if the output_path is given.
 
         Args:
-            inputs: dict:
-                {
-                    'output_pcm': generated audio array
-                }
+            inputs: a dict contains following keys:
+                - output_pcm: generated audio array
             kwargs: accept 'output_path' which is the path to write generated audio
 
         Returns:
-            dict:
-                {
-                    'output_pcm': generated audio array
-                }
+            output_pcm: generated audio array
         """
         if 'output_path' in kwargs.keys():
             wav.write(
diff --git a/modelscope/pipelines/audio/lm_infer_pipeline.py b/modelscope/pipelines/audio/lm_infer_pipeline.py
new file mode 100644
index 00000000..d7275b6b
--- /dev/null
+++ b/modelscope/pipelines/audio/lm_infer_pipeline.py
@@ -0,0 +1,220 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import Any, Dict, Union
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.audio.audio_utils import generate_text_from_url
+from modelscope.utils.config import Config
+from modelscope.utils.constant import Frameworks, ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['LanguageModelPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.language_model, module_name=Pipelines.lm_inference)
+class LanguageModelPipeline(Pipeline):
+    """Language Model Inference Pipeline
+
+    Example:
+    >>> from modelscope.pipelines import pipeline
+    >>> from modelscope.utils.constant import Tasks
+
+    >>> pipeline_lm = pipeline(
+    >>>    task=Tasks.language_model,
+    >>>    model='damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch')
+    >>> text_in='hello 大 家 好 呀'
+    >>> print(pipeline_lm(text_in))
+
+    """
+
+    def __init__(self, model: Union[Model, str] = None, **kwargs):
+        """
+        Use `model` to create a LM pipeline for prediction
+        Args:
+            model ('Model' or 'str'):
+                The pipeline handles three types of model:
+
+                - A model instance
+                - A model local dir
+                - A model id in the model hub
+            output_dir('str'):
+                output dir path
+            batch_size('int'):
+                the batch size for inference
+            ngpu('int'):
+                the number of gpus, 0 indicates CPU mode
+            model_file('str'):
+                LM model file
+            train_config('str'):
+                LM infer configuration
+            num_workers('int'):
+                the number of workers used for DataLoader
+            log_level('str'):
+                log level
+            log_base('float', defaults to 10.0):
+                the base of logarithm for Perplexity
+            split_with_space('bool'):
+                split the input sentence by space
+            seg_dict_file('str'):
+                seg dict file
+            param_dict('dict'):
+                extra kwargs
+        """
+        super().__init__(model=model, **kwargs)
+        config_path = os.path.join(model, ModelFile.CONFIGURATION)
+        self.cmd = self.get_cmd(config_path, kwargs)
+
+        from funasr.bin import lm_inference_launch
+        self.funasr_infer_modelscope = lm_inference_launch.inference_launch(
+            mode=self.cmd['mode'],
+            batch_size=self.cmd['batch_size'],
+            dtype=self.cmd['dtype'],
+            ngpu=self.cmd['ngpu'],
+            seed=self.cmd['seed'],
+            num_workers=self.cmd['num_workers'],
+            log_level=self.cmd['log_level'],
+            key_file=self.cmd['key_file'],
+            train_config=self.cmd['train_config'],
+            model_file=self.cmd['model_file'],
+            log_base=self.cmd['log_base'],
+            split_with_space=self.cmd['split_with_space'],
+            seg_dict_file=self.cmd['seg_dict_file'],
+            output_dir=self.cmd['output_dir'],
+            param_dict=self.cmd['param_dict'])
+
+    def __call__(self,
+                 text_in: str = None,
+                 output_dir: str = None,
+                 param_dict: dict = None) -> Dict[str, Any]:
+        """
+        Compute PPL
+        Args:
+            text_in('str'):
+                - A text str input
+                - A local text file input endswith .txt or .scp
+                - A url text file input
+            output_dir('str'):
+                output dir
+            param_dict('dict'):
+                extra kwargs
+        Return:
+            A dictionary of result or a list of dictionary of result.
+
+            The dictionary contain the following keys:
+            - **text** ('str') --The PPL result.
+        """
+        if len(text_in) == 0:
+            raise ValueError('The input of lm should not be null.')
+        else:
+            self.text_in = text_in
+        if output_dir is not None:
+            self.cmd['output_dir'] = output_dir
+        if param_dict is not None:
+            self.cmd['param_dict'] = param_dict
+
+        output = self.forward(self.text_in)
+        result = self.postprocess(output)
+        return result
+
+    def postprocess(self, inputs: list) -> Dict[str, Any]:
+        """Postprocessing
+        """
+        rst = {}
+        for i in range(len(inputs)):
+            if i == 0:
+                text = inputs[0]['value']
+                if len(text) > 0:
+                    rst[OutputKeys.TEXT] = text
+            else:
+                rst[inputs[i]['key']] = inputs[i]['value']
+        return rst
+
+    def get_cmd(self, config_path, extra_args) -> Dict[str, Any]:
+        # generate inference command
+        model_cfg = Config.from_file(config_path)
+        model_dir = os.path.dirname(config_path)
+        mode = model_cfg.model['model_config']['mode']
+        lm_model_path = os.path.join(
+            model_dir, model_cfg.model['model_config']['lm_model_name'])
+        lm_model_config = os.path.join(
+            model_dir, model_cfg.model['model_config']['lm_model_config'])
+        seg_dict_file = None
+        if 'seg_dict_file' in model_cfg.model['model_config']:
+            seg_dict_file = os.path.join(
+                model_dir, model_cfg.model['model_config']['seg_dict_file'])
+
+        cmd = {
+            'mode': mode,
+            'batch_size': 1,
+            'dtype': 'float32',
+            'ngpu': 1,  # 0: only CPU, ngpu>=1: gpu number if cuda is available
+            'seed': 0,
+            'num_workers': 0,
+            'log_level': 'ERROR',
+            'key_file': None,
+            'train_config': lm_model_config,
+            'model_file': lm_model_path,
+            'log_base': 10.0,
+            'allow_variable_data_keys': False,
+            'split_with_space': True,
+            'seg_dict_file': seg_dict_file,
+            'output_dir': None,
+            'param_dict': None,
+        }
+
+        user_args_dict = [
+            'batch_size',
+            'ngpu',
+            'num_workers',
+            'log_level',
+            'train_config',
+            'model_file',
+            'log_base',
+            'split_with_space',
+            'seg_dict_file',
+            'output_dir',
+            'param_dict',
+        ]
+
+        for user_args in user_args_dict:
+            if user_args in extra_args and extra_args[user_args] is not None:
+                cmd[user_args] = extra_args[user_args]
+
+        return cmd
+
+    def forward(self, text_in: str = None) -> list:
+        """Decoding
+        """
+        logger.info('Compute PPL : {0} ...'.format(text_in))
+        # generate text_in
+        text_file, raw_inputs = generate_text_from_url(text_in)
+        data_cmd = None
+        if raw_inputs is None:
+            data_cmd = [(text_file, 'text', 'text')]
+        elif text_file is None and raw_inputs is not None:
+            data_cmd = None
+
+        self.cmd['name_and_type'] = data_cmd
+        self.cmd['raw_inputs'] = raw_inputs
+        lm_result = self.run_inference(self.cmd)
+
+        return lm_result
+
+    def run_inference(self, cmd):
+        if self.framework == Frameworks.torch:
+            lm_result = self.funasr_infer_modelscope(
+                data_path_and_name_and_type=cmd['name_and_type'],
+                raw_inputs=cmd['raw_inputs'],
+                output_dir_v2=cmd['output_dir'],
+                param_dict=cmd['param_dict'])
+        else:
+            raise ValueError('model type is mismatching')
+
+        return lm_result
diff --git a/modelscope/pipelines/audio/punctuation_processing_pipeline.py b/modelscope/pipelines/audio/punctuation_processing_pipeline.py
index 226717bc..ec1532ea 100644
--- a/modelscope/pipelines/audio/punctuation_processing_pipeline.py
+++ b/modelscope/pipelines/audio/punctuation_processing_pipeline.py
@@ -29,7 +29,7 @@ class PunctuationProcessingPipeline(Pipeline):
         model (PunctuationProcessingPipeline): A model instance, or a model local dir, or a model id in the model hub.
         kwargs (dict, `optional`):
             Extra kwargs passed into the preprocessor's constructor.
-    Example:
+    Examples
     >>> from modelscope.pipelines import pipeline
     >>> pipeline_punc = pipeline(
     >>>    task=Tasks.punctuation, model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch')
@@ -43,33 +43,38 @@ class PunctuationProcessingPipeline(Pipeline):
         """
         super().__init__(model=model, **kwargs)
         self.model_cfg = self.model.forward()
-        self.cmd = self.get_cmd()
-        self.output_dir = None
-        if 'output_dir' in kwargs:
-            self.output_dir = kwargs['output_dir']
+        self.cmd = self.get_cmd(kwargs)
+
         from funasr.bin import punc_inference_launch
         self.funasr_infer_modelscope = punc_inference_launch.inference_launch(
             mode=self.cmd['mode'],
-            ngpu=self.cmd['ngpu'],
-            log_level=self.cmd['log_level'],
-            dtype=self.cmd['dtype'],
-            seed=self.cmd['seed'],
-            output_dir=self.output_dir,
             batch_size=self.cmd['batch_size'],
+            dtype=self.cmd['dtype'],
+            ngpu=self.cmd['ngpu'],
+            seed=self.cmd['seed'],
             num_workers=self.cmd['num_workers'],
+            log_level=self.cmd['log_level'],
             key_file=self.cmd['key_file'],
             train_config=self.cmd['train_config'],
-            model_file=self.cmd['model_file'])
+            model_file=self.cmd['model_file'],
+            output_dir=self.cmd['output_dir'],
+            param_dict=self.cmd['param_dict'])
 
     def __call__(self,
                  text_in: str = None,
-                 output_dir: str = None) -> Dict[str, Any]:
+                 output_dir: str = None,
+                 cache: List[Any] = None,
+                 param_dict: dict = None) -> Dict[str, Any]:
         if len(text_in) == 0:
             raise ValueError('The input of punctuation should not be null.')
         else:
             self.text_in = text_in
         if output_dir is not None:
             self.cmd['output_dir'] = output_dir
+        if cache is not None:
+            self.cmd['cache'] = cache
+        if param_dict is not None:
+            self.cmd['param_dict'] = param_dict
 
         output = self.forward(self.text_in)
         result = self.postprocess(output)
@@ -88,7 +93,7 @@ class PunctuationProcessingPipeline(Pipeline):
                 rst[inputs[i]['key']] = inputs[i]['value']
         return rst
 
-    def get_cmd(self) -> Dict[str, Any]:
+    def get_cmd(self, extra_args) -> Dict[str, Any]:
         # generate inference command
         lang = self.model_cfg['model_config']['lang']
         punc_model_path = self.model_cfg['punc_model_path']
@@ -98,19 +103,39 @@ class PunctuationProcessingPipeline(Pipeline):
         mode = self.model_cfg['model_config']['mode']
         cmd = {
             'mode': mode,
-            'output_dir': None,
             'batch_size': 1,
-            'num_workers': 1,
-            'ngpu': 1,  # 0: only CPU, ngpu>=1: gpu number if cuda is available
-            'log_level': 'ERROR',
             'dtype': 'float32',
+            'ngpu': 1,  # 0: only CPU, ngpu>=1: gpu number if cuda is available
             'seed': 0,
+            'num_workers': 0,
+            'log_level': 'ERROR',
             'key_file': None,
-            'model_file': punc_model_path,
             'train_config': punc_model_config,
-            'lang': lang
+            'model_file': punc_model_path,
+            'output_dir': None,
+            'lang': lang,
+            'cache': None,
+            'param_dict': None,
         }
 
+        user_args_dict = [
+            'batch_size',
+            'dtype',
+            'ngpu',
+            'seed',
+            'num_workers',
+            'log_level',
+            'train_config',
+            'model_file',
+            'output_dir',
+            'lang',
+            'param_dict',
+        ]
+
+        for user_args in user_args_dict:
+            if user_args in extra_args and extra_args[user_args] is not None:
+                cmd[user_args] = extra_args[user_args]
+
         return cmd
 
     def forward(self, text_in: str = None) -> list:
@@ -136,7 +161,9 @@ class PunctuationProcessingPipeline(Pipeline):
             punc_result = self.funasr_infer_modelscope(
                 data_path_and_name_and_type=cmd['name_and_type'],
                 raw_inputs=cmd['raw_inputs'],
-                output_dir_v2=cmd['output_dir'])
+                output_dir_v2=cmd['output_dir'],
+                cache=cmd['cache'],
+                param_dict=cmd['param_dict'])
         else:
             raise ValueError('model type is mismatching')
 
diff --git a/modelscope/pipelines/audio/speaker_verification_light_pipeline.py b/modelscope/pipelines/audio/speaker_verification_light_pipeline.py
new file mode 100644
index 00000000..5cff800a
--- /dev/null
+++ b/modelscope/pipelines/audio/speaker_verification_light_pipeline.py
@@ -0,0 +1,111 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import io
+from typing import Any, Dict, List, Union
+
+import soundfile as sf
+import torch
+
+from modelscope.fileio import File
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import InputModel, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['SpeakerVerificationPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.speaker_verification, module_name=Pipelines.speaker_verification)
+class SpeakerVerificationPipeline(Pipeline):
+    """Speaker Verification Inference Pipeline
+    use `model` to create a Speaker Verification pipeline.
+
+    Args:
+        model (SpeakerVerificationPipeline): A model instance, or a model local dir, or a model id in the model hub.
+        kwargs (dict, `optional`):
+            Extra kwargs passed into the pipeline's constructor.
+    Example:
+    >>> from modelscope.pipelines import pipeline
+    >>> from modelscope.utils.constant import Tasks
+    >>> p = pipeline(
+    >>>    task=Tasks.speaker_verification, model='damo/speech_ecapa-tdnn_sv_en_voxceleb_16k')
+    >>> print(p([audio_1, audio_2]))
+
+    """
+
+    def __init__(self, model: InputModel, **kwargs):
+        """use `model` to create a speaker verification pipeline for prediction
+        Args:
+            model (str): a valid offical model id
+        """
+        super().__init__(model=model, **kwargs)
+        self.model_config = self.model.model_config
+        self.config = self.model.other_config
+        self.thr = self.config['yesOrno_thr']
+
+    def __call__(self,
+                 in_audios: List[str],
+                 thr: float = None) -> Dict[str, Any]:
+        if thr is not None:
+            self.thr = thr
+        if self.thr < -1 or self.thr > 1:
+            raise ValueError(
+                'modelscope error: the thr value should be in [-1, 1], but found to be %f.'
+                % self.thr)
+        outputs = self.preprocess(in_audios)
+        outputs = self.forward(outputs)
+        outputs = self.postprocess(outputs)
+
+        return outputs
+
+    def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        emb1 = self.model(inputs['data1'])
+        emb2 = self.model(inputs['data2'])
+
+        return {'emb1': emb1, 'emb2': emb2}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        score = self.compute_cos_similarity(inputs['emb1'], inputs['emb2'])
+        score = round(score, 5)
+        if score >= self.thr:
+            ans = 'yes'
+        else:
+            ans = 'no'
+
+        return {OutputKeys.SCORE: score, OutputKeys.TEXT: ans}
+
+    def preprocess(self, inputs: List[str],
+                   **preprocess_params) -> Dict[str, Any]:
+        if len(inputs) != 2:
+            raise ValueError(
+                'modelscope error: Two input audio files are required.')
+        output = {}
+        for i in range(len(inputs)):
+            if isinstance(inputs[i], str):
+                file_bytes = File.read(inputs[i])
+                data, fs = sf.read(io.BytesIO(file_bytes), dtype='float32')
+                if len(data.shape) == 2:
+                    data = data[:, 0]
+                if fs != self.model_config['sample_rate']:
+                    raise ValueError(
+                        'modelscope error: Only support %d sample rate files'
+                        % self.model_cfg['sample_rate'])
+                output['data%d' %
+                       (i + 1)] = torch.from_numpy(data).unsqueeze(0)
+            else:
+                raise ValueError(
+                    'modelscope error: The input type is temporarily restricted to audio file address'
+                    % i)
+        return output
+
+    def compute_cos_similarity(self, emb1: torch.Tensor,
+                               emb2: torch.Tensor) -> float:
+        assert len(emb1.shape) == 2 and len(emb2.shape) == 2
+        cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
+        cosine = cos(emb1, emb2)
+        return cosine.item()
diff --git a/modelscope/pipelines/audio/speaker_verification_pipeline.py b/modelscope/pipelines/audio/speaker_verification_pipeline.py
index 2f38cfe3..e2099e2f 100644
--- a/modelscope/pipelines/audio/speaker_verification_pipeline.py
+++ b/modelscope/pipelines/audio/speaker_verification_pipeline.py
@@ -30,12 +30,12 @@ class SpeakerVerificationPipeline(Pipeline):
         model (SpeakerVerificationPipeline): A model instance, or a model local dir, or a model id in the model hub.
         kwargs (dict, `optional`):
             Extra kwargs passed into the preprocessor's constructor.
-    Example:
-    >>> from modelscope.pipelines import pipeline
-    >>> pipeline_punc = pipeline(
-    >>>    task=Tasks.speaker_verification, model='damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch')
-    >>> audio_in=('','')
-    >>> print(pipeline_punc(audio_in))
+    Examples:
+        >>> from modelscope.pipelines import pipeline
+        >>> pipeline_sv = pipeline(
+        >>>    task=Tasks.speaker_verification, model='damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch')
+        >>> audio_in=('','')
+        >>> print(pipeline_sv(audio_in))
 
     """
 
@@ -44,32 +44,40 @@ class SpeakerVerificationPipeline(Pipeline):
         """
         super().__init__(model=model, **kwargs)
         self.model_cfg = self.model.forward()
-        self.cmd = self.get_cmd()
+        self.cmd = self.get_cmd(kwargs)
 
         from funasr.bin import sv_inference_launch
         self.funasr_infer_modelscope = sv_inference_launch.inference_launch(
             mode=self.cmd['mode'],
-            ngpu=self.cmd['ngpu'],
-            log_level=self.cmd['log_level'],
-            dtype=self.cmd['dtype'],
-            seed=self.cmd['seed'],
-            sv_train_config=self.cmd['sv_train_config'],
-            sv_model_file=self.cmd['sv_model_file'],
             output_dir=self.cmd['output_dir'],
             batch_size=self.cmd['batch_size'],
+            dtype=self.cmd['dtype'],
+            ngpu=self.cmd['ngpu'],
+            seed=self.cmd['seed'],
             num_workers=self.cmd['num_workers'],
+            log_level=self.cmd['log_level'],
             key_file=self.cmd['key_file'],
-            model_tag=self.cmd['model_tag'])
+            sv_train_config=self.cmd['sv_train_config'],
+            sv_model_file=self.cmd['sv_model_file'],
+            model_tag=self.cmd['model_tag'],
+            allow_variable_data_keys=self.cmd['allow_variable_data_keys'],
+            streaming=self.cmd['streaming'],
+            embedding_node=self.cmd['embedding_node'],
+            sv_threshold=self.cmd['sv_threshold'],
+            param_dict=self.cmd['param_dict'],
+        )
 
     def __call__(self,
                  audio_in: Union[tuple, str, Any] = None,
-                 output_dir: str = None) -> Dict[str, Any]:
+                 output_dir: str = None,
+                 param_dict: dict = None) -> Dict[str, Any]:
         if len(audio_in) == 0:
-            raise ValueError('The input of ITN should not be null.')
+            raise ValueError('The input of sv should not be null.')
         else:
             self.audio_in = audio_in
         if output_dir is not None:
             self.cmd['output_dir'] = output_dir
+        self.cmd['param_dict'] = param_dict
 
         output = self.forward(self.audio_in)
         result = self.postprocess(output)
@@ -81,17 +89,17 @@ class SpeakerVerificationPipeline(Pipeline):
         rst = {}
         for i in range(len(inputs)):
             if i == 0:
-                if isinstance(self.audio_in, tuple):
+                if isinstance(self.audio_in, tuple) or isinstance(
+                        self.audio_in, list):
                     score = inputs[0]['value']
                     rst[OutputKeys.SCORES] = score
                 else:
                     embedding = inputs[0]['value']
                     rst[OutputKeys.SPK_EMBEDDING] = embedding
-            else:
-                rst[inputs[i]['key']] = inputs[i]['value']
+            rst[inputs[i]['key']] = inputs[i]['value']
         return rst
 
-    def get_cmd(self) -> Dict[str, Any]:
+    def get_cmd(self, extra_args) -> Dict[str, Any]:
         # generate asr inference command
         mode = self.model_cfg['model_config']['mode']
         sv_model_path = self.model_cfg['sv_model_path']
@@ -101,17 +109,39 @@ class SpeakerVerificationPipeline(Pipeline):
         cmd = {
             'mode': mode,
             'output_dir': None,
-            'ngpu': 1,  # 0: only CPU, ngpu>=1: gpu number if cuda is available
             'batch_size': 1,
-            'num_workers': 1,
-            'log_level': 'ERROR',
             'dtype': 'float32',
+            'ngpu': 1,  # 0: only CPU, ngpu>=1: gpu number if cuda is available
             'seed': 0,
+            'num_workers': 0,
+            'log_level': 'ERROR',
             'key_file': None,
             'sv_model_file': sv_model_path,
             'sv_train_config': sv_model_config,
-            'model_tag': None
+            'model_tag': None,
+            'allow_variable_data_keys': True,
+            'streaming': False,
+            'embedding_node': 'resnet1_dense',
+            'sv_threshold': 0.9465,
+            'param_dict': None,
         }
+        user_args_dict = [
+            'output_dir',
+            'batch_size',
+            'ngpu',
+            'embedding_node',
+            'sv_threshold',
+            'log_level',
+            'allow_variable_data_keys',
+            'streaming',
+            'num_workers',
+            'param_dict',
+        ]
+
+        for user_args in user_args_dict:
+            if user_args in extra_args and extra_args[user_args] is not None:
+                cmd[user_args] = extra_args[user_args]
+
         return cmd
 
     def forward(self, audio_in: Union[tuple, str, Any] = None) -> list:
@@ -121,12 +151,31 @@ class SpeakerVerificationPipeline(Pipeline):
             'Speaker Verification Processing: {0} ...'.format(audio_in))
 
         data_cmd, raw_inputs = None, None
-        if isinstance(audio_in, tuple):
+        if isinstance(audio_in, tuple) or isinstance(audio_in, list):
             # generate audio_scp
+            assert len(audio_in) == 2
             if isinstance(audio_in[0], str):
-                audio_scp_1, audio_scp_2 = generate_sv_scp_from_url(audio_in)
-                data_cmd = [(audio_scp_1, 'speech', 'sound'),
-                            (audio_scp_2, 'ref_speech', 'sound')]
+                # for scp inputs
+                if len(audio_in[0].split(',')) == 3 and audio_in[0].split(
+                        ',')[0].endswith('.scp'):
+                    if len(audio_in[1].split(',')) == 3 and audio_in[1].split(
+                            ',')[0].endswith('.scp'):
+                        data_cmd = [
+                            tuple(audio_in[0].split(',')),
+                            tuple(audio_in[1].split(','))
+                        ]
+                # for single-file inputs
+                else:
+                    audio_scp_1, audio_scp_2 = generate_sv_scp_from_url(
+                        audio_in)
+                    if isinstance(audio_scp_1, bytes) and isinstance(
+                            audio_scp_2, bytes):
+                        data_cmd = [(audio_scp_1, 'speech', 'bytes'),
+                                    (audio_scp_2, 'ref_speech', 'bytes')]
+                    else:
+                        data_cmd = [(audio_scp_1, 'speech', 'sound'),
+                                    (audio_scp_2, 'ref_speech', 'sound')]
+            # for raw bytes inputs
             elif isinstance(audio_in[0], bytes):
                 data_cmd = [(audio_in[0], 'speech', 'bytes'),
                             (audio_in[1], 'ref_speech', 'bytes')]
@@ -134,10 +183,20 @@ class SpeakerVerificationPipeline(Pipeline):
                 raise TypeError('Unsupported data type.')
         else:
             if isinstance(audio_in, str):
-                audio_scp = generate_scp_for_sv(audio_in)
-                data_cmd = [(audio_scp, 'speech', 'sound')]
-            elif isinstance(audio_in[0], bytes):
+                # for scp inputs
+                if len(audio_in.split(',')) == 3:
+                    data_cmd = [audio_in.split(',')]
+                # for single-file inputs
+                else:
+                    audio_scp = generate_scp_for_sv(audio_in)
+                    if isinstance(audio_scp, bytes):
+                        data_cmd = [(audio_scp, 'speech', 'bytes')]
+                    else:
+                        data_cmd = [(audio_scp, 'speech', 'sound')]
+            # for raw bytes
+            elif isinstance(audio_in, bytes):
                 data_cmd = [(audio_in, 'speech', 'bytes')]
+            # for ndarray and tensor inputs
             else:
                 import torch
                 import numpy as np
@@ -150,16 +209,17 @@ class SpeakerVerificationPipeline(Pipeline):
 
         self.cmd['name_and_type'] = data_cmd
         self.cmd['raw_inputs'] = raw_inputs
-        punc_result = self.run_inference(self.cmd)
+        result = self.run_inference(self.cmd)
 
-        return punc_result
+        return result
 
     def run_inference(self, cmd):
         if self.framework == Frameworks.torch:
             sv_result = self.funasr_infer_modelscope(
                 data_path_and_name_and_type=cmd['name_and_type'],
                 raw_inputs=cmd['raw_inputs'],
-                output_dir_v2=cmd['output_dir'])
+                output_dir_v2=cmd['output_dir'],
+                param_dict=cmd['param_dict'])
         else:
             raise ValueError('model type is mismatching')
 
diff --git a/modelscope/pipelines/audio/voice_activity_detection_pipeline.py b/modelscope/pipelines/audio/voice_activity_detection_pipeline.py
new file mode 100644
index 00000000..f3248af1
--- /dev/null
+++ b/modelscope/pipelines/audio/voice_activity_detection_pipeline.py
@@ -0,0 +1,225 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import Any, Dict, List, Sequence, Tuple, Union
+
+import json
+from funasr.utils import asr_utils
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.audio.audio_utils import generate_scp_from_url
+from modelscope.utils.constant import Frameworks, ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['VoiceActivityDetectionPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.voice_activity_detection, module_name=Pipelines.vad_inference)
+class VoiceActivityDetectionPipeline(Pipeline):
+    """Voice Activity Detection Inference Pipeline
+    use `model` to create a Voice Activity Detection pipeline.
+
+    Args:
+        model: A model instance, or a model local dir, or a model id in the model hub.
+        kwargs (dict, `optional`):
+            Extra kwargs passed into the preprocessor's constructor.
+
+    Example:
+        >>> from modelscope.pipelines import pipeline
+        >>> pipeline_vad = pipeline(
+        >>>    task=Tasks.voice_activity_detection, model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch')
+        >>> audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.pcm'
+        >>> print(pipeline_vad(audio_in))
+
+    """
+
+    def __init__(self, model: Union[Model, str] = None, **kwargs):
+        """use `model` to create an vad pipeline for prediction
+        """
+        super().__init__(model=model, **kwargs)
+        config_path = os.path.join(model, ModelFile.CONFIGURATION)
+        self.cmd = self.get_cmd(config_path, kwargs)
+
+        from funasr.bin import vad_inference_launch
+        self.funasr_infer_modelscope = vad_inference_launch.inference_launch(
+            mode=self.cmd['mode'],
+            batch_size=self.cmd['batch_size'],
+            dtype=self.cmd['dtype'],
+            ngpu=self.cmd['ngpu'],
+            seed=self.cmd['seed'],
+            num_workers=self.cmd['num_workers'],
+            log_level=self.cmd['log_level'],
+            key_file=self.cmd['key_file'],
+            vad_infer_config=self.cmd['vad_infer_config'],
+            vad_model_file=self.cmd['vad_model_file'],
+            vad_cmvn_file=self.cmd['vad_cmvn_file'])
+
+    def __call__(self,
+                 audio_in: Union[str, bytes],
+                 audio_fs: int = None,
+                 recog_type: str = None,
+                 audio_format: str = None,
+                 output_dir: str = None,
+                 param_dict: dict = None) -> Dict[str, Any]:
+        """
+        Decoding the input audios
+        Args:
+            audio_in('str' or 'bytes'):
+                - A string containing a local path to a wav file
+                - A string containing a local path to a scp
+                - A string containing a wav url
+                - A bytes input
+            audio_fs('int'):
+                frequency of sample
+            recog_type('str'):
+                recog type for wav file or datasets file ('wav', 'test', 'dev', 'train')
+            audio_format('str'):
+                audio format ('pcm', 'scp', 'kaldi_ark', 'tfrecord')
+            output_dir('str'):
+                output dir
+            param_dict('dict'):
+                extra kwargs
+        Return:
+            A dictionary of result or a list of dictionary of result.
+
+            The dictionary contain the following keys:
+            - **text** ('str') --The vad result.
+        """
+        self.recog_type = recog_type
+        self.audio_format = audio_format
+        self.audio_fs = audio_fs
+        checking_audio_fs = None
+        self.raw_inputs = None
+        if output_dir is not None:
+            self.cmd['output_dir'] = output_dir
+        if audio_fs is not None:
+            self.cmd['fs']['audio_fs'] = audio_fs
+        if isinstance(audio_in, str):
+            # for funasr code, generate wav.scp from url or local path
+            self.audio_in, self.raw_inputs = generate_scp_from_url(audio_in)
+        elif isinstance(audio_in, bytes):
+            self.audio_in = audio_in
+            self.raw_inputs = None
+        else:
+            import numpy
+            import torch
+            if isinstance(audio_in, torch.Tensor):
+                self.audio_in = None
+                self.raw_inputs = audio_in
+            elif isinstance(audio_in, numpy.ndarray):
+                self.audio_in = None
+                self.raw_inputs = audio_in
+        if output_dir is not None:
+            self.cmd['output_dir'] = output_dir
+        if param_dict is not None:
+            self.cmd['param_dict'] = param_dict
+
+        # set the sample_rate of audio_in if checking_audio_fs is valid
+        if checking_audio_fs is not None:
+            self.audio_fs = checking_audio_fs
+
+        if recog_type is None or audio_format is None:
+            self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking(
+                audio_in=self.audio_in,
+                recog_type=recog_type,
+                audio_format=audio_format)
+
+        if hasattr(asr_utils,
+                   'sample_rate_checking') and self.audio_in is not None:
+            checking_audio_fs = asr_utils.sample_rate_checking(
+                self.audio_in, self.audio_format)
+            if checking_audio_fs is not None:
+                self.audio_fs = checking_audio_fs
+        output = self.forward(self.audio_in)
+        result = self.postprocess(output)
+        return result
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """Postprocessing
+        """
+        rst = {}
+        for i in range(len(inputs)):
+            if i == 0:
+                text = inputs[0]['value']
+                if len(text) > 0:
+                    rst[OutputKeys.TEXT] = text
+            else:
+                rst[inputs[i]['key']] = inputs[i]['value']
+        return rst
+
+    def get_cmd(self, config_path, extra_args) -> Dict[str, Any]:
+        model_cfg = json.loads(open(config_path).read())
+        model_dir = os.path.dirname(config_path)
+        # generate inference command
+        vad_model_path = os.path.join(
+            model_dir, model_cfg['model']['model_config']['vad_model_name'])
+        vad_model_config = os.path.join(
+            model_dir, model_cfg['model']['model_config']['vad_model_config'])
+        vad_cmvn_file = os.path.join(
+            model_dir, model_cfg['model']['model_config']['vad_mvn_file'])
+        mode = model_cfg['model']['model_config']['mode']
+        cmd = {
+            'mode': mode,
+            'batch_size': 1,
+            'dtype': 'float32',
+            'ngpu': 1,  # 0: only CPU, ngpu>=1: gpu number if cuda is available
+            'seed': 0,
+            'num_workers': 0,
+            'log_level': 'ERROR',
+            'key_file': None,
+            'vad_infer_config': vad_model_config,
+            'vad_model_file': vad_model_path,
+            'vad_cmvn_file': vad_cmvn_file,
+            'output_dir': None,
+            'param_dict': None,
+        }
+
+        user_args_dict = [
+            'output_dir', 'batch_size', 'mode', 'ngpu', 'param_dict',
+            'num_workers'
+        ]
+
+        for user_args in user_args_dict:
+            if user_args in extra_args and extra_args[user_args] is not None:
+                cmd[user_args] = extra_args[user_args]
+
+        return cmd
+
+    def forward(self, audio_in: Dict[str, Any]) -> Dict[str, Any]:
+        """Decoding
+        """
+        logger.info('VAD Processing ...')
+        # generate inputs
+        data_cmd: Sequence[Tuple[str, str, str]]
+        if isinstance(self.audio_in, bytes):
+            data_cmd = [self.audio_in, 'speech', 'bytes']
+        elif isinstance(self.audio_in, str):
+            data_cmd = [self.audio_in, 'speech', 'sound']
+        elif self.raw_inputs is not None:
+            data_cmd = None
+        self.cmd['name_and_type'] = data_cmd
+        self.cmd['raw_inputs'] = self.raw_inputs
+        self.cmd['audio_in'] = self.audio_in
+
+        vad_result = self.run_inference(self.cmd)
+
+        return vad_result
+
+    def run_inference(self, cmd):
+        vad_result = []
+        if self.framework == Frameworks.torch:
+            vad_result = self.funasr_infer_modelscope(
+                data_path_and_name_and_type=cmd['name_and_type'],
+                raw_inputs=cmd['raw_inputs'],
+                output_dir_v2=cmd['output_dir'],
+                param_dict=cmd['param_dict'])
+        else:
+            raise ValueError('model type is mismatching')
+
+        return vad_result
diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py
index be16fd5d..374b05d0 100644
--- a/modelscope/pipelines/base.py
+++ b/modelscope/pipelines/base.py
@@ -22,7 +22,6 @@ from modelscope.utils.device import (create_device, device_placement,
 from modelscope.utils.hub import read_config, snapshot_download
 from modelscope.utils.import_utils import is_tf_available, is_torch_available
 from modelscope.utils.logger import get_logger
-from modelscope.utils.torch_utils import _find_free_port, _is_free_port
 from .util import is_model, is_official_hub_path
 
 if is_torch_available():
@@ -256,7 +255,6 @@ class Pipeline(ABC):
         postprocess_params = kwargs.get('postprocess_params')
 
         # batch data
-        batched_input = {}
         output_list = []
         for i in range(0, len(input), batch_size):
             end = min(i + batch_size, len(input))
@@ -268,13 +266,14 @@ class Pipeline(ABC):
             with device_placement(self.framework, self.device_name):
                 if self.framework == Frameworks.torch:
                     with torch.no_grad():
+                        batched_out = self._batch(preprocessed_list)
                         if self._auto_collate:
-                            out = self._batch(preprocessed_list)
-                            batched_out = self._collate_fn(out)
+                            batched_out = self._collate_fn(batched_out)
                         batched_out = self.forward(batched_out,
                                                    **forward_params)
                 else:
-                    batched_out = self.forward(batched_input, **forward_params)
+                    batched_out = self._batch(preprocessed_list)
+                    batched_out = self.forward(batched_out, **forward_params)
 
             for batch_idx in range(real_batch_size):
                 out = {}
@@ -426,6 +425,7 @@ class DistributedPipeline(Pipeline):
             'master_ip']
         master_port = '29500' if 'master_port' not in kwargs else kwargs[
             'master_port']
+        from modelscope.utils.torch_utils import _find_free_port, _is_free_port
         if not _is_free_port(int(master_port)):
             master_port = str(_find_free_port())
         self.model_pool.map(
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 951d201c..b67a68b5 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -4,7 +4,7 @@ import os
 from typing import List, Optional, Union
 
 from modelscope.hub.snapshot_download import snapshot_download
-from modelscope.metainfo import Pipelines
+from modelscope.metainfo import DEFAULT_MODEL_FOR_PIPELINE, Pipelines
 from modelscope.models.base import Model
 from modelscope.utils.config import ConfigDict, check_config
 from modelscope.utils.constant import DEFAULT_MODEL_REVISION, Invoke, Tasks
@@ -15,272 +15,6 @@ from .util import is_official_hub_path
 
 PIPELINES = Registry('pipelines')
 
-DEFAULT_MODEL_FOR_PIPELINE = {
-    # TaskName: (pipeline_module_name, model_repo)
-    Tasks.sentence_embedding:
-    (Pipelines.sentence_embedding,
-     'damo/nlp_corom_sentence-embedding_english-base'),
-    Tasks.text_ranking: (Pipelines.text_ranking,
-                         'damo/nlp_corom_passage-ranking_english-base'),
-    Tasks.text_ranking: (Pipelines.mgeo_ranking,
-                         'damo/mgeo_address_ranking_chinese_base'),
-    Tasks.word_segmentation:
-    (Pipelines.word_segmentation,
-     'damo/nlp_structbert_word-segmentation_chinese-base'),
-    Tasks.part_of_speech: (Pipelines.part_of_speech,
-                           'damo/nlp_structbert_part-of-speech_chinese-base'),
-    Tasks.token_classification:
-    (Pipelines.part_of_speech,
-     'damo/nlp_structbert_part-of-speech_chinese-base'),
-    Tasks.named_entity_recognition:
-    (Pipelines.named_entity_recognition,
-     'damo/nlp_raner_named-entity-recognition_chinese-base-news'),
-    Tasks.relation_extraction:
-    (Pipelines.relation_extraction,
-     'damo/nlp_bert_relation-extraction_chinese-base'),
-    Tasks.information_extraction:
-    (Pipelines.relation_extraction,
-     'damo/nlp_bert_relation-extraction_chinese-base'),
-    Tasks.sentence_similarity:
-    (Pipelines.sentence_similarity,
-     'damo/nlp_structbert_sentence-similarity_chinese-base'),
-    Tasks.translation: (Pipelines.csanmt_translation,
-                        'damo/nlp_csanmt_translation_zh2en'),
-    Tasks.nli: (Pipelines.nli, 'damo/nlp_structbert_nli_chinese-base'),
-    Tasks.sentiment_classification:
-    (Pipelines.sentiment_classification,
-     'damo/nlp_structbert_sentiment-classification_chinese-base'
-     ),  # TODO: revise back after passing the pr
-    Tasks.portrait_matting: (Pipelines.portrait_matting,
-                             'damo/cv_unet_image-matting'),
-    Tasks.human_detection: (Pipelines.human_detection,
-                            'damo/cv_resnet18_human-detection'),
-    Tasks.image_object_detection: (Pipelines.object_detection,
-                                   'damo/cv_vit_object-detection_coco'),
-    Tasks.image_denoising: (Pipelines.image_denoise,
-                            'damo/cv_nafnet_image-denoise_sidd'),
-    Tasks.image_deblurring: (Pipelines.image_deblur,
-                             'damo/cv_nafnet_image-deblur_gopro'),
-    Tasks.video_stabilization: (Pipelines.video_stabilization,
-                                'damo/cv_dut-raft_video-stabilization_base'),
-    Tasks.video_super_resolution:
-    (Pipelines.video_super_resolution,
-     'damo/cv_realbasicvsr_video-super-resolution_videolq'),
-    Tasks.text_classification:
-    (Pipelines.sentiment_classification,
-     'damo/nlp_structbert_sentiment-classification_chinese-base'),
-    Tasks.text_generation: (Pipelines.text_generation,
-                            'damo/nlp_palm2.0_text-generation_chinese-base'),
-    Tasks.zero_shot_classification:
-    (Pipelines.zero_shot_classification,
-     'damo/nlp_structbert_zero-shot-classification_chinese-base'),
-    Tasks.task_oriented_conversation: (Pipelines.dialog_modeling,
-                                       'damo/nlp_space_dialog-modeling'),
-    Tasks.dialog_state_tracking: (Pipelines.dialog_state_tracking,
-                                  'damo/nlp_space_dialog-state-tracking'),
-    Tasks.table_question_answering:
-    (Pipelines.table_question_answering_pipeline,
-     'damo/nlp-convai-text2sql-pretrain-cn'),
-    Tasks.text_error_correction:
-    (Pipelines.text_error_correction,
-     'damo/nlp_bart_text-error-correction_chinese'),
-    Tasks.image_captioning: (Pipelines.image_captioning,
-                             'damo/ofa_image-caption_coco_large_en'),
-    Tasks.video_captioning:
-    (Pipelines.video_captioning,
-     'damo/multi-modal_hitea_video-captioning_base_en'),
-    Tasks.image_portrait_stylization:
-    (Pipelines.person_image_cartoon,
-     'damo/cv_unet_person-image-cartoon_compound-models'),
-    Tasks.ocr_detection: (Pipelines.ocr_detection,
-                          'damo/cv_resnet18_ocr-detection-line-level_damo'),
-    Tasks.table_recognition:
-    (Pipelines.table_recognition,
-     'damo/cv_dla34_table-structure-recognition_cycle-centernet'),
-    Tasks.document_vl_embedding:
-    (Pipelines.document_vl_embedding,
-     'damo/multi-modal_convnext-roberta-base_vldoc-embedding'),
-    Tasks.license_plate_detection:
-    (Pipelines.license_plate_detection,
-     'damo/cv_resnet18_license-plate-detection_damo'),
-    Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'),
-    Tasks.feature_extraction: (Pipelines.feature_extraction,
-                               'damo/pert_feature-extraction_base-test'),
-    Tasks.action_recognition: (Pipelines.action_recognition,
-                               'damo/cv_TAdaConv_action-recognition'),
-    Tasks.action_detection: (Pipelines.action_detection,
-                             'damo/cv_ResNetC3D_action-detection_detection2d'),
-    Tasks.live_category: (Pipelines.live_category,
-                          'damo/cv_resnet50_live-category'),
-    Tasks.video_category: (Pipelines.video_category,
-                           'damo/cv_resnet50_video-category'),
-    Tasks.multi_modal_embedding: (Pipelines.multi_modal_embedding,
-                                  'damo/multi-modal_clip-vit-base-patch16_zh'),
-    Tasks.generative_multi_modal_embedding:
-    (Pipelines.generative_multi_modal_embedding,
-     'damo/multi-modal_gemm-vit-large-patch14_generative-multi-modal-embedding'
-     ),
-    Tasks.multi_modal_similarity:
-    (Pipelines.multi_modal_similarity,
-     'damo/multi-modal_team-vit-large-patch14_multi-modal-similarity'),
-    Tasks.visual_question_answering:
-    (Pipelines.visual_question_answering,
-     'damo/mplug_visual-question-answering_coco_large_en'),
-    Tasks.video_question_answering:
-    (Pipelines.video_question_answering,
-     'damo/multi-modal_hitea_video-question-answering_base_en'),
-    Tasks.video_embedding: (Pipelines.cmdssl_video_embedding,
-                            'damo/cv_r2p1d_video_embedding'),
-    Tasks.text_to_image_synthesis:
-    (Pipelines.text_to_image_synthesis,
-     'damo/cv_diffusion_text-to-image-synthesis_tiny'),
-    Tasks.body_2d_keypoints: (Pipelines.body_2d_keypoints,
-                              'damo/cv_hrnetv2w32_body-2d-keypoints_image'),
-    Tasks.body_3d_keypoints: (Pipelines.body_3d_keypoints,
-                              'damo/cv_canonical_body-3d-keypoints_video'),
-    Tasks.hand_2d_keypoints:
-    (Pipelines.hand_2d_keypoints,
-     'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody'),
-    Tasks.card_detection: (Pipelines.card_detection,
-                           'damo/cv_resnet_carddetection_scrfd34gkps'),
-    Tasks.face_detection:
-    (Pipelines.mog_face_detection,
-     'damo/cv_resnet101_face-detection_cvpr22papermogface'),
-    Tasks.face_liveness: (Pipelines.face_liveness_ir,
-                          'damo/cv_manual_face-liveness_flir'),
-    Tasks.face_recognition: (Pipelines.face_recognition,
-                             'damo/cv_ir101_facerecognition_cfglint'),
-    Tasks.facial_expression_recognition:
-    (Pipelines.facial_expression_recognition,
-     'damo/cv_vgg19_facial-expression-recognition_fer'),
-    Tasks.facial_landmark_confidence:
-    (Pipelines.facial_landmark_confidence,
-     'damo/cv_manual_facial-landmark-confidence_flcm'),
-    Tasks.face_attribute_recognition:
-    (Pipelines.face_attribute_recognition,
-     'damo/cv_resnet34_face-attribute-recognition_fairface'),
-    Tasks.face_2d_keypoints: (Pipelines.face_2d_keypoints,
-                              'damo/cv_mobilenet_face-2d-keypoints_alignment'),
-    Tasks.video_multi_modal_embedding:
-    (Pipelines.video_multi_modal_embedding,
-     'damo/multi_modal_clip_vtretrival_msrvtt_53'),
-    Tasks.image_color_enhancement:
-    (Pipelines.image_color_enhance,
-     'damo/cv_csrnet_image-color-enhance-models'),
-    Tasks.virtual_try_on: (Pipelines.virtual_try_on,
-                           'damo/cv_daflow_virtual-try-on_base'),
-    Tasks.image_colorization: (Pipelines.ddcolor_image_colorization,
-                               'damo/cv_ddcolor_image-colorization'),
-    Tasks.image_segmentation:
-    (Pipelines.image_instance_segmentation,
-     'damo/cv_swin-b_image-instance-segmentation_coco'),
-    Tasks.image_depth_estimation:
-    (Pipelines.image_depth_estimation,
-     'damo/cv_newcrfs_image-depth-estimation_indoor'),
-    Tasks.indoor_layout_estimation:
-    (Pipelines.indoor_layout_estimation,
-     'damo/cv_panovit_indoor-layout-estimation'),
-    Tasks.video_depth_estimation:
-    (Pipelines.video_depth_estimation,
-     'damo/cv_dro-resnet18_video-depth-estimation_indoor'),
-    Tasks.panorama_depth_estimation:
-    (Pipelines.panorama_depth_estimation,
-     'damo/cv_unifuse_panorama-depth-estimation'),
-    Tasks.image_style_transfer: (Pipelines.image_style_transfer,
-                                 'damo/cv_aams_style-transfer_damo'),
-    Tasks.face_image_generation: (Pipelines.face_image_generation,
-                                  'damo/cv_gan_face-image-generation'),
-    Tasks.image_super_resolution: (Pipelines.image_super_resolution,
-                                   'damo/cv_rrdb_image-super-resolution'),
-    Tasks.image_portrait_enhancement:
-    (Pipelines.image_portrait_enhancement,
-     'damo/cv_gpen_image-portrait-enhancement'),
-    Tasks.product_retrieval_embedding:
-    (Pipelines.product_retrieval_embedding,
-     'damo/cv_resnet50_product-bag-embedding-models'),
-    Tasks.image_to_image_generation:
-    (Pipelines.image_to_image_generation,
-     'damo/cv_latent_diffusion_image2image_generate'),
-    Tasks.image_classification:
-    (Pipelines.daily_image_classification,
-     'damo/cv_vit-base_image-classification_Dailylife-labels'),
-    Tasks.image_object_detection:
-    (Pipelines.image_object_detection_auto,
-     'damo/cv_yolox_image-object-detection-auto'),
-    Tasks.ocr_recognition:
-    (Pipelines.ocr_recognition,
-     'damo/cv_convnextTiny_ocr-recognition-general_damo'),
-    Tasks.skin_retouching: (Pipelines.skin_retouching,
-                            'damo/cv_unet_skin-retouching'),
-    Tasks.faq_question_answering:
-    (Pipelines.faq_question_answering,
-     'damo/nlp_structbert_faq-question-answering_chinese-base'),
-    Tasks.crowd_counting: (Pipelines.crowd_counting,
-                           'damo/cv_hrnet_crowd-counting_dcanet'),
-    Tasks.video_single_object_tracking:
-    (Pipelines.video_single_object_tracking,
-     'damo/cv_vitb_video-single-object-tracking_ostrack'),
-    Tasks.image_reid_person: (Pipelines.image_reid_person,
-                              'damo/cv_passvitb_image-reid-person_market'),
-    Tasks.text_driven_segmentation:
-    (Pipelines.text_driven_segmentation,
-     'damo/cv_vitl16_segmentation_text-driven-seg'),
-    Tasks.movie_scene_segmentation:
-    (Pipelines.movie_scene_segmentation,
-     'damo/cv_resnet50-bert_video-scene-segmentation_movienet'),
-    Tasks.shop_segmentation: (Pipelines.shop_segmentation,
-                              'damo/cv_vitb16_segmentation_shop-seg'),
-    Tasks.image_inpainting: (Pipelines.image_inpainting,
-                             'damo/cv_fft_inpainting_lama'),
-    Tasks.video_inpainting: (Pipelines.video_inpainting,
-                             'damo/cv_video-inpainting'),
-    Tasks.video_human_matting: (Pipelines.video_human_matting,
-                                'damo/cv_effnetv2_video-human-matting'),
-    Tasks.video_frame_interpolation:
-    (Pipelines.video_frame_interpolation,
-     'damo/cv_raft_video-frame-interpolation'),
-    Tasks.human_wholebody_keypoint:
-    (Pipelines.human_wholebody_keypoint,
-     'damo/cv_hrnetw48_human-wholebody-keypoint_image'),
-    Tasks.hand_static: (Pipelines.hand_static,
-                        'damo/cv_mobileface_hand-static'),
-    Tasks.face_human_hand_detection:
-    (Pipelines.face_human_hand_detection,
-     'damo/cv_nanodet_face-human-hand-detection'),
-    Tasks.face_emotion: (Pipelines.face_emotion, 'damo/cv_face-emotion'),
-    Tasks.product_segmentation: (Pipelines.product_segmentation,
-                                 'damo/cv_F3Net_product-segmentation'),
-    Tasks.referring_video_object_segmentation: (
-        Pipelines.referring_video_object_segmentation,
-        'damo/cv_swin-t_referring_video-object-segmentation'),
-    Tasks.video_summarization: (Pipelines.video_summarization,
-                                'damo/cv_googlenet_pgl-video-summarization'),
-    Tasks.image_skychange: (Pipelines.image_skychange,
-                            'damo/cv_hrnetocr_skychange'),
-    Tasks.translation_evaluation: (
-        Pipelines.translation_evaluation,
-        'damo/nlp_unite_mup_translation_evaluation_multilingual_large'),
-    Tasks.video_object_segmentation: (
-        Pipelines.video_object_segmentation,
-        'damo/cv_rdevos_video-object-segmentation'),
-    Tasks.video_multi_object_tracking: (
-        Pipelines.video_multi_object_tracking,
-        'damo/cv_yolov5_video-multi-object-tracking_fairmot'),
-    Tasks.image_multi_view_depth_estimation: (
-        Pipelines.image_multi_view_depth_estimation,
-        'damo/cv_casmvs_multi-view-depth-estimation_general'),
-    Tasks.image_fewshot_detection: (
-        Pipelines.image_fewshot_detection,
-        'damo/cv_resnet101_detection_fewshot-defrcn'),
-    Tasks.image_body_reshaping: (Pipelines.image_body_reshaping,
-                                 'damo/cv_flow-based-body-reshaping_damo'),
-    Tasks.image_face_fusion: (Pipelines.image_face_fusion,
-                              'damo/cv_unet-image-face-fusion_damo'),
-    Tasks.image_matching: (
-        Pipelines.image_matching,
-        'damo/cv_quadtree_attention_image-matching_outdoor'),
-}
-
 
 def normalize_model_input(model, model_revision):
     """ normalize the input model, to ensure that a model str is a valid local path: in other words,
@@ -349,16 +83,15 @@ def pipeline(task: str = None,
         pipeline (obj:`Pipeline`): pipeline object for certain task.
 
     Examples:
-    ```python
-    >>> # Using default model for a task
-    >>> p = pipeline('image-classification')
-    >>> # Using pipeline with a model name
-    >>> p = pipeline('text-classification', model='damo/distilbert-base-uncased')
-    >>> # Using pipeline with a model object
-    >>> resnet = Model.from_pretrained('Resnet')
-    >>> p = pipeline('image-classification', model=resnet)
-    >>> # Using pipeline with a list of model names
-    >>> p = pipeline('audio-kws', model=['damo/audio-tts', 'damo/auto-tts2'])
+        >>> # Using default model for a task
+        >>> p = pipeline('image-classification')
+        >>> # Using pipeline with a model name
+        >>> p = pipeline('text-classification', model='damo/distilbert-base-uncased')
+        >>> # Using pipeline with a model object
+        >>> resnet = Model.from_pretrained('Resnet')
+        >>> p = pipeline('image-classification', model=resnet)
+        >>> # Using pipeline with a list of model names
+        >>> p = pipeline('audio-kws', model=['damo/audio-tts', 'damo/auto-tts2'])
     """
     if task is None and pipeline_name is None:
         raise ValueError('task or pipeline_name is required')
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index c9666398..bf791c82 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -11,6 +11,7 @@ if TYPE_CHECKING:
     from .body_3d_keypoints_pipeline import Body3DKeypointsPipeline
     from .hand_2d_keypoints_pipeline import Hand2DKeypointsPipeline
     from .cmdssl_video_embedding_pipeline import CMDSSLVideoEmbeddingPipeline
+    from .card_detection_pipeline import CardDetectionPipeline
     from .hicossl_video_embedding_pipeline import HICOSSLVideoEmbeddingPipeline
     from .crowd_counting_pipeline import CrowdCountingPipeline
     from .image_detection_pipeline import ImageDetectionPipeline
@@ -21,6 +22,8 @@ if TYPE_CHECKING:
     from .face_recognition_ood_pipeline import FaceRecognitionOodPipeline
     from .arc_face_recognition_pipeline import ArcFaceRecognitionPipeline
     from .mask_face_recognition_pipeline import MaskFaceRecognitionPipeline
+    from .face_recognition_onnx_ir_pipeline import FaceRecognitionOnnxIrPipeline
+    from .face_recognition_onnx_fm_pipeline import FaceRecognitionOnnxFmPipeline
     from .general_recognition_pipeline import GeneralRecognitionPipeline
     from .image_cartoon_pipeline import ImageCartoonPipeline
     from .image_classification_pipeline import GeneralImageClassificationPipeline
@@ -41,6 +44,7 @@ if TYPE_CHECKING:
     from .image_to_image_generate_pipeline import Image2ImageGenerationPipeline
     from .image_to_image_translation_pipeline import Image2ImageTranslationPipeline
     from .image_inpainting_pipeline import ImageInpaintingPipeline
+    from .image_paintbyexample_pipeline import ImagePaintbyexamplePipeline
     from .product_retrieval_embedding_pipeline import ProductRetrievalEmbeddingPipeline
     from .realtime_object_detection_pipeline import RealtimeObjectDetectionPipeline
     from .live_category_pipeline import LiveCategoryPipeline
@@ -49,6 +53,7 @@ if TYPE_CHECKING:
     from .license_plate_detection_pipeline import LicensePlateDetectionPipeline
     from .table_recognition_pipeline import TableRecognitionPipeline
     from .skin_retouching_pipeline import SkinRetouchingPipeline
+    from .face_reconstruction_pipeline import FaceReconstructionPipeline
     from .tinynas_classification_pipeline import TinynasClassificationPipeline
     from .video_category_pipeline import VideoCategoryPipeline
     from .virtual_try_on_pipeline import VirtualTryonPipeline
@@ -70,11 +75,17 @@ if TYPE_CHECKING:
     from .hand_static_pipeline import HandStaticPipeline
     from .referring_video_object_segmentation_pipeline import ReferringVideoObjectSegmentationPipeline
     from .language_guided_video_summarization_pipeline import LanguageGuidedVideoSummarizationPipeline
+    from .vision_efficient_tuning_adapter_pipeline import VisionEfficientTuningAdapterPipeline
+    from .vision_efficient_tuning_prompt_pipeline import VisionEfficientTuningPromptPipeline
+    from .vision_efficient_tuning_prefix_pipeline import VisionEfficientTuningPrefixPipeline
+    from .vision_efficient_tuning_lora_pipeline import VisionEfficientTuningLoRAPipeline
     from .vision_middleware_pipeline import VisionMiddlewarePipeline
     from .video_frame_interpolation_pipeline import VideoFrameInterpolationPipeline
     from .image_skychange_pipeline import ImageSkychangePipeline
+    from .image_driving_perception_pipeline import ImageDrivingPerceptionPipeline
     from .vop_retrieval_pipeline import VopRetrievalPipeline
     from .video_object_segmentation_pipeline import VideoObjectSegmentationPipeline
+    from .video_deinterlace_pipeline import VideoDeinterlacePipeline
     from .image_matching_pipeline import ImageMatchingPipeline
     from .video_stabilization_pipeline import VideoStabilizationPipeline
     from .video_super_resolution_pipeline import VideoSuperResolutionPipeline
@@ -84,7 +95,19 @@ if TYPE_CHECKING:
     from .image_mvs_depth_estimation_pipeline import ImageMultiViewDepthEstimationPipeline
     from .panorama_depth_estimation_pipeline import PanoramaDepthEstimationPipeline
     from .ddcolor_image_colorization_pipeline import DDColorImageColorizationPipeline
+    from .image_structured_model_probing_pipeline import ImageStructuredModelProbingPipeline
+    from .video_colorization_pipeline import VideoColorizationPipeline
     from .image_defrcn_fewshot_pipeline import ImageDefrcnDetectionPipeline
+    from .image_quality_assessment_degradation_pipeline import ImageQualityAssessmentDegradationPipeline
+    from .image_open_vocabulary_detection_pipeline import ImageOpenVocabularyDetectionPipeline
+    from .object_detection_3d_pipeline import ObjectDetection3DPipeline
+    from .ddpm_semantic_segmentation_pipeline import DDPMImageSemanticSegmentationPipeline
+    from .image_inpainting_sdv2_pipeline import ImageInpaintingSDV2Pipeline
+    from .image_quality_assessment_mos_pipeline import ImageQualityAssessmentMosPipeline
+    from .bad_image_detecting_pipeline import BadImageDetecingPipeline
+    from .mobile_image_super_resolution_pipeline import MobileImageSuperResolutionPipeline
+    from .image_human_parsing_pipeline import ImageHumanParsingPipeline
+    from .nerf_recon_acc_pipeline import NeRFReconAccPipeline
 
 else:
     _import_structure = {
@@ -94,6 +117,7 @@ else:
         'body_2d_keypoints_pipeline': ['Body2DKeypointsPipeline'],
         'body_3d_keypoints_pipeline': ['Body3DKeypointsPipeline'],
         'hand_2d_keypoints_pipeline': ['Hand2DKeypointsPipeline'],
+        'card_detection_pipeline': ['CardDetectionPipeline'],
         'cmdssl_video_embedding_pipeline': ['CMDSSLVideoEmbeddingPipeline'],
         'hicossl_video_embedding_pipeline': ['HICOSSLVideoEmbeddingPipeline'],
         'crowd_counting_pipeline': ['CrowdCountingPipeline'],
@@ -105,6 +129,8 @@ else:
         'face_recognition_ood_pipeline': ['FaceRecognitionOodPipeline'],
         'arc_face_recognition_pipeline': ['ArcFaceRecognitionPipeline'],
         'mask_face_recognition_pipeline': ['MaskFaceRecognitionPipeline'],
+        'face_recognition_onnx_ir_pipeline': ['FaceRecognitionOnnxIrPipeline'],
+        'face_recognition_onnx_fm_pipeline': ['FaceRecognitionOnnxFmPipeline'],
         'general_recognition_pipeline': ['GeneralRecognitionPipeline'],
         'image_classification_pipeline':
         ['GeneralImageClassificationPipeline', 'ImageClassificationPipeline'],
@@ -134,25 +160,26 @@ else:
         'realtime_object_detection_pipeline':
         ['RealtimeObjectDetectionPipeline'],
         'live_category_pipeline': ['LiveCategoryPipeline'],
-        'image_to_image_generation_pipeline':
-        ['Image2ImageGenerationPipeline'],
+        'image_to_image_generate_pipeline': ['Image2ImageGenerationPipeline'],
         'image_inpainting_pipeline': ['ImageInpaintingPipeline'],
+        'image_paintbyexample_pipeline': ['ImagePaintbyexamplePipeline'],
         'ocr_detection_pipeline': ['OCRDetectionPipeline'],
         'ocr_recognition_pipeline': ['OCRRecognitionPipeline'],
         'license_plate_detection_pipeline': ['LicensePlateDetectionPipeline'],
         'table_recognition_pipeline': ['TableRecognitionPipeline'],
         'skin_retouching_pipeline': ['SkinRetouchingPipeline'],
+        'face_reconstruction_pipeline': ['FaceReconstructionPipeline'],
         'tinynas_classification_pipeline': ['TinynasClassificationPipeline'],
         'video_category_pipeline': ['VideoCategoryPipeline'],
         'virtual_try_on_pipeline': ['VirtualTryonPipeline'],
         'shop_segmentation_pipleline': ['ShopSegmentationPipeline'],
-        'easycv_pipeline': [
+        'easycv_pipelines': [
             'EasyCVDetectionPipeline',
             'EasyCVSegmentationPipeline',
             'Face2DKeypointsPipeline',
             'HumanWholebodyKeypointsPipeline',
         ],
-        'text_driven_segmentation_pipeline':
+        'text_driven_segmentation_pipleline':
         ['TextDrivenSegmentationPipeline'],
         'movie_scene_segmentation_pipeline':
         ['MovieSceneSegmentationPipeline'],
@@ -164,9 +191,8 @@ else:
         'facial_landmark_confidence_pipeline':
         ['FacialLandmarkConfidencePipeline'],
         'face_processing_base_pipeline': ['FaceProcessingBasePipeline'],
-        'face_attribute_recognition_pipeline': [
-            'FaceAttributeRecognitionPipeline'
-        ],
+        'face_attribute_recognition_pipeline':
+        ['FaceAttributeRecognitionPipeline'],
         'mtcnn_face_detection_pipeline': ['MtcnnFaceDetectionPipeline'],
         'hand_static_pipeline': ['HandStaticPipeline'],
         'referring_video_object_segmentation_pipeline': [
@@ -175,15 +201,31 @@ else:
         'language_guided_video_summarization_pipeline': [
             'LanguageGuidedVideoSummarizationPipeline'
         ],
+        'vision_efficient_tuning_adapter_pipeline': [
+            'VisionEfficientTuningAdapterPipeline'
+        ],
+        'vision_efficient_tuning_prompt_pipeline': [
+            'VisionEfficientTuningPromptPipeline'
+        ],
+        'vision_efficient_tuning_prefix_pipeline': [
+            'VisionEfficientTuningPrefixPipeline'
+        ],
+        'vision_efficient_tuning_lora_pipeline': [
+            'VisionEfficientTuningLoRAPipeline'
+        ],
         'vision_middleware_pipeline': ['VisionMiddlewarePipeline'],
         'video_frame_interpolation_pipeline': [
             'VideoFrameInterpolationPipeline'
         ],
         'image_skychange_pipeline': ['ImageSkychangePipeline'],
+        'image_driving_perception_pipeline': [
+            'ImageDrivingPerceptionPipeline'
+        ],
         'vop_retrieval_pipeline': ['VopRetrievalPipeline'],
         'video_object_segmentation_pipeline': [
             'VideoObjectSegmentationPipeline'
         ],
+        'video_deinterlace_pipeline': ['VideoDeinterlacePipeline'],
         'image_matching_pipeline': ['ImageMatchingPipeline'],
         'video_stabilization_pipeline': ['VideoStabilizationPipeline'],
         'video_super_resolution_pipeline': ['VideoSuperResolutionPipeline'],
@@ -200,7 +242,28 @@ else:
         'ddcolor_image_colorization_pipeline': [
             'DDColorImageColorizationPipeline'
         ],
+        'image_structured_model_probing_pipeline': [
+            'ImageSturcturedModelProbingPipeline'
+        ],
+        'video_colorization_pipeline': ['VideoColorizationPipeline'],
         'image_defrcn_fewshot_pipeline': ['ImageDefrcnDetectionPipeline'],
+        'image_quality_assessment_degradation_pipeline': [
+            'ImageQualityAssessmentDegradationPipeline'
+        ],
+        'image_open_vocabulary_detection_pipeline': [
+            'ImageOpenVocabularyDetectionPipeline'
+        ],
+        'object_detection_3d_pipeline': ['ObjectDetection3DPipeline'],
+        'image_inpainting_sdv2_pipeline': ['ImageInpaintingSDV2Pipeline'],
+        'image_quality_assessment_mos_pipeline': [
+            'ImageQualityAssessmentMosPipeline'
+        ],
+        'mobile_image_super_resolution_pipeline': [
+            'MobileImageSuperResolutionPipeline'
+        ],
+        'bad_image_detecting_pipeline': ['BadImageDetecingPipeline'],
+        'image_human_parsing_pipeline': ['ImageHumanParsingPipeline'],
+        'nerf_recon_acc_pipeline': ['NeRFReconAccPipeline'],
     }
 
     import sys
diff --git a/modelscope/pipelines/cv/bad_image_detecting_pipeline.py b/modelscope/pipelines/cv/bad_image_detecting_pipeline.py
new file mode 100644
index 00000000..8fcee823
--- /dev/null
+++ b/modelscope/pipelines/cv/bad_image_detecting_pipeline.py
@@ -0,0 +1,81 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.bad_image_detecting import BadImageDetecting
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['BadImageDetecingPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.bad_image_detecting, module_name=Pipelines.bad_image_detecting)
+class BadImageDetecingPipeline(Pipeline):
+    """ Image Restoration Pipeline .
+
+    Take bad_image_detecting as an example
+    ```python
+    >>> from modelscope.pipelines import pipeline
+    >>> image_pipeline = pipeline(Tasks.bad_image_detecting, model=model_id)
+    >>> image_pipeline("https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/dogs.jpg")
+
+    ```
+    """
+
+    def __init__(self, model: Union[BadImageDetecting, str], **kwargs):
+        """
+        use `model` and `preprocessor` to create a cv image denoise pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        self.model.eval()
+        self.labels = ['正常', '花屏', '绿屏']
+
+        if torch.cuda.is_available():
+            self._device = torch.device('cuda')
+        else:
+            self._device = torch.device('cpu')
+        logger.info('load bad image detecting model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+
+        img = LoadImage.convert_to_ndarray(input)
+        result = self.preprocessor(img)
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+
+        with torch.no_grad():
+            output = self.model(input)  # output Tensor
+
+        return {'output': output['output']}
+
+    def postprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
+
+        pred = input['output']
+        score = torch.softmax(pred, dim=1).cpu().numpy()
+
+        pred_scores = np.sort(score, axis=1)[0][::-1]
+        pred_labels = np.argsort(score, axis=1)[0][::-1]
+        result = {
+            'pred_score': [score for score in pred_scores],
+            'pred_class': [self.labels[label] for label in pred_labels]
+        }
+
+        outputs = {
+            OutputKeys.SCORES: result['pred_score'],
+            OutputKeys.LABELS: result['pred_class']
+        }
+
+        return outputs
diff --git a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py
index dbd59e97..b873034b 100644
--- a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py
+++ b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py
@@ -16,8 +16,8 @@ from matplotlib.animation import writers
 from matplotlib.ticker import MultipleLocator
 
 from modelscope.metainfo import Pipelines
-from modelscope.models.cv.body_3d_keypoints.body_3d_pose import (
-    BodyKeypointsDetection3D, KeypointsTypes)
+from modelscope.models.cv.body_3d_keypoints.cannonical_pose.body_3d_pose import \
+    KeypointsTypes
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Input, Model, Pipeline, Tensor
@@ -112,11 +112,19 @@ def convert_2_h36m_data(lst_kps, lst_bboxes, joints_nbr=15):
     Tasks.body_3d_keypoints, module_name=Pipelines.body_3d_keypoints)
 class Body3DKeypointsPipeline(Pipeline):
 
-    def __init__(self, model: Union[str, BodyKeypointsDetection3D], **kwargs):
+    def __init__(self, model: str, **kwargs):
         """Human body 3D pose estimation.
 
         Args:
-            model (Union[str, BodyKeypointsDetection3D]): model id on modelscope hub.
+            model (str): model id on modelscope hub.
+            kwargs (dict, `optional`): Extra kwargs passed into the preprocessor's constructor.
+        Example:
+            >>> from modelscope.pipelines import pipeline
+            >>> body_3d_keypoints = pipeline(Tasks.body_3d_keypoints,
+                model='damo/cv_hdformer_body-3d-keypoints_video')
+            >>> test_video_url = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/videos/Walking.54138969.mp4'
+            >>> output = body_3d_keypoints(test_video_url)
+            >>> print(output)
         """
         super().__init__(model=model, **kwargs)
 
@@ -130,6 +138,10 @@ class Body3DKeypointsPipeline(Pipeline):
             model=self.human_body_2d_kps_det_pipeline,
             device='gpu' if torch.cuda.is_available() else 'cpu')
 
+        self.max_frame = self.keypoint_model_3d.cfg.model.INPUT.MAX_FRAME \
+            if hasattr(self.keypoint_model_3d.cfg.model.INPUT, 'MAX_FRAME') \
+            else self.keypoint_model_3d.cfg.model.INPUT.max_frame  # max video frame number to be predicted 3D joints
+
     def preprocess(self, input: Input) -> Dict[str, Any]:
         self.video_url = input
         video_frames = self.read_video_frames(self.video_url)
@@ -139,7 +151,6 @@ class Body3DKeypointsPipeline(Pipeline):
 
         all_2d_poses = []
         all_boxes_with_socre = []
-        max_frame = self.keypoint_model_3d.cfg.model.INPUT.MAX_FRAME  # max video frame number to be predicted 3D joints
         for i, frame in enumerate(video_frames):
             kps_2d = self.human_body_2d_kps_detector(frame)
             if [] == kps_2d.get('boxes'):
@@ -157,7 +168,7 @@ class Body3DKeypointsPipeline(Pipeline):
             all_boxes_with_socre.append(
                 list(np.array(box).reshape(
                     (-1))) + [score])  # construct to list with shape [5]
-            if (i + 1) >= max_frame:
+            if (i + 1) >= self.max_frame:
                 break
 
         all_2d_poses_np = np.array(all_2d_poses).reshape(
@@ -166,10 +177,11 @@ class Body3DKeypointsPipeline(Pipeline):
         all_boxes_np = np.array(all_boxes_with_socre).reshape(
             (len(all_boxes_with_socre), 5))  # [x1, y1, x2, y2, score]
 
+        joint_num = self.keypoint_model_3d.cfg.model.MODEL.IN_NUM_JOINTS \
+            if hasattr(self.keypoint_model_3d.cfg.model.MODEL, 'IN_NUM_JOINTS') \
+            else self.keypoint_model_3d.cfg.model.MODEL.n_joints
         kps_2d_h36m_17 = convert_2_h36m_data(
-            all_2d_poses_np,
-            all_boxes_np,
-            joints_nbr=self.keypoint_model_3d.cfg.model.MODEL.IN_NUM_JOINTS)
+            all_2d_poses_np, all_boxes_np, joints_nbr=joint_num)
         kps_2d_h36m_17 = np.array(kps_2d_h36m_17)
         res = {'success': True, 'input_2d_pts': kps_2d_h36m_17}
         return res
@@ -246,7 +258,6 @@ class Body3DKeypointsPipeline(Pipeline):
             raise Exception('modelscope error: %s cannot get video fps info.' %
                             (video_url))
 
-        max_frame_num = self.keypoint_model_3d.cfg.model.INPUT.MAX_FRAME
         frame_idx = 0
         while True:
             ret, frame = cap.read()
@@ -256,7 +267,7 @@ class Body3DKeypointsPipeline(Pipeline):
                 timestamp_format(seconds=frame_idx / self.fps))
             frame_idx += 1
             frames.append(frame)
-            if frame_idx >= max_frame_num:
+            if frame_idx >= self.max_frame:
                 break
         cap.release()
         return frames
@@ -278,7 +289,8 @@ class Body3DKeypointsPipeline(Pipeline):
                  [12, 13], [9, 10]]  # connection between joints
 
         fig = plt.figure()
-        ax = p3.Axes3D(fig)
+        ax = p3.Axes3D(fig, auto_add_to_figure=False)
+        fig.add_axes(ax)
         x_major_locator = MultipleLocator(0.5)
 
         ax.xaxis.set_major_locator(x_major_locator)
diff --git a/modelscope/pipelines/cv/card_detection_pipeline.py b/modelscope/pipelines/cv/card_detection_pipeline.py
index 67a8cda1..75e7743a 100644
--- a/modelscope/pipelines/cv/card_detection_pipeline.py
+++ b/modelscope/pipelines/cv/card_detection_pipeline.py
@@ -15,60 +15,58 @@ logger = get_logger()
 @PIPELINES.register_module(
     Tasks.card_detection, module_name=Pipelines.card_detection)
 class CardDetectionPipeline(Pipeline):
-    """ Card Detection Pipeline.
+    r""" Card Detection Pipeline.
 
-    Example:
+    Examples:
 
-    ```python
     >>> from modelscope.pipelines import pipeline
 
     >>> detector = pipeline('card-detection', 'damo/cv_resnet_carddetection_scrfd34gkps')
     >>> detector("http://www.modelscope.cn/api/v1/models/damo/cv_resnet_carddetection_scrfd34gkps/repo?Revision=master"
-                 "&FilePath=description/card_detection1.jpg")
-       {
-        "boxes": [
-            [
-            446.9007568359375,
-            36.374977111816406,
-            907.0919189453125,
-            337.439208984375
-            ],
-            [
-            454.3310241699219,
-            336.08477783203125,
-            921.26904296875,
-            641.7871704101562
-            ]
-        ],
-        "keypoints": [
-            [
-            457.34710693359375,
-            339.02044677734375,
-            446.72271728515625,
-            52.899078369140625,
-            902.8200073242188,
-            35.063236236572266,
-            908.5877685546875,
-            325.62030029296875
-            ],
-            [
-            465.2864074707031,
-            642.8411254882812,
-            454.38568115234375,
-            357.4076232910156,
-            902.5343017578125,
-            334.18377685546875,
-            922.0982055664062,
-            621.0704345703125
-            ]
-        ],
-        "scores": [
-            0.9296008944511414,
-            0.9260380268096924
-        ]
-        }
-    >>> #
-    ```
+    >>>             "&FilePath=description/card_detection1.jpg")
+    >>>   {
+    >>>    "boxes": [
+    >>>        [
+    >>>        446.9007568359375,
+    >>>        36.374977111816406,
+    >>>        907.0919189453125,
+    >>>        337.439208984375
+    >>>        ],
+    >>>        [
+    >>>        454.3310241699219,
+    >>>        336.08477783203125,
+    >>>        921.26904296875,
+    >>>        641.7871704101562
+    >>>        ]
+    >>>    ],
+    >>>    "keypoints": [
+    >>>        [
+    >>>        457.34710693359375,
+    >>>        339.02044677734375,
+    >>>        446.72271728515625,
+    >>>        52.899078369140625,
+    >>>         902.8200073242188,
+    >>>        35.063236236572266,
+    >>>        908.5877685546875,
+    >>>        325.62030029296875
+    >>>         ],
+    >>>         [
+    >>>        465.2864074707031,
+    >>>        642.8411254882812,
+    >>>       454.38568115234375,
+    >>>        357.4076232910156,
+    >>>        902.5343017578125,
+    >>>        334.18377685546875,
+    >>>        922.0982055664062,
+    >>>         621.0704345703125
+    >>>        ]
+    >>>    ],
+    >>>    "scores": [
+    >>>        0.9296008944511414,
+    >>>        0.9260380268096924
+    >>>    ]
+    >>>   }
+    >>>
     """
 
     def __init__(self, model: str, **kwargs):
diff --git a/modelscope/pipelines/cv/content_check_pipeline.py b/modelscope/pipelines/cv/content_check_pipeline.py
new file mode 100644
index 00000000..7bf00b95
--- /dev/null
+++ b/modelscope/pipelines/cv/content_check_pipeline.py
@@ -0,0 +1,74 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import PIL
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torchvision import transforms
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_classification, module_name=Pipelines.content_check)
+class ContentCheckPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a content check pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        Example:
+        ContentCheckPipeline can judge whether the picture is pornographic
+
+        ```python
+        >>> from modelscope.pipelines import pipeline
+        >>> cc_func = pipeline('image_classification', 'damo/cv_resnet50_image-classification_cc')
+        >>> cc_func("https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/content_check.jpg")
+        {'scores': [0.2789826989173889], 'labels': 'pornographic'}
+        ```
+        """
+
+        # content check model
+        super().__init__(model=model, **kwargs)
+        self.test_transforms = transforms.Compose([
+            transforms.Resize(224),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            transforms.Normalize(
+                mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        ])
+        logger.info('content check model loaded!')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        img = LoadImage.convert_to_img(input)
+        img = self.test_transforms(img).float()
+        result = {}
+        result['img'] = img
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        img = input['img'].unsqueeze(0)
+        result = self.model(img)
+        score = [1 - F.softmax(result[:, :5])[0][-1].tolist()]
+        if score[0] < 0.5:
+            label = 'pornographic'
+        else:
+            label = 'normal'
+        return {OutputKeys.SCORES: score, OutputKeys.LABELS: label}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/cv/ddcolor_image_colorization_pipeline.py b/modelscope/pipelines/cv/ddcolor_image_colorization_pipeline.py
index 43e505fd..cab7a99e 100644
--- a/modelscope/pipelines/cv/ddcolor_image_colorization_pipeline.py
+++ b/modelscope/pipelines/cv/ddcolor_image_colorization_pipeline.py
@@ -24,9 +24,8 @@ logger = get_logger()
 class DDColorImageColorizationPipeline(Pipeline):
     """ DDColor Image Colorization Pipeline.
 
-    Example:
+    Examples:
 
-    ```python
     >>> from modelscope.pipelines import pipeline
 
     >>> colorizer = pipeline('image-colorization', 'damo/cv_ddcolor_image-colorization')
@@ -80,8 +79,6 @@ class DDColorImageColorizationPipeline(Pipeline):
          [233, 232, 231],
          [233, 232, 231],
          [232, 232, 229]]], dtype=uint8)}
-    >>> #
-    ```
     """
 
     def __init__(self, model: Union[DDColorForImageColorization, str],
diff --git a/modelscope/pipelines/cv/ddpm_semantic_segmentation_pipeline.py b/modelscope/pipelines/cv/ddpm_semantic_segmentation_pipeline.py
new file mode 100644
index 00000000..36a5e80c
--- /dev/null
+++ b/modelscope/pipelines/cv/ddpm_semantic_segmentation_pipeline.py
@@ -0,0 +1,60 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict
+
+import torch
+import torchvision.transforms as T
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.semantic_segmentation,
+    module_name=Pipelines.ddpm_image_semantic_segmentation)
+class DDPMImageSemanticSegmentationPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """use `model` to create a image semantic segmentation pipeline for prediction
+
+        Args:
+            model: model id on modelscope hub
+        """
+        _device = kwargs.pop('device', 'gpu')
+        if torch.cuda.is_available() and _device == 'gpu':
+            self.device = 'gpu'
+        else:
+            self.device = 'cpu'
+        super().__init__(model=model, device=self.device, **kwargs)
+
+        logger.info('Load model done!')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        image = LoadImage.convert_to_img(input)
+        assert image.size[0] == image.size[1], \
+            f'Only square images are supported: ({image.size[0]}, {image.size[1]})'
+
+        infer_transforms = T.Compose(
+            [T.Resize(256), T.ToTensor(), lambda x: 2 * x - 1])
+        image = infer_transforms(image)
+
+        result = {'input_img': image}
+
+        return result
+
+    def forward(self, input: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            output = self.model(input)
+        return output
+
+    def postprocess(self, inputs, **kwargs) -> Dict[str, Any]:
+        mask, out_img = inputs
+        return {OutputKeys.MASKS: mask[0], OutputKeys.OUTPUT_IMG: out_img[0]}
diff --git a/modelscope/pipelines/cv/face_liveness_xc_pipeline.py b/modelscope/pipelines/cv/face_liveness_xc_pipeline.py
new file mode 100644
index 00000000..dbe19be1
--- /dev/null
+++ b/modelscope/pipelines/cv/face_liveness_xc_pipeline.py
@@ -0,0 +1,87 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import onnxruntime
+import PIL
+import torch
+import torch.nn.functional as F
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.face_recognition.align_face import align_face
+from modelscope.models.cv.facial_landmark_confidence import \
+    FacialLandmarkConfidence
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from . import FaceProcessingBasePipeline
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.face_liveness, module_name=Pipelines.face_liveness_xc)
+class FaceLivenessXcPipeline(FaceProcessingBasePipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        FaceLivenessXcPipeline can judge the input face is a real or fake face.
+        use `model` to create a face lievness ir pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        ```python
+        >>> from modelscope.pipelines import pipeline
+        >>> fl_xc = pipeline('face_liveness', 'damo/cv_manual_face-liveness_flxc')
+        >>> fl_xc("https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/face_liveness_xc.png")
+        {'scores': [0.03821974992752075], 'boxes': [[12.569677352905273, 6.428711891174316,
+            94.17887115478516, 106.74441528320312]]}
+        ```
+        """
+        super().__init__(model=model, **kwargs)
+        onnx_path = osp.join(model, ModelFile.ONNX_MODEL_FILE)
+        logger.info(f'loading model from {onnx_path}')
+        self.sess, self.input_node_name, self.out_node_name = self.load_onnx_model(
+            onnx_path)
+        logger.info('load model done')
+
+    def load_onnx_model(self, onnx_path):
+        sess = onnxruntime.InferenceSession(onnx_path)
+        out_node_name = []
+        input_node_name = []
+        for node in sess.get_outputs():
+            out_node_name.append(node.name)
+
+        for node in sess.get_inputs():
+            input_node_name.append(node.name)
+
+        return sess, input_node_name, out_node_name
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        result = super().preprocess(input)
+        img = result['img']
+        img = (img - 127.5) * 0.0078125
+        img = np.expand_dims(img, 0).copy()
+        input_tensor = np.concatenate([img, img, img, img], axis=3)
+        input_tensor = np.transpose(
+            input_tensor, axes=(0, 3, 1, 2)).astype(np.float32)
+        result['input_tensor'] = input_tensor
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        input_feed = {}
+        input_feed[
+            self.input_node_name[0]] = input['input_tensor'].cpu().numpy()
+        result = self.sess.run(self.out_node_name, input_feed=input_feed)
+        scores = [result[0][0][0].tolist()]
+
+        boxes = input['bbox'].cpu().numpy()[np.newaxis, :].tolist()
+        return {OutputKeys.SCORES: scores, OutputKeys.BOXES: boxes}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/cv/face_quality_assessment_pipeline.py b/modelscope/pipelines/cv/face_quality_assessment_pipeline.py
new file mode 100644
index 00000000..4969696f
--- /dev/null
+++ b/modelscope/pipelines/cv/face_quality_assessment_pipeline.py
@@ -0,0 +1,96 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import onnxruntime
+import PIL
+import torch
+import torch.nn.functional as F
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.face_recognition.align_face import align_face
+from modelscope.models.cv.facial_landmark_confidence import \
+    FacialLandmarkConfidence
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.pipelines.util import batch_process
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from . import FaceProcessingBasePipeline
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.face_quality_assessment,
+    module_name=Pipelines.face_quality_assessment)
+class FaceQualityAssessmentPipeline(FaceProcessingBasePipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a face quality assessment pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        Example:
+        FaceQualityAssessmentPipeline can measure the quality of an input face image,
+        the higher output score represents the better quality
+
+        ```python
+        >>> from modelscope.pipelines import pipeline
+        >>> fqa = pipeline('face-quality-assessment', 'damo/cv_manual_face-quality-assessment_fqa')
+        >>> frfm("https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/face_recognition_1.png")
+        {'scores': [0.99949193], 'boxes': [[157.72341918945312, 67.5608139038086,
+            305.8574523925781, 271.25555419921875]]}
+
+        ```
+        """
+        super().__init__(model=model, **kwargs)
+        onnx_path = osp.join(model, ModelFile.ONNX_MODEL_FILE)
+        logger.info(f'loading model from {onnx_path}')
+        self.sess, self.input_node_name, self.out_node_name = self.load_onnx_model(
+            onnx_path)
+        logger.info('load model done')
+
+    def _batch(self, data):
+        return batch_process(self.model, data)
+
+    def load_onnx_model(self, onnx_path):
+        sess = onnxruntime.InferenceSession(onnx_path)
+        out_node_name = []
+        input_node_name = []
+        for node in sess.get_outputs():
+            out_node_name.append(node.name)
+
+        for node in sess.get_inputs():
+            input_node_name.append(node.name)
+
+        return sess, input_node_name, out_node_name
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        result = super().preprocess(input)
+        align_img = result['img']
+        face_img = align_img[:, :, ::-1]  # to rgb
+        face_img = (face_img / 255. - 0.5) / 0.5
+        face_img = np.expand_dims(face_img, 0).copy()
+        face_img = np.transpose(face_img, axes=(0, 3, 1, 2))
+        face_img = face_img.astype(np.float32)
+        result['input_tensor'] = face_img
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        input_feed = {}
+        input_feed[
+            self.input_node_name[0]] = input['input_tensor'].cpu().numpy()
+        result = self.sess.run(self.out_node_name, input_feed=input_feed)
+        assert result is not None
+        scores = [result[0][0][0]]
+        boxes = input['bbox'].cpu().numpy()[np.newaxis, :].tolist()
+        return {OutputKeys.SCORES: scores, OutputKeys.BOXES: boxes}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/cv/face_recognition_onnx_fm_pipeline.py b/modelscope/pipelines/cv/face_recognition_onnx_fm_pipeline.py
new file mode 100644
index 00000000..910bab29
--- /dev/null
+++ b/modelscope/pipelines/cv/face_recognition_onnx_fm_pipeline.py
@@ -0,0 +1,87 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import onnxruntime
+import PIL
+import torch
+import torch.nn.functional as F
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.face_recognition.align_face import align_face
+from modelscope.models.cv.facial_landmark_confidence import \
+    FacialLandmarkConfidence
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from . import FaceProcessingBasePipeline
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.face_recognition, module_name=Pipelines.face_recognition_onnx_fm)
+class FaceRecognitionOnnxFmPipeline(FaceProcessingBasePipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        FaceRecognitionOnnxFmPipeline can extract 512-dim feature of mask or non-masked face image. use `model`
+        to create a face recognition face mask onnx pipeline for prediction.
+
+        Args:
+            model: model id on modelscope hub.
+
+        Examples:
+
+        >>> from modelscope.pipelines import pipeline
+        >>> frfm = pipeline('face-recognition-ood', 'damo/cv_manual_face-recognition_frfm')
+        >>> frfm("https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/face_recognition_1.png")
+        >>> {{'img_embedding': array([[ 0.02276129, -0.00761525, ...,0.05735306]],
+        >>>    dtype=float32)} }
+        """
+        super().__init__(model=model, **kwargs)
+        onnx_path = osp.join(model, ModelFile.ONNX_MODEL_FILE)
+        logger.info(f'loading model from {onnx_path}')
+        self.sess, self.input_node_name, self.out_node_name = self.load_onnx_model(
+            onnx_path)
+        logger.info('load model done')
+
+    def load_onnx_model(self, onnx_path):
+        sess = onnxruntime.InferenceSession(onnx_path)
+        out_node_name = []
+        input_node_name = []
+        for node in sess.get_outputs():
+            out_node_name.append(node.name)
+
+        for node in sess.get_inputs():
+            input_node_name.append(node.name)
+
+        return sess, input_node_name, out_node_name
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        result = super().preprocess(input)
+        align_img = result['img']
+        face_img = align_img[:, :, ::-1]  # to rgb
+        face_img = (face_img / 255. - 0.5) / 0.5
+        face_img = np.expand_dims(face_img, 0).copy()
+        face_img = np.transpose(face_img, axes=(0, 3, 1, 2))
+        face_img = face_img.astype(np.float32)
+        result['input_tensor'] = face_img
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        input_feed = {}
+        input_feed[
+            self.input_node_name[0]] = input['input_tensor'].cpu().numpy()
+        emb = self.sess.run(self.out_node_name, input_feed=input_feed)[0]
+        emb /= np.sqrt(np.sum(emb**2, -1, keepdims=True))  # l2 norm
+        return {OutputKeys.IMG_EMBEDDING: emb}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/cv/face_recognition_onnx_ir_pipeline.py b/modelscope/pipelines/cv/face_recognition_onnx_ir_pipeline.py
new file mode 100644
index 00000000..8c44f65d
--- /dev/null
+++ b/modelscope/pipelines/cv/face_recognition_onnx_ir_pipeline.py
@@ -0,0 +1,84 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import onnxruntime
+import PIL
+import torch
+import torch.nn.functional as F
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.face_recognition.align_face import align_face
+from modelscope.models.cv.facial_landmark_confidence import \
+    FacialLandmarkConfidence
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from . import FaceProcessingBasePipeline
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.face_recognition, module_name=Pipelines.face_recognition_onnx_ir)
+class FaceRecognitionOnnxIrPipeline(FaceProcessingBasePipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        FaceRecognitionOnnxIrPipeline  can extract 512-dim feature of IR face image.
+        use `model` to create a face recognition ir onnx pipeline for prediction.
+        Args:
+            model: model id on modelscope hub.
+        Example:
+
+        >>> from modelscope.pipelines import pipeline
+        >>> frir = pipeline('face-recognition-ood', 'damo/cv_manual_face-recognition_frir')
+        >>> frir("https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/ir_face_recognition_1.png")
+        >>> # {{'img_embedding': array([[ 0.02276129, -0.00761525, ...,0.05735306]], dtype=float32)} }
+        """
+        super().__init__(model=model, **kwargs)
+        onnx_path = osp.join(model, ModelFile.ONNX_MODEL_FILE)
+        logger.info(f'loading model from {onnx_path}')
+        self.sess, self.input_node_name, self.out_node_name = self.load_onnx_model(
+            onnx_path)
+        logger.info('load model done')
+
+    def load_onnx_model(self, onnx_path):
+        sess = onnxruntime.InferenceSession(onnx_path)
+        out_node_name = []
+        input_node_name = []
+        for node in sess.get_outputs():
+            out_node_name.append(node.name)
+
+        for node in sess.get_inputs():
+            input_node_name.append(node.name)
+
+        return sess, input_node_name, out_node_name
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        result = super().preprocess(input)
+        align_img = result['img']
+        face_img = align_img[:, :, ::-1]  # to rgb
+        face_img = (face_img / 255. - 0.5) / 0.5
+        face_img = np.expand_dims(face_img, 0).copy()
+        face_img = np.transpose(face_img, axes=(0, 3, 1, 2))
+        face_img = face_img.astype(np.float32)
+        result['input_tensor'] = face_img
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        input_feed = {}
+        input_feed[
+            self.input_node_name[0]] = input['input_tensor'].cpu().numpy()
+        emb = self.sess.run(self.out_node_name, input_feed=input_feed)[0]
+        emb /= np.sqrt(np.sum(emb**2, -1, keepdims=True))  # l2 norm
+        return {OutputKeys.IMG_EMBEDDING: emb}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/cv/face_recognition_ood_pipeline.py b/modelscope/pipelines/cv/face_recognition_ood_pipeline.py
index f66288d1..b2e75619 100644
--- a/modelscope/pipelines/cv/face_recognition_ood_pipeline.py
+++ b/modelscope/pipelines/cv/face_recognition_ood_pipeline.py
@@ -23,7 +23,7 @@ logger = get_logger()
 
 
 @PIPELINES.register_module(
-    Tasks.face_recognition_ood, module_name=Pipelines.face_recognition_ood)
+    Tasks.face_recognition, module_name=Pipelines.face_recognition_ood)
 class FaceRecognitionOodPipeline(FaceProcessingBasePipeline):
 
     def __init__(self, model: str, **kwargs):
@@ -31,15 +31,14 @@ class FaceRecognitionOodPipeline(FaceProcessingBasePipeline):
         use `model` to create a face recognition ood pipeline for prediction
         Args:
             model: model id on modelscope hub.
-        Example:
 
-        ```python
+        Examples:
+
         >>> from modelscope.pipelines import pipeline
         >>> fr_ood= pipeline('face-recognition-ood', 'damo/cv_ir_face-recognition-ood_rts')
         >>> fr_ood("https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/face_recognition_1.png")
         {{'img_embedding': array([[ 0.02276129, -0.00761525, ...,0.05735306]],
             dtype=float32, 'scores': [[0.7656678557395935]]}
-        ```
         """
 
         # face recong model
diff --git a/modelscope/pipelines/cv/face_reconstruction_pipeline.py b/modelscope/pipelines/cv/face_reconstruction_pipeline.py
new file mode 100644
index 00000000..64f8b3a9
--- /dev/null
+++ b/modelscope/pipelines/cv/face_reconstruction_pipeline.py
@@ -0,0 +1,370 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+from typing import Any, Dict
+
+import cv2
+import face_alignment
+import numpy as np
+import PIL.Image
+import tensorflow as tf
+import torch
+from scipy.io import loadmat, savemat
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.models.cv.face_reconstruction.models.facelandmark.large_model_infer import \
+    LargeModelInfer
+from modelscope.models.cv.face_reconstruction.utils import (align_for_lm,
+                                                            align_img,
+                                                            load_lm3d,
+                                                            read_obj,
+                                                            write_obj)
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.device import create_device, device_placement
+from modelscope.utils.logger import get_logger
+
+if tf.__version__ >= '2.0':
+    tf = tf.compat.v1
+    tf.disable_eager_execution()
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.face_reconstruction, module_name=Pipelines.face_reconstruction)
+class FaceReconstructionPipeline(Pipeline):
+
+    def __init__(self, model: str, device: str):
+        """The inference pipeline for face reconstruction task.
+
+        Args:
+            model (`str` or `Model` or module instance): A model instance or a model local dir
+                or a model id in the model hub.
+            device ('str'): device str, should be either cpu, cuda, gpu, gpu:X or cuda:X.
+
+        Example:
+            >>> from modelscope.pipelines import pipeline
+            >>> test_image = 'data/test/images/face_reconstruction.jpg'
+            >>> pipeline_faceRecon = pipeline('face-reconstruction',
+                model='damo/cv_resnet50_face-reconstruction')
+            >>> result = pipeline_faceRecon(test_image)
+            >>> write_obj('result_face_reconstruction.obj', result[OutputKeys.OUTPUT])
+        """
+        super().__init__(model=model, device=device)
+
+        model_root = model
+        bfm_folder = os.path.join(model_root, 'assets')
+        checkpoint_path = os.path.join(model_root, ModelFile.TORCH_MODEL_FILE)
+
+        self.face_mark_model = LargeModelInfer(
+            os.path.join(model_root, 'large_base_net.pth'), device='cuda')
+
+        device = torch.device(0)
+        torch.cuda.set_device(device)
+        self.model.setup(checkpoint_path)
+        self.model.device = device
+        self.model.parallelize()
+        self.model.eval()
+        self.model.set_render(image_res=1024)
+
+        save_ckpt_dir = os.path.join(
+            os.path.expanduser('~'), '.cache/torch/hub/checkpoints')
+        if not os.path.exists(save_ckpt_dir):
+            os.makedirs(save_ckpt_dir)
+        shutil.copy(
+            os.path.join(model_root, 'face_alignment', 's3fd-619a316812.pth'),
+            save_ckpt_dir)
+        shutil.copy(
+            os.path.join(model_root, 'face_alignment',
+                         '3DFAN4-4a694010b9.zip'), save_ckpt_dir)
+        shutil.copy(
+            os.path.join(model_root, 'face_alignment', 'depth-6c4283c0e0.zip'),
+            save_ckpt_dir)
+        self.lm_sess = face_alignment.FaceAlignment(
+            face_alignment.LandmarksType._3D, flip_input=False)
+
+        config = tf.ConfigProto(allow_soft_placement=True)
+        config.gpu_options.per_process_gpu_memory_fraction = 0.2
+        config.gpu_options.allow_growth = True
+        g1 = tf.Graph()
+        self.face_sess = tf.Session(graph=g1, config=config)
+        with self.face_sess.as_default():
+            with g1.as_default():
+                with tf.gfile.FastGFile(
+                        os.path.join(model_root, 'segment_face.pb'),
+                        'rb') as f:
+                    graph_def = tf.GraphDef()
+                    graph_def.ParseFromString(f.read())
+                    self.face_sess.graph.as_default()
+                    tf.import_graph_def(graph_def, name='')
+                    self.face_sess.run(tf.global_variables_initializer())
+
+        self.tex_size = 4096
+
+        self.bald_tex_bg = cv2.imread(
+            '{}/assets/template_texture.jpg'.format(model_root)).astype(
+                np.float32)
+
+        front_mask = cv2.imread(
+            '{}/assets/face_mask.jpg'.format(model_root)).astype(
+                np.float32) / 255
+        front_mask = cv2.resize(front_mask, (1024, 1024))
+        front_mask = cv2.resize(front_mask, (0, 0), fx=0.1, fy=0.1)
+        front_mask = cv2.erode(front_mask,
+                               np.ones(shape=(7, 7), dtype=np.float32))
+        front_mask = cv2.GaussianBlur(front_mask, (13, 13), 0)
+        self.front_mask = cv2.resize(front_mask,
+                                     (self.tex_size, self.tex_size))
+        self.binary_front_mask = self.front_mask.copy()
+        self.binary_front_mask[(self.front_mask < 0.3)
+                               + (self.front_mask > 0.7)] = 0
+        self.binary_front_mask[self.binary_front_mask != 0] = 1.0
+        self.binary_front_mask_ = self.binary_front_mask.copy()
+        self.binary_front_mask = np.zeros((4096 + 1024, 4096, 3),
+                                          dtype=np.float32)
+        self.binary_front_mask[:4096, :] = self.binary_front_mask_
+        self.front_mask_ = self.front_mask.copy()
+        self.front_mask = np.zeros((4096 + 1024, 4096, 3), dtype=np.float32)
+        self.front_mask[:4096, :] = self.front_mask_
+
+        l_eye_mask = cv2.imread(
+            '{}/assets/l_eye_mask.png'.format(model_root))[:, :, :1] / 255.0
+        l_eye_mask = cv2.erode(l_eye_mask,
+                               np.ones(shape=(5, 5), dtype=np.float32))
+        self.l_eye_mask = cv2.GaussianBlur(l_eye_mask, (7, 7), 0)[..., None]
+        self.l_eye_binary_mask = self.l_eye_mask.copy()
+        self.l_eye_binary_mask[(self.l_eye_mask < 0.3)
+                               + (self.l_eye_mask > 0.7)] = 0
+        self.l_eye_binary_mask[self.l_eye_binary_mask != 0] = 1.0
+
+        r_eye_mask = cv2.imread(
+            '{}/assets/r_eye_mask.png'.format(model_root))[:, :, :1] / 255.0
+        r_eye_mask = cv2.dilate(r_eye_mask,
+                                np.ones(shape=(7, 7), dtype=np.float32))
+        self.r_eye_mask = cv2.GaussianBlur(r_eye_mask, (7, 7), 0)[..., None]
+        self.r_eye_binary_mask = self.r_eye_mask.copy()
+        self.r_eye_binary_mask[(self.r_eye_mask < 0.3)
+                               + (self.r_eye_mask > 0.7)] = 0
+        self.r_eye_binary_mask[self.r_eye_binary_mask != 0] = 1.0
+
+        self.lm3d_std = load_lm3d(bfm_folder)
+        self.align_params = loadmat(
+            '{}/assets/BBRegressorParam_r.mat'.format(model_root))
+
+        device = create_device(self.device_name)
+        self.device = device
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        img = LoadImage.convert_to_ndarray(input)
+        if len(img.shape) == 2:
+            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+        img = img.astype(np.float)
+        result = {'img': img}
+        return result
+
+    def read_data(self,
+                  img,
+                  lm,
+                  lm3d_std,
+                  to_tensor=True,
+                  image_res=1024,
+                  img_fat=None):
+        # to RGB
+        im = PIL.Image.fromarray(img[..., ::-1])
+        W, H = im.size
+        lm[:, -1] = H - 1 - lm[:, -1]
+
+        im_lr_coeff, lm_lr_coeff = None, None
+        head_mask = None
+
+        _, im_lr, lm_lr, mask_lr_head = align_img(
+            im, lm, lm3d_std, mask=head_mask)
+        _, im_hd, lm_hd, _ = align_img(
+            im,
+            lm,
+            lm3d_std,
+            target_size=image_res,
+            rescale_factor=102.0 * image_res / 224)
+
+        mask_lr = self.face_sess.run(
+            self.face_sess.graph.get_tensor_by_name('output_alpha:0'),
+            feed_dict={'input_image:0': np.array(im_lr)})
+
+        if img_fat is not None:
+            assert img_fat.shape == img.shape
+            im_fat = PIL.Image.fromarray(img_fat[..., ::-1])
+
+            _, im_hd, _, _ = align_img(
+                im_fat,
+                lm,
+                lm3d_std,
+                target_size=image_res,
+                rescale_factor=102.0 * image_res / 224)
+
+        im_hd = np.array(im_hd).astype(np.float32)
+
+        if to_tensor:
+            im_lr = torch.tensor(
+                np.array(im_lr) / 255.,
+                dtype=torch.float32).permute(2, 0, 1).unsqueeze(0)
+            im_hd = torch.tensor(
+                np.array(im_hd) / 255.,
+                dtype=torch.float32).permute(2, 0, 1).unsqueeze(0)
+            mask_lr = torch.tensor(
+                np.array(mask_lr) / 255., dtype=torch.float32)[None,
+                                                               None, :, :]
+            mask_lr_head = torch.tensor(
+                np.array(mask_lr_head) / 255., dtype=torch.float32)[
+                    None, None, :, :] if mask_lr_head is not None else None
+            lm_lr = torch.tensor(lm_lr).unsqueeze(0)
+            lm_hd = torch.tensor(lm_hd).unsqueeze(0)
+        return im_lr, lm_lr, im_hd, lm_hd, mask_lr, mask_lr_head, im_lr_coeff, lm_lr_coeff
+
+    def prepare_data(self, img, lm_sess, five_points=None):
+        input_img, scale, bbox = align_for_lm(
+            img, five_points,
+            self.align_params)  # align for 68 landmark detection
+
+        if scale == 0:
+            return None
+
+        # detect landmarks
+        input_img = np.reshape(input_img, [1, 224, 224, 3]).astype(np.float32)
+
+        input_img = input_img[0, :, :, ::-1]
+        landmark = lm_sess.get_landmarks_from_image(input_img)[0]
+
+        landmark = landmark[:, :2] / scale
+        landmark[:, 0] = landmark[:, 0] + bbox[0]
+        landmark[:, 1] = landmark[:, 1] + bbox[1]
+
+        return landmark
+
+    def blend_eye_corner(self, tex_map, template_tex):
+        tex_map = tex_map.astype(np.float32)
+
+        x1 = int(288 * 4096 / 758)
+        y1 = int(235 * 4096 / 758)
+        w = int(90 * 4096 / 758)
+        h = int(50 * 4096 / 758)
+        template_tex_l = template_tex[y1:y1 + h, x1:x1 + w]
+        pred_tex_l = tex_map[y1:y1 + h, x1:x1 + w]
+        pred_tex_l_mean_rgb = np.sum(
+            pred_tex_l * self.l_eye_binary_mask, axis=(0, 1))
+        template_tex_l_mean_rgb = np.sum(
+            template_tex_l * self.l_eye_binary_mask, axis=(0, 1))
+        for ch in range(3):
+            template_tex_l[:, :, ch] *= pred_tex_l_mean_rgb[
+                ch] / template_tex_l_mean_rgb[ch]
+        pred_tex_l = pred_tex_l * (
+            1 - self.l_eye_mask) + template_tex_l * self.l_eye_mask
+
+        x2 = 4096 - x1 - w
+        y2 = y1
+        template_tex_r = template_tex[y2:y2 + h, x2:x2 + w]
+        pred_tex_r = tex_map[y2:y2 + h, x2:x2 + w]
+        pred_tex_r_mean_rgb = np.sum(
+            pred_tex_r * self.r_eye_binary_mask, axis=(0, 1))
+        template_tex_r_mean_rgb = np.sum(
+            template_tex_r * self.r_eye_binary_mask, axis=(0, 1))
+        for ch in range(3):
+            template_tex_r[:, :, ch] *= pred_tex_r_mean_rgb[
+                ch] / template_tex_r_mean_rgb[ch]
+        pred_tex_r = pred_tex_r * (
+            1 - self.r_eye_mask) + template_tex_r * self.r_eye_mask
+
+        tex_map[y1:y1 + h, x1:x1 + w] = pred_tex_l
+        tex_map[y2:y2 + h, x2:x2 + w] = pred_tex_r
+
+        return tex_map
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        rgb_image = input['img'].cpu().numpy().astype(np.uint8)
+
+        bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)
+
+        img = bgr_image
+        # preprocess
+        flag = 0
+        box, results = self.face_mark_model.infer(img)
+        if results is None or np.array(results).shape[0] == 0:
+            flag = 1  # no face
+            return flag, {}
+
+        fatbgr = self.face_mark_model.fat_face(img, degree=0.02)
+
+        landmarks = []
+        results = results[0]
+        for idx in [74, 83, 54, 84, 90]:
+            landmarks.append([results[idx][0], results[idx][1]])
+        landmarks = np.array(landmarks)
+
+        landmarks = self.prepare_data(img, self.lm_sess, five_points=landmarks)
+
+        im_tensor, lm_tensor, im_hd_tensor, lm_hd_tensor, mask, _, _, _ = self.read_data(
+            img, landmarks, self.lm3d_std, image_res=1024, img_fat=fatbgr)
+        data = {
+            'imgs': im_tensor,
+            'imgs_hd': im_hd_tensor,
+            'lms': lm_tensor,
+            'lms_hd': lm_hd_tensor,
+            'face_mask': mask,
+            'img_name': 'temp',
+        }
+        self.model.set_input(data)  # unpack data from data loader
+
+        # reconstruct
+        out_dir = None
+        output = self.model(out_dir=out_dir)  # run inference
+
+        # process texture map
+        tex_map = output['head_tex_map'].astype(np.float32)
+        tex_map = cv2.resize(tex_map, (self.tex_size, self.tex_size + 1024))
+        bg_mean_rgb = np.sum(
+            self.bald_tex_bg * self.binary_front_mask, axis=(0, 1))
+        pred_tex_mean_rgb = np.sum(
+            tex_map * self.binary_front_mask, axis=(0, 1)) * 1.05
+        mid_mean_rgb = bg_mean_rgb * 0.8 + pred_tex_mean_rgb * 0.2
+        tex_map += (
+            (mid_mean_rgb - pred_tex_mean_rgb)
+            / np.sum(self.binary_front_mask, axis=(0, 1)))[None, None] * 0.5
+        pred_tex_mean_rgb = np.sum(
+            tex_map * self.binary_front_mask, axis=(0, 1)) * 1.05
+        _bald_tex_bg = self.bald_tex_bg.copy()
+        for ch in range(3):
+            _bald_tex_bg[:, :, ch] *= pred_tex_mean_rgb[ch] / bg_mean_rgb[ch]
+        tex_map = _bald_tex_bg * (
+            1. - self.front_mask) + tex_map * self.front_mask
+        tex_map = tex_map * 1.05
+        tex_map = self.blend_eye_corner(tex_map, self.bald_tex_bg)
+
+        # export mesh
+        results = {
+            'vertices': output['head_vertices'],
+            'faces': output['head_faces'],
+            'UVs': output['head_UVs'],
+            'faces_uv': output['head_faces_uv'],
+            'normals': output['head_normals'],
+            'texture_map': tex_map,
+        }
+
+        if out_dir is not None:
+            face_mesh = {
+                'vertices': output['face_vertices'],
+                'faces': output['face_faces'],
+                'colors': output['face_colors'],
+            }
+
+            write_obj(os.path.join(out_dir, 'face.obj'), face_mesh)
+            write_obj(os.path.join(out_dir, 'head.obj'), results)
+
+        return {OutputKeys.OUTPUT: results}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/cv/facial_landmark_confidence_pipeline.py b/modelscope/pipelines/cv/facial_landmark_confidence_pipeline.py
index cab8310e..8f26b286 100644
--- a/modelscope/pipelines/cv/facial_landmark_confidence_pipeline.py
+++ b/modelscope/pipelines/cv/facial_landmark_confidence_pipeline.py
@@ -24,8 +24,7 @@ logger = get_logger()
 
 
 @PIPELINES.register_module(
-    Tasks.facial_landmark_confidence,
-    module_name=Pipelines.facial_landmark_confidence)
+    Tasks.face_2d_keypoints, module_name=Pipelines.facial_landmark_confidence)
 class FacialLandmarkConfidencePipeline(FaceProcessingBasePipeline):
 
     def __init__(self, model: str, **kwargs):
@@ -56,8 +55,10 @@ class FacialLandmarkConfidencePipeline(FaceProcessingBasePipeline):
         lms = result[0].reshape(-1, 10).tolist()
         scores = [1 - result[1].tolist()]
         boxes = input['bbox'].cpu().numpy()[np.newaxis, :].tolist()
+        output_poses = []
         return {
             OutputKeys.SCORES: scores,
+            OutputKeys.POSES: output_poses,
             OutputKeys.KEYPOINTS: lms,
             OutputKeys.BOXES: boxes
         }
diff --git a/modelscope/pipelines/cv/image_color_enhance_pipeline.py b/modelscope/pipelines/cv/image_color_enhance_pipeline.py
index ca3dacec..daccfd04 100644
--- a/modelscope/pipelines/cv/image_color_enhance_pipeline.py
+++ b/modelscope/pipelines/cv/image_color_enhance_pipeline.py
@@ -6,7 +6,6 @@ from torchvision import transforms
 
 from modelscope.metainfo import Pipelines
 from modelscope.models.base import Model
-from modelscope.models.cv.image_color_enhance import ImageColorEnhance
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
@@ -18,19 +17,43 @@ from modelscope.utils.logger import get_logger
 logger = get_logger()
 
 
+@PIPELINES.register_module(
+    Tasks.image_color_enhancement,
+    module_name=Pipelines.adaint_image_color_enhance)
+@PIPELINES.register_module(
+    Tasks.image_color_enhancement,
+    module_name=Pipelines.deeplpf_image_color_enhance)
 @PIPELINES.register_module(
     Tasks.image_color_enhancement, module_name=Pipelines.image_color_enhance)
 class ImageColorEnhancePipeline(Pipeline):
 
     def __init__(self,
-                 model: Union[ImageColorEnhance, str],
+                 model: Union[Model, 'AdaIntImageColorEnhance',
+                              'DeepLPFImageColorEnhance', 'ImageColorEnhance',
+                              str],
                  preprocessor: Optional[
                      ImageColorEnhanceFinetunePreprocessor] = None,
                  **kwargs):
-        """
-        use `model` and `preprocessor` to create a image color enhance pipeline for prediction
+        """The inference pipeline for image color enhance.
+
         Args:
-            model: model id on modelscope hub.
+            model (`str` or `Model` or module instance): A model instance or a model local dir
+                or a model id in the model hub.
+            preprocessor (`Preprocessor`, `optional`): A Preprocessor instance.
+            kwargs (dict, `optional`):
+                Extra kwargs passed into the preprocessor's constructor.
+
+        Example:
+            >>> import cv2
+            >>> from modelscope.outputs import OutputKeys
+            >>> from modelscope.pipelines import pipeline
+            >>> from modelscope.utils.constant import Tasks
+
+            >>> img = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_color_enhance.png'
+                image_color_enhance = pipeline(Tasks.image_color_enhancement,
+                    model='damo/cv_deeplpfnet_image-color-enhance-models')
+                result = image_color_enhance(img)
+            >>> cv2.imwrite('enhanced_result.png', result[OutputKeys.OUTPUT_IMG])
         """
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
         self.model.eval()
diff --git a/modelscope/pipelines/cv/image_debanding_pipeline.py b/modelscope/pipelines/cv/image_debanding_pipeline.py
new file mode 100644
index 00000000..76e51af4
--- /dev/null
+++ b/modelscope/pipelines/cv/image_debanding_pipeline.py
@@ -0,0 +1,66 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Optional, Union
+
+import torch
+from torchvision import transforms
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.base import Model
+from modelscope.models.cv.image_debanding import RRDBImageDebanding
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_debanding, module_name=Pipelines.image_debanding)
+class ImageDebandingPipeline(Pipeline):
+
+    def __init__(self, model: Union[RRDBImageDebanding, str], **kwargs):
+        """The inference pipeline for image debanding.
+
+        Args:
+            model (`str` or `Model` or module instance): A model instance or a model local dir
+                or a model id in the model hub.
+            preprocessor (`Preprocessor`, `optional`): A Preprocessor instance.
+            kwargs (dict, `optional`):
+                Extra kwargs passed into the preprocessor's constructor.
+
+        Example:
+            >>> import cv2
+            >>> from modelscope.outputs import OutputKeys
+            >>> from modelscope.pipelines import pipeline
+            >>> from modelscope.utils.constant import Tasks
+            >>> debanding = pipeline(Tasks.image_debanding, model='damo/cv_rrdb_image-debanding')
+                result = debanding(
+                    'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/debanding.png')
+            >>> cv2.imwrite('result.png', result[OutputKeys.OUTPUT_IMG])
+        """
+        super().__init__(model=model, **kwargs)
+        self.model.eval()
+
+        if torch.cuda.is_available():
+            self._device = torch.device('cuda')
+        else:
+            self._device = torch.device('cpu')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        img = LoadImage.convert_to_img(input)
+        test_transforms = transforms.Compose([transforms.ToTensor()])
+        img = test_transforms(img)
+        result = {'src': img.unsqueeze(0).to(self._device)}
+        return result
+
+    @torch.no_grad()
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        return super().forward(input)
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        output_img = (inputs['outputs'].squeeze(0) * 255.).type(
+            torch.uint8).cpu().permute(1, 2, 0).numpy()[:, :, ::-1]
+        return {OutputKeys.OUTPUT_IMG: output_img}
diff --git a/modelscope/pipelines/cv/image_deblur_pipeline.py b/modelscope/pipelines/cv/image_deblur_pipeline.py
index 165e54a5..1c35872f 100644
--- a/modelscope/pipelines/cv/image_deblur_pipeline.py
+++ b/modelscope/pipelines/cv/image_deblur_pipeline.py
@@ -23,9 +23,8 @@ __all__ = ['ImageDeblurPipeline']
 class ImageDeblurPipeline(Pipeline):
     """
 
-    Example:
+    Examples:
 
-    ```python
     >>> from modelscope.pipelines import pipeline
     >>> from modelscope.utils.constant import Tasks
     >>> from modelscope.outputs import OutputKeys
@@ -35,7 +34,6 @@ class ImageDeblurPipeline(Pipeline):
     >>> image_deblur_pipeline = pipeline(Tasks.image_deblurring, 'damo/cv_nafnet_image-deblur_gopro')
     >>> result = image_deblur_pipeline(img)[OutputKeys.OUTPUT_IMG]
     >>> cv2.imwrite('result.png', result)
-    ```
     """
 
     def __init__(self,
diff --git a/modelscope/pipelines/cv/image_defrcn_fewshot_pipeline.py b/modelscope/pipelines/cv/image_defrcn_fewshot_pipeline.py
index ccd6eb8e..b9770d24 100644
--- a/modelscope/pipelines/cv/image_defrcn_fewshot_pipeline.py
+++ b/modelscope/pipelines/cv/image_defrcn_fewshot_pipeline.py
@@ -7,6 +7,7 @@ import numpy as np
 import torch
 
 from modelscope.metainfo import Pipelines
+from modelscope.models.base.base_model import Model
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
@@ -18,24 +19,18 @@ from modelscope.utils.constant import ModelFile, Tasks
     Tasks.image_fewshot_detection,
     module_name=Pipelines.image_fewshot_detection)
 class ImageDefrcnDetectionPipeline(Pipeline):
-    """ Image DeFRCN few-shot detection Pipeline. Given a image,
-        pipeline will return the detection results on the image.
-        Example:
+    r"""
+    Image DeFRCN few-shot detection Pipeline. Given a image, pipeline will return the detection results on the image.
+
+    Examples:
 
-        ```python
         >>> from modelscope.pipelines import pipeline
         >>> detector = pipeline('image-fewshot-detection', 'damo/cv_resnet101_detection_fewshot-defrcn')
         >>> detector('/Path/Image')
-           {
-            'scores': [0.8307567834854126, 0.1606406420469284],
-            'labels': ['person', 'dog'],
-            'boxes': [
-                [27.391937255859375, 0.0, 353.0, 500.0],
-                [64.22428131103516, 229.2884521484375, 213.90573120117188, 370.0657958984375]
-            ]
-            }
-        >>> #
-        ```
+        >>> {'scores': [0.8307567834854126, 0.1606406420469284],
+        >>>  'labels': ['person', 'dog'],
+        >>>  'boxes': [[27.391937255859375, 0.0, 353.0, 500.0],
+        >>>            [64.22428131103516, 229.2884521484375, 213.90573120117188, 370.0657958984375]]}
     """
 
     def __init__(self, model: str, **kwargs):
@@ -44,6 +39,10 @@ class ImageDefrcnDetectionPipeline(Pipeline):
         """
         super().__init__(model=model, auto_collate=False, **kwargs)
 
+        assert isinstance(
+            self.model, Model
+        ), f'please check whether model config exists in {ModelFile.CONFIGURATION}'
+
         model_path = os.path.join(self.model.model_dir,
                                   ModelFile.TORCH_MODEL_FILE)
         self.model.model = self._load_pretrained(
@@ -65,12 +64,11 @@ class ImageDefrcnDetectionPipeline(Pipeline):
     def preprocess(self, input: Input) -> Dict[str, Any]:
 
         img = LoadImage.convert_to_ndarray(input)
-        img = img.astype(np.float)
 
         image = img[..., ::-1].copy()  # rgb to bgr
-        tim = torch.Tensor(image).permute(2, 0, 1)
+        tim = torch.Tensor(image).permute(2, 0, 1)  # hwc to chw
 
-        result = {'image': tim}
+        result = {'image': tim, 'image_numpy': image}
         return result
 
     def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
diff --git a/modelscope/pipelines/cv/image_detection_pipeline.py b/modelscope/pipelines/cv/image_detection_pipeline.py
index 08633c35..86963c37 100644
--- a/modelscope/pipelines/cv/image_detection_pipeline.py
+++ b/modelscope/pipelines/cv/image_detection_pipeline.py
@@ -10,13 +10,15 @@ from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import LoadImage
 from modelscope.utils.constant import Tasks
-from modelscope.utils.logger import get_logger
 
 
 @PIPELINES.register_module(
     Tasks.human_detection, module_name=Pipelines.human_detection)
 @PIPELINES.register_module(
     Tasks.image_object_detection, module_name=Pipelines.object_detection)
+@PIPELINES.register_module(
+    Tasks.image_object_detection,
+    module_name=Pipelines.abnormal_object_detection)
 class ImageDetectionPipeline(Pipeline):
 
     def __init__(self, model: str, **kwargs):
diff --git a/modelscope/pipelines/cv/image_driving_perception_pipeline.py b/modelscope/pipelines/cv/image_driving_perception_pipeline.py
new file mode 100644
index 00000000..037cdf3c
--- /dev/null
+++ b/modelscope/pipelines/cv/image_driving_perception_pipeline.py
@@ -0,0 +1,100 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.image_driving_perception import (
+    ImageDrivingPerceptionPreprocessor, driving_area_mask, lane_line_mask,
+    non_max_suppression, scale_coords, split_for_trace_model)
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_driving_perception,
+    module_name=Pipelines.yolopv2_image_driving_percetion_bdd100k)
+class ImageDrivingPerceptionPipeline(Pipeline):
+    """ Image Driving Perception Pipeline. Given a image,
+    pipeline will detects cars, and segments both lane lines and drivable areas.
+    Example:
+
+    ```python
+    >>> from modelscope.pipelines import pipeline
+    >>> image_driving_perception_pipeline = pipeline(Tasks.image_driving_perception,
+                                                        model='damo/cv_yolopv2_image-driving-perception_bdd100k')
+    >>> image_driving_perception_pipeline(img_path)
+    {
+        'boxes': [
+                    tensor([[1.0000e+00, 2.8600e+02, 4.0700e+02, 6.2600e+02],
+                            [8.8200e+02, 2.9600e+02, 1.0910e+03, 4.4700e+02],
+                            [3.7200e+02, 2.7500e+02, 5.2100e+02, 3.5500e+02],
+                            ...,
+                            [7.8600e+02, 2.8100e+02, 8.0400e+02, 3.0800e+02],
+                            [5.7000e+02, 2.8000e+02, 5.9400e+02, 3.0000e+02],
+                            [7.0500e+02, 2.7800e+02, 7.2100e+02, 2.9000e+02]])
+                ],
+        'masks': [
+                    array([[0, 0, 0, ..., 0, 0, 0],
+                            [0, 0, 0, ..., 0, 0, 0],
+                            [0, 0, 0, ..., 0, 0, 0],
+                            ...,
+                            [0, 0, 0, ..., 0, 0, 0],
+                            [0, 0, 0, ..., 0, 0, 0],
+                            [0, 0, 0, ..., 0, 0, 0]], dtype=int32),
+                    array([[0, 0, 0, ..., 0, 0, 0],
+                            [0, 0, 0, ..., 0, 0, 0],
+                            [0, 0, 0, ..., 0, 0, 0],
+                            ...,
+                            [0, 0, 0, ..., 0, 0, 0],
+                            [0, 0, 0, ..., 0, 0, 0],
+                            [0, 0, 0, ..., 0, 0, 0]], dtype=int32)
+                ]
+    }
+    >>> #
+    ```
+    """
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` and 'preprocessor' to create a image driving percetion pipeline for prediction
+        """
+        super().__init__(model=model, auto_collate=True, **kwargs)
+        if self.preprocessor is None:
+            self.preprocessor = ImageDrivingPerceptionPreprocessor()
+        logger.info('load model done')
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        return self.model(input)
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        results_dict = {
+            OutputKeys.BOXES: [],
+            OutputKeys.MASKS: [],
+        }
+
+        pred = split_for_trace_model(inputs['pred'], inputs['anchor_grid'])
+
+        # Apply NMS
+        pred = non_max_suppression(pred)
+
+        da_seg_mask = driving_area_mask(inputs['driving_area_mask'])
+        ll_seg_mask = lane_line_mask(inputs['lane_line_mask'])
+
+        for det in pred:  # detections per image
+            if len(det):
+                # Rescale boxes from img_size to (720, 1280)
+                det[:, :4] = scale_coords(inputs['img_hw'], det[:, :4]).round()
+
+        results_dict[OutputKeys.BOXES].append(det[:, :4].cpu().numpy())
+        results_dict[OutputKeys.MASKS].append(da_seg_mask)
+        results_dict[OutputKeys.MASKS].append(ll_seg_mask)
+        return results_dict
diff --git a/modelscope/pipelines/cv/image_face_fusion_pipeline.py b/modelscope/pipelines/cv/image_face_fusion_pipeline.py
index 3ba253e1..dd343415 100644
--- a/modelscope/pipelines/cv/image_face_fusion_pipeline.py
+++ b/modelscope/pipelines/cv/image_face_fusion_pipeline.py
@@ -17,10 +17,11 @@ logger = get_logger()
 @PIPELINES.register_module(
     Tasks.image_face_fusion, module_name=Pipelines.image_face_fusion)
 class ImageFaceFusionPipeline(Pipeline):
-    """ Image face fusion pipeline
-    Example:
+    """
+    Image face fusion pipeline.
+
+    Examples:
 
-    python
     >>> from modelscope.pipelines import pipeline
     >>> image_face_fusion = pipeline(Tasks.image_face_fusion,
                    model='damo/cv_unet-image-face-fusion_damo')
@@ -31,7 +32,6 @@ class ImageFaceFusionPipeline(Pipeline):
        {
         "output_img": [H * W * 3] 0~255, we can use cv2.imwrite to save output_img as an image.
         }
-    >>> #
     """
 
     def __init__(self, model: str, **kwargs):
diff --git a/modelscope/pipelines/cv/image_human_parsing_pipeline.py b/modelscope/pipelines/cv/image_human_parsing_pipeline.py
new file mode 100644
index 00000000..01b29d81
--- /dev/null
+++ b/modelscope/pipelines/cv/image_human_parsing_pipeline.py
@@ -0,0 +1,126 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+import torch
+import torchvision.transforms as T
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.image_human_parsing import (
+    M2FP, center_to_target_size_test)
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_segmentation, module_name=Pipelines.image_human_parsing)
+class ImageHumanParsingPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[M2FP, str],
+                 preprocessor: Optional = None,
+                 **kwargs):
+        """use `model` and `preprocessor` to create an image human parsing
+        pipeline for prediction
+
+        Args:
+            model (M2FPModel | str): a model instance
+            preprocessor (None): a preprocessor instance
+        """
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        self.model.eval()
+
+    def _get_preprocess_shape(self, oldh, oldw, short_edge_length, max_size):
+        h, w = oldh, oldw
+        size = short_edge_length * 1.0
+        scale = size / min(h, w)
+        if h < w:
+            newh, neww = size, scale * w
+        else:
+            newh, neww = scale * h, size
+        if max(newh, neww) > max_size:
+            scale = max_size * 1.0 / max(newh, neww)
+            newh = newh * scale
+            neww = neww * scale
+        neww = int(neww + 0.5)
+        newh = int(newh + 0.5)
+        return (newh, neww)
+
+    def preprocess(self,
+                   input: Input,
+                   min_size=640,
+                   max_size=1333) -> Dict[str, Any]:
+        image = LoadImage.convert_to_img(input)
+        w, h = image.size[:2]
+        dataset_dict = {'width': w, 'height': h}
+        if self.model.single_human:
+            image = np.asarray(image)
+            image, crop_box = center_to_target_size_test(
+                image, self.model.input_single_human['sizes'][0])
+            dataset_dict['image'] = torch.as_tensor(
+                np.ascontiguousarray(image.transpose(2, 0, 1)))
+            dataset_dict['crop_box'] = crop_box
+        else:
+            new_h, new_w = self._get_preprocess_shape(h, w, min_size, max_size)
+            test_transforms = T.Compose([
+                T.Resize((new_h, new_w)),
+                T.ToTensor(),
+            ])
+            image = test_transforms(image)
+            dataset_dict['image'] = image * 255.
+        result = {'batched_inputs': [dataset_dict]}
+        return result
+
+    def forward(self, input: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            output = self.model(input)
+        return output
+
+    def postprocess(self,
+                    inputs: Dict[str, Any],
+                    score_thr=0.0) -> Dict[str, Any]:
+        predictions = inputs['eval_result'][0]
+        class_names = self.model.classes
+        results_dict = {
+            OutputKeys.MASKS: [],
+            OutputKeys.LABELS: [],
+            OutputKeys.SCORES: []
+        }
+        if 'sem_seg' in predictions:
+            semantic_pred = predictions['sem_seg']
+            semantic_seg = semantic_pred.argmax(dim=0).detach().cpu().numpy()
+            semantic_pred = semantic_pred.sigmoid().detach().cpu().numpy()
+            class_ids = np.unique(semantic_seg)
+            for class_id in class_ids:
+                label = class_names[class_id]
+                mask = np.array(semantic_seg == class_id, dtype=np.float64)
+                score = (mask * semantic_pred[class_id]).sum() / (
+                    mask.sum() + 1)
+                results_dict[OutputKeys.SCORES].append(score)
+                results_dict[OutputKeys.LABELS].append(label)
+                results_dict[OutputKeys.MASKS].append(mask)
+        elif 'parsing' in predictions:
+            parsing_res = predictions['parsing']
+            part_outputs = parsing_res['part_outputs']
+            human_outputs = parsing_res['human_outputs']
+
+            # process semantic_outputs
+            for output in part_outputs + human_outputs:
+                score = output['score']
+                label = class_names[output['category_id']]
+                mask = (output['mask'] > 0).float().detach().cpu().numpy()
+                if score > score_thr:
+                    results_dict[OutputKeys.SCORES].append(score)
+                    results_dict[OutputKeys.LABELS].append(label)
+                    results_dict[OutputKeys.MASKS].append(mask)
+        else:
+            raise NotImplementedError
+
+        return results_dict
diff --git a/modelscope/pipelines/cv/image_inpainting_sdv2_pipeline.py b/modelscope/pipelines/cv/image_inpainting_sdv2_pipeline.py
new file mode 100644
index 00000000..ca3b3e0d
--- /dev/null
+++ b/modelscope/pipelines/cv/image_inpainting_sdv2_pipeline.py
@@ -0,0 +1,128 @@
+# Copyright © Alibaba, Inc. and its affiliates.
+
+import math
+import os
+import sys
+import tempfile
+from typing import Any, Dict, Optional, Union
+
+import cv2
+import numpy as np
+import torch
+from diffusers import StableDiffusionInpaintPipeline
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.pipelines.multi_modal.diffusers_wrapped.diffusers_pipeline import \
+    DiffusersPipeline
+from modelscope.preprocessors.image import load_image
+from modelscope.utils.config import Config
+from modelscope.utils.constant import Tasks
+
+
+@PIPELINES.register_module(
+    Tasks.image_inpainting, module_name=Pipelines.image_inpainting_sdv2)
+class ImageInpaintingSDV2Pipeline(DiffusersPipeline):
+    """ Stable Diffusion for Image Inpainting Pipeline.
+
+    Example:
+
+    >>> import cv2
+    >>> from modelscope.outputs import OutputKeys
+    >>> from modelscope.pipelines import pipeline
+    >>> from modelscope.utils.constant import Tasks
+
+    >>> input_location = 'data/test/images/image_inpainting/image_inpainting.png'
+    >>> input_mask_location = 'data/test/images/image_inpainting/image_inpainting_mask.png'
+    >>> prompt = 'background'
+
+    >>> input = {
+    >>>     'image': input_location,
+    >>>     'mask': input_mask_location,
+    >>>     'prompt': prompt
+    >>> }
+    >>> image_inpainting = pipeline(Tasks.image_inpainting, model='damo/cv_stable-diffusion-v2_image-inpainting_base')
+    >>> output = image_inpainting(input)[OutputKeys.OUTPUT_IMG]
+    >>> cv2.imwrite('result.png', output)
+
+    """
+
+    def __init__(self, model: str, device: str = 'gpu', **kwargs):
+        """
+        Use `model` to create a stable diffusion pipeline for image inpainting.
+        Args:
+            model: model id on modelscope hub.
+            device: str = 'gpu'
+        """
+        super().__init__(model, device, **kwargs)
+
+        torch_dtype = kwargs.get('torch_dtype', torch.float16)
+
+        # build upon the diffuser stable diffusion pipeline
+        self.pipeline = StableDiffusionInpaintPipeline.from_pretrained(
+            model, torch_dtype=torch_dtype)
+        self.pipeline.to(self.device)
+
+        enable_attention_slicing = kwargs.get('enable_attention_slicing', True)
+        if enable_attention_slicing:
+            self.pipeline.enable_attention_slicing()
+
+    def _sanitize_parameters(self, **pipeline_parameters):
+        """
+        this method should sanitize the keyword args to preprocessor params,
+        forward params and postprocess params on '__call__' or '_process_single' method
+
+        Returns:
+            Dict[str, str]:  preprocess_params = {}
+            Dict[str, str]:  forward_params = pipeline_parameters
+            Dict[str, str]:  postprocess_params = pipeline_parameters
+        """
+        return {}, pipeline_parameters, pipeline_parameters
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        if not isinstance(inputs, dict):
+            raise ValueError(
+                f'Expected the input to be a dictionary, but got {type(input)}'
+            )
+
+        num_inference_steps = inputs.get('num_inference_steps', 50)
+        guidance_scale = inputs.get('guidance_scale', 7.5)
+        negative_prompt = inputs.get('negative_prompt', None)
+        num_images_per_prompt = inputs.get('num_images_per_prompt', 1)
+        eta = inputs.get('eta', 0.0)
+
+        if 'prompt' in inputs.keys():
+            prompt = inputs['prompt']
+        else:
+            # for demo_service
+            prompt = forward_params.get('prompt', 'background')
+        print(f'Test with prompt: {prompt}')
+
+        image = load_image(inputs['image'])
+        mask = load_image(inputs['mask'])
+
+        w, h = image.size
+        print(f'loaded input image of size ({w}, {h})')
+        width, height = map(lambda x: x - x % 64,
+                            (w, h))  # resize to integer multiple of 64
+        image = image.resize((width, height))
+        mask = mask.resize((width, height))
+        out_image = self.pipeline(
+            prompt=prompt,
+            image=image,
+            mask_image=mask,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta).images[0]
+
+        return {'result': out_image}
+
+    def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        result = np.array(inputs['result'])
+        return {OutputKeys.OUTPUT_IMG: result[:, :, ::-1]}
diff --git a/modelscope/pipelines/cv/image_instance_segmentation_pipeline.py b/modelscope/pipelines/cv/image_instance_segmentation_pipeline.py
index 5a0f0d7e..e9dcc27d 100644
--- a/modelscope/pipelines/cv/image_instance_segmentation_pipeline.py
+++ b/modelscope/pipelines/cv/image_instance_segmentation_pipeline.py
@@ -8,6 +8,7 @@ import torch
 from PIL import Image
 
 from modelscope.metainfo import Pipelines
+from modelscope.models.base.base_model import Model
 from modelscope.models.cv.image_instance_segmentation import (
     CascadeMaskRCNNSwinModel, get_img_ins_seg_result)
 from modelscope.pipelines.base import Input, Pipeline
@@ -40,6 +41,8 @@ class ImageInstanceSegmentationPipeline(Pipeline):
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
 
         if preprocessor is None:
+            assert isinstance(self.model, Model), \
+                f'please check whether model config exists in {ModelFile.CONFIGURATION}'
             config_path = os.path.join(self.model.model_dir,
                                        ModelFile.CONFIGURATION)
             cfg = Config.from_file(config_path)
diff --git a/modelscope/pipelines/cv/image_matching_pipeline.py b/modelscope/pipelines/cv/image_matching_pipeline.py
index d16590d4..15406a86 100644
--- a/modelscope/pipelines/cv/image_matching_pipeline.py
+++ b/modelscope/pipelines/cv/image_matching_pipeline.py
@@ -22,26 +22,24 @@ logger = get_logger()
 class ImageMatchingPipeline(Pipeline):
     """ Image Matching Pipeline.
 
-    Example:
+    Examples:
 
-    ```python
-    from modelscope.outputs import OutputKeys
-    from modelscope.pipelines import pipeline
-    from modelscope.utils.constant import Tasks
+    >>> from modelscope.outputs import OutputKeys
+    >>> from modelscope.pipelines import pipeline
+    >>> from modelscope.utils.constant import Tasks
 
 
-    task = 'image-matching'
-    model_id = 'damo/cv_quadtree_attention_image-matching_outdoor'
+    >>> task = 'image-matching'
+    >>> model_id = 'damo/cv_quadtree_attention_image-matching_outdoor'
 
-    input_location = [
-                        ['data/test/images/image_matching1.jpg',
-                        'data/test/images/image_matching2.jpg']
-                    ]
-    estimator = pipeline(Tasks.image_matching, model=self.model_id)
-    result = estimator(input_location)
-    kpts0, kpts1, conf = result[0][OutputKeys.MATCHES]
-    print(f'Found {len(kpts0)} matches')
-    ```
+    >>> input_location = [
+    >>>                     ['data/test/images/image_matching1.jpg',
+    >>>                     'data/test/images/image_matching2.jpg']
+    >>>                 ]
+    >>> estimator = pipeline(Tasks.image_matching, model=self.model_id)
+    >>> result = estimator(input_location)
+    >>> kpts0, kpts1, conf = result[0][OutputKeys.MATCHES]
+    >>> print(f'Found {len(kpts0)} matches')
     """
 
     def __init__(self, model: str, **kwargs):
diff --git a/modelscope/pipelines/cv/image_matting_pipeline.py b/modelscope/pipelines/cv/image_matting_pipeline.py
index fb5d8f8b..5f5d1d56 100644
--- a/modelscope/pipelines/cv/image_matting_pipeline.py
+++ b/modelscope/pipelines/cv/image_matting_pipeline.py
@@ -4,6 +4,7 @@ from typing import Any, Dict
 
 import cv2
 import numpy as np
+import tensorflow as tf
 
 from modelscope.metainfo import Pipelines
 from modelscope.outputs import OutputKeys
@@ -14,11 +15,16 @@ from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.device import device_placement
 from modelscope.utils.logger import get_logger
 
+if tf.__version__ >= '2.0':
+    tf = tf.compat.v1
+
 logger = get_logger()
 
 
 @PIPELINES.register_module(
     Tasks.portrait_matting, module_name=Pipelines.portrait_matting)
+@PIPELINES.register_module(
+    Tasks.universal_matting, module_name=Pipelines.universal_matting)
 class ImageMattingPipeline(Pipeline):
 
     def __init__(self, model: str, **kwargs):
@@ -28,9 +34,6 @@ class ImageMattingPipeline(Pipeline):
             model: model id on modelscope hub.
         """
         super().__init__(model=model, **kwargs)
-        import tensorflow as tf
-        if tf.__version__ >= '2.0':
-            tf = tf.compat.v1
         model_path = osp.join(self.model, ModelFile.TF_GRAPH_FILE)
 
         with device_placement(self.framework, self.device_name):
diff --git a/modelscope/pipelines/cv/image_open_vocabulary_detection_pipeline.py b/modelscope/pipelines/cv/image_open_vocabulary_detection_pipeline.py
new file mode 100644
index 00000000..61048405
--- /dev/null
+++ b/modelscope/pipelines/cv/image_open_vocabulary_detection_pipeline.py
@@ -0,0 +1,76 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import Any, Dict, Union
+
+import cv2
+import numpy as np
+import PIL
+import torch
+from PIL import Image
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Model, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.open_vocabulary_detection,
+    module_name=Pipelines.open_vocabulary_detection_vild)
+# @PIPELINES.register_module(
+#     Tasks.image_object_detection, module_name=Pipelines.open_vocabulary_detection)
+class ImageOpenVocabularyDetectionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a image open vocabulary detection pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        Example:
+            >>> from modelscope.pipelines import pipeline
+            >>> vild_pipeline = pipeline(Tasks.open_vocabulary_detection,
+                model='damo/cv_resnet152_open-vocabulary-detection_vild')
+
+            >>> image_path = 'test.jpg'
+            >>> category_names =  ';'.join([
+                    'flipflop', 'street sign', 'bracelet', 'necklace', 'shorts',
+                    'floral camisole', 'orange shirt', 'purple dress', 'yellow tee',
+                    'green umbrella', 'pink striped umbrella', 'transparent umbrella',
+                    'plain pink umbrella', 'blue patterned umbrella', 'koala',
+                    'electric box', 'car', 'pole'
+                    ])
+            >>> input_dict = {'img':image_path, 'category_names':category_names}
+            >>> result = vild_pipeline(input_dict)
+            >>> print(result[OutputKeys.BOXES])
+        """
+        super().__init__(model=model, **kwargs)
+
+        logger.info('open vocabulary detection model, pipeline init')
+
+    def preprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        # img_path, category_names = input[0], input[1]
+
+        img = LoadImage(mode='rgb')(input['img'])['img']
+        data = {'img': img, 'category_names': input['category_names']}
+
+        return data
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        results = self.model.forward(**input)
+        return results
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        scores, labels, bboxes = self.model.postprocess(inputs)
+
+        outputs = {
+            OutputKeys.SCORES: scores,
+            OutputKeys.LABELS: labels,
+            OutputKeys.BOXES: bboxes
+        }
+
+        return outputs
diff --git a/modelscope/pipelines/cv/image_paintbyexample_pipeline.py b/modelscope/pipelines/cv/image_paintbyexample_pipeline.py
new file mode 100644
index 00000000..1b969a89
--- /dev/null
+++ b/modelscope/pipelines/cv/image_paintbyexample_pipeline.py
@@ -0,0 +1,150 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import PIL
+import torch
+import torch.nn as nn
+import torchvision
+from einops import rearrange
+from PIL import Image
+from torch.utils.data._utils.collate import default_collate
+from torchvision.transforms import Resize
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.image_paintbyexample import \
+    StablediffusionPaintbyexample
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors.image import load_image
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_paintbyexample, module_name=Pipelines.image_paintbyexample)
+class ImagePaintbyexamplePipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+            model: model id on modelscope hub.
+        """
+        assert isinstance(model, str), 'model must be a single str'
+        from paint_ldm.models.diffusion.plms import PLMSSampler
+        super().__init__(model=model, auto_collate=False, **kwargs)
+        self.sampler = PLMSSampler(self.model.model)
+        self.start_code = None
+
+    def get_tensor(self, normalize=True, toTensor=True):
+        transform_list = []
+        if toTensor:
+            transform_list += [torchvision.transforms.ToTensor()]
+
+        if normalize:
+            transform_list += [
+                torchvision.transforms.Normalize((0.5, 0.5, 0.5),
+                                                 (0.5, 0.5, 0.5))
+            ]
+        return torchvision.transforms.Compose(transform_list)
+
+    def get_tensor_clip(self, normalize=True, toTensor=True):
+        transform_list = []
+        if toTensor:
+            transform_list += [torchvision.transforms.ToTensor()]
+
+        if normalize:
+            transform_list += [
+                torchvision.transforms.Normalize(
+                    (0.48145466, 0.4578275, 0.40821073),
+                    (0.26862954, 0.26130258, 0.27577711))
+            ]
+        return torchvision.transforms.Compose(transform_list)
+
+    def preprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        if isinstance(input['img'], str):
+            image_name, mask_name, ref_name = input['img'], input[
+                'mask'], input['reference']
+            img = load_image(image_name).resize((512, 512))
+            ref = load_image(ref_name).resize((224, 224))
+            mask = load_image(mask_name).resize((512, 512)).convert('L')
+        elif isinstance(input['img'], PIL.Image.Image):
+            img = input['img'].convert('RGB').resize((512, 512))
+            ref = input['reference'].convert('RGB').resize((224, 224))
+            mask = input['mask'].resize((512, 512)).convert('L')
+        else:
+            raise TypeError(
+                'input should be either str or PIL.Image, and both inputs should have the same type'
+            )
+        img = self.get_tensor()(img)
+        img = img.unsqueeze(0)
+        ref = self.get_tensor_clip()(ref)
+        ref = ref.unsqueeze(0)
+        mask = np.array(mask)[None, None]
+        mask = 1 - mask.astype(np.float32) / 255.0
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+        mask = torch.from_numpy(mask)
+        inpaint_image = img * mask
+        test_model_kwargs = {}
+        test_model_kwargs['inpaint_mask'] = mask.to(self.device)
+        test_model_kwargs['inpaint_image'] = inpaint_image.to(self.device)
+        test_model_kwargs['ref_tensor'] = ref.to(self.device)
+
+        return test_model_kwargs
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        result = self.perform_inference(input)
+        return {OutputKeys.OUTPUT_IMG: result}
+
+    def perform_inference(self, test_model_kwargs):
+        with torch.no_grad():
+            with self.model.model.ema_scope():
+                ref_tensor = test_model_kwargs['ref_tensor']
+                uc = self.model.model.learnable_vector
+                c = self.model.model.get_learned_conditioning(
+                    ref_tensor.to(torch.float32))
+                c = self.model.model.proj_out(c)
+                z_inpaint = self.model.model.encode_first_stage(
+                    test_model_kwargs['inpaint_image'])
+                z_inpaint = self.model.model.get_first_stage_encoding(
+                    z_inpaint).detach()
+                test_model_kwargs['inpaint_image'] = z_inpaint
+                test_model_kwargs['inpaint_mask'] = Resize(
+                    [z_inpaint.shape[-2], z_inpaint.shape[-1]])(
+                        test_model_kwargs['inpaint_mask'])
+
+                shape = [4, 512 // 8, 512 // 8]
+                samples_ddim, _ = self.sampler.sample(
+                    S=50,
+                    conditioning=c,
+                    batch_size=1,
+                    shape=shape,
+                    verbose=False,
+                    unconditional_guidance_scale=5,
+                    unconditional_conditioning=uc,
+                    eta=0.0,
+                    x_T=self.start_code,
+                    test_model_kwargs=test_model_kwargs)
+
+                x_samples_ddim = self.model.model.decode_first_stage(
+                    samples_ddim)
+                x_samples_ddim = torch.clamp(
+                    (x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
+                x_samples_ddim = x_samples_ddim.cpu().permute(0, 2, 3,
+                                                              1).numpy()
+
+                x_checked_image = x_samples_ddim
+                x_checked_image_torch = torch.from_numpy(
+                    x_checked_image).permute(0, 3, 1, 2)[0]
+
+                x_sample = 255. * rearrange(
+                    x_checked_image_torch.cpu().numpy(), 'c h w -> h w c')
+                img = Image.fromarray(x_sample.astype(np.uint8))
+        return img
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/cv/image_quality_assessment_degradation_pipeline.py b/modelscope/pipelines/cv/image_quality_assessment_degradation_pipeline.py
new file mode 100644
index 00000000..e9c07979
--- /dev/null
+++ b/modelscope/pipelines/cv/image_quality_assessment_degradation_pipeline.py
@@ -0,0 +1,95 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import math
+import tempfile
+from typing import Any, Dict, Optional, Union
+
+import cv2
+import numpy as np
+import torch
+from torchvision import transforms
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.image_quality_assessment_degradation import \
+    ImageQualityAssessmentDegradation
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_quality_assessment_degradation,
+    module_name=Pipelines.image_quality_assessment_degradation)
+class ImageQualityAssessmentDegradationPipeline(Pipeline):
+    """ Image Quality Assessment Degradation Pipeline which will return mean option score for the input image.
+
+        Example:
+
+        ```python
+        >>> from modelscope.pipelines import pipeline
+        >>> from modelscope.outputs import OutputKeys
+        >>> from modelscope.utils.constant import Tasks
+
+        >>> test_image = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/dogs.jpg'
+        >>> assessment_predictor = pipeline(Tasks.image_quality_assessment_degradation, \
+            model='damo/cv_resnet50_image-quality-assessment_degradation')
+        >>> out_res = assessment_predictor(test_image)[OutputKeys.SCORES]
+        >>> print('Pipeline: the output noise degree is {}, the output blur degree is {}, \
+                the output compression degree is {}'.format(out_res[0], out_res[1], out_res[2]))
+
+        ```
+        """
+
+    def __init__(self, model: Union[ImageQualityAssessmentDegradation, str],
+                 **kwargs):
+        """
+        use `model` to create image quality assessment degradation pipeline for prediction
+        Args:
+            model: model id on modelscope hub or `ImageQualityAssessmentDegradation` Model.
+            preprocessor: preprocessor for input image
+
+        """
+        super().__init__(model=model, **kwargs)
+
+        if torch.cuda.is_available():
+            self._device = torch.device('cuda')
+        else:
+            self._device = torch.device('cpu')
+
+        logger.info('load vqa-degradation model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        img = LoadImage.convert_to_img(input)
+        w, h = img.size
+        if h * w < 1280 * 720:
+            img = transforms.functional.resize(img, 720)
+        test_transforms = transforms.Compose([transforms.ToTensor()])
+        img = test_transforms(img).unsqueeze(0)
+        result = {'src': img.to(self._device)}
+        return result
+
+    @torch.no_grad()
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        inference for image quality assessment degradation prediction
+        Args:
+            input: dict including torch tensor.
+
+        """
+        outputs = self.model._inference_forward(input['src'])
+        noise_degree, blur_degree, comp_degree = outputs['noise_degree'].cpu(
+        ), outputs['blur_degree'].cpu(), outputs['comp_degree'].cpu()
+        return {
+            OutputKeys.SCORES:
+            [noise_degree.item(),
+             blur_degree.item(),
+             comp_degree.item()],
+            OutputKeys.LABELS: ['噪声强度', '模糊程度', '压缩强度']
+        }
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/cv/image_quality_assessment_mos_pipeline.py b/modelscope/pipelines/cv/image_quality_assessment_mos_pipeline.py
new file mode 100644
index 00000000..657c4405
--- /dev/null
+++ b/modelscope/pipelines/cv/image_quality_assessment_mos_pipeline.py
@@ -0,0 +1,80 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import math
+import tempfile
+from typing import Any, Dict, Optional, Union
+
+import cv2
+import numpy as np
+import torch
+from torchvision import transforms
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.image_quality_assessment_mos import \
+    ImageQualityAssessmentMos
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.preprocessors.cv import \
+    ImageQualityAssessmentMosPreprocessor as MosPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_quality_assessment_mos,
+    module_name=Pipelines.image_quality_assessment_mos)
+class ImageQualityAssessmentMosPipeline(Pipeline):
+    """ Image Quality Assessment MOS Pipeline which will return mean option score for the input image.
+
+        Example:
+
+        ```python
+        >>> from modelscope.pipelines import pipeline
+        >>> from modelscope.outputs import OutputKeys
+        >>> from modelscope.utils.constant import Tasks
+
+        >>> test_image = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/dogs.jpg'
+        >>> assessment_predictor = pipeline(Tasks.image_quality_assessment_mos, \
+            model='damo/cv_resnet_image-quality-assessment-mos_youtubeUGC')
+        >>> out_mos = assessment_predictor(test_image)[OutputKeys.SCORE]
+        >>> print('Pipeline: the output mos is {}'.format(out_mos))
+
+        ```
+        """
+
+    def __init__(self,
+                 model: Union[ImageQualityAssessmentMos, str],
+                 preprocessor=MosPreprocessor(),
+                 **kwargs):
+        """
+        use `model` to create image quality assessment mos pipeline for prediction
+        Args:
+            model: model id on modelscope hub or `ImageQualityAssessmentMos` Model.
+            preprocessor: preprocessor for input image
+
+        """
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+        if torch.cuda.is_available():
+            self._device = torch.device('cuda')
+        else:
+            self._device = torch.device('cpu')
+
+        logger.info('load vqa-mos model done')
+
+    @torch.no_grad()
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        inference for image quality assessment prediction
+        Args:
+            input: dict including torch tensor.
+
+        """
+        outputs = self.model.forward({'input': input['input']})['output'].cpu()
+        return {OutputKeys.SCORE: outputs.item()}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/cv/image_restoration_pipeline.py b/modelscope/pipelines/cv/image_restoration_pipeline.py
new file mode 100644
index 00000000..85c1f55e
--- /dev/null
+++ b/modelscope/pipelines/cv/image_restoration_pipeline.py
@@ -0,0 +1,54 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+
+
+@PIPELINES.register_module(
+    Tasks.image_demoireing, module_name=Pipelines.image_demoire)
+class ImageRestorationPipeline(Pipeline):
+    """ Image Restoration Pipeline .
+
+    Take image_demoireing as an example:
+        >>> from modelscope.pipelines import pipeline
+        >>> image_demoire = pipeline(Tasks.image_demoireing, model=model_id)
+        >>> image_demoire("https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_moire.jpg")
+
+    """
+
+    def __init__(self, model: str, **kwargs):
+        """
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, auto_collate=False, **kwargs)
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+
+        img = LoadImage.convert_to_ndarray(input)
+        img_h, img_w, _ = img.shape
+        result = self.preprocessor(img)
+        result['img_h'] = img_h
+        result['img_w'] = img_w
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+
+        output = self.model(input)
+        result = {
+            'img': output,
+            'img_w': input['img_w'],
+            'img_h': input['img_h']
+        }
+        return result
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+
+        data = inputs['img']
+        outputs = {OutputKeys.OUTPUT_IMG: data}
+        return outputs
diff --git a/modelscope/pipelines/cv/image_skychange_pipeline.py b/modelscope/pipelines/cv/image_skychange_pipeline.py
index c71135b5..8e5b40a8 100644
--- a/modelscope/pipelines/cv/image_skychange_pipeline.py
+++ b/modelscope/pipelines/cv/image_skychange_pipeline.py
@@ -22,22 +22,19 @@ logger = get_logger()
 @PIPELINES.register_module(
     Tasks.image_skychange, module_name=Pipelines.image_skychange)
 class ImageSkychangePipeline(Pipeline):
-    """ Image Sky Change Pipeline. Given two images(sky_image and scene_image),
-    pipeline will replace the sky style of sky_image with the sky style of scene_image.
-    Example:
+    """
+    Image Sky Change Pipeline. Given two images(sky_image and scene_image), pipeline will replace the sky style
+    of sky_image with the sky style of scene_image.
+
+    Examples:
 
-    ```python
     >>> from modelscope.pipelines import pipeline
     >>> detector = pipeline('image-skychange', 'damo/cv_hrnetocr_skychange')
     >>> detector({
             'sky_image': 'sky_image.jpg', # sky_image path (str)
             'scene_image': 'scene_image.jpg', # scene_image path (str)
         })
-       {
-        "output_img": [H * W * 3] 0~255, we can use cv2.imwrite to save output_img as an image.
-        }
-    >>> #
-    ```
+    >>> {"output_img": [H * W * 3] 0~255, we can use cv2.imwrite to save output_img as an image.}
     """
 
     def __init__(self, model: str, **kwargs):
diff --git a/modelscope/pipelines/cv/image_structured_model_probing_pipeline.py b/modelscope/pipelines/cv/image_structured_model_probing_pipeline.py
new file mode 100644
index 00000000..bc2561e2
--- /dev/null
+++ b/modelscope/pipelines/cv/image_structured_model_probing_pipeline.py
@@ -0,0 +1,79 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+import math
+import os
+import os.path as osp
+from typing import Any, Dict
+
+import numpy as np
+import torch
+import torchvision.transforms as transforms
+from mmcv.parallel import collate, scatter
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_classification,
+    module_name=Pipelines.image_structured_model_probing)
+class ImageStructuredModelProbingPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a vision middleware pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        Example:
+            >>> from modelscope.pipelines import pipeline
+            >>> recognition_pipeline = pipeline(self.task, self.model_id)
+            >>> file_name = 'data/test/images/\
+                image_structured_model_probing_test_image.jpg'
+            >>> result = recognition_pipeline(file_name)
+            >>> print(f'recognition output: {result}.')
+        """
+        super().__init__(model=model, **kwargs)
+        self.model.eval()
+        model_dir = os.path.join(model, 'food101-clip-vitl14-full.pt')
+        model_file = torch.load(model_dir)
+        self.label_map = model_file['meta_info']['label_map']
+        logger.info('load model done')
+
+        self.transform = transforms.Compose([
+            transforms.Resize((224, 224)),
+            transforms.ToTensor(),
+            transforms.Normalize(
+                mean=[0.48145466, 0.4578275, 0.40821073],
+                std=[0.26862954, 0.26130258, 0.27577711])
+        ])
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+
+        img = LoadImage.convert_to_img(input)
+
+        data = self.transform(img)
+        data = collate([data], samples_per_gpu=1)
+        if next(self.model.parameters()).is_cuda:
+            data = scatter(data, [next(self.model.parameters()).device])[0]
+
+        return data
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        with torch.no_grad():
+            results = self.model(input)
+            return results
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        scores = torch.softmax(inputs, dim=1).cpu()
+        labels = torch.argmax(scores, dim=1).cpu().tolist()
+        label_names = [self.label_map[label] for label in labels]
+
+        return {OutputKeys.LABELS: label_names, OutputKeys.SCORES: scores}
diff --git a/modelscope/pipelines/cv/language_guided_video_summarization_pipeline.py b/modelscope/pipelines/cv/language_guided_video_summarization_pipeline.py
index 862d4fc7..49444187 100755
--- a/modelscope/pipelines/cv/language_guided_video_summarization_pipeline.py
+++ b/modelscope/pipelines/cv/language_guided_video_summarization_pipeline.py
@@ -14,6 +14,7 @@ import torch
 from PIL import Image
 
 from modelscope.metainfo import Pipelines
+from modelscope.models.base.base_model import Model
 from modelscope.models.cv.language_guided_video_summarization import \
     ClipItVideoSummarization
 from modelscope.models.cv.language_guided_video_summarization.summarizer import (
@@ -44,8 +45,9 @@ class LanguageGuidedVideoSummarizationPipeline(Pipeline):
         """
         super().__init__(model=model, auto_collate=False, **kwargs)
         logger.info(f'loading model from {model}')
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
         self.model_dir = self.model.model_dir
-
         self.tmp_dir = kwargs.get('tmp_dir', None)
         if self.tmp_dir is None:
             self.tmp_dir = tempfile.TemporaryDirectory().name
diff --git a/modelscope/pipelines/cv/mobile_image_super_resolution_pipeline.py b/modelscope/pipelines/cv/mobile_image_super_resolution_pipeline.py
new file mode 100644
index 00000000..4ff98c8f
--- /dev/null
+++ b/modelscope/pipelines/cv/mobile_image_super_resolution_pipeline.py
@@ -0,0 +1,104 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+import skimage.color as sc
+import torch
+from torchvision import transforms
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.models.cv.super_resolution import ECBSRModel
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['MobileImageSuperResolutionPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.image_super_resolution,
+    module_name=Pipelines.mobile_image_super_resolution)
+class MobileImageSuperResolutionPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[ECBSRModel, str],
+                 preprocessor=None,
+                 **kwargs):
+        """The inference pipeline for all the image super-resolution tasks.
+
+        Args:
+            model (`str` or `Model` or module instance): A model instance or a model local dir
+                or a model id in the model hub.
+            preprocessor (`Preprocessor`, `optional`): A Preprocessor instance.
+            kwargs (dict, `optional`):
+                Extra kwargs passed into the preprocessor's constructor.
+
+        Example:
+            >>> from modelscope.pipelines import pipeline
+            >>> import cv2
+            >>> from modelscope.outputs import OutputKeys
+            >>> from modelscope.pipelines import pipeline
+            >>> from modelscope.utils.constant import Tasks
+            >>> sr = pipeline(Tasks.image_super_resolution, model='damo/cv_ecbsr_image-super-resolution_mobile')
+            >>> result = sr('data/test/images/butterfly_lrx2_y.png')
+            >>> cv2.imwrite('result.png', result[OutputKeys.OUTPUT_IMG])
+        """
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        self.model.eval()
+        self.config = self.model.config
+
+        self.y_input = self.model.config.model.y_input
+        self.tensor_max_value = self.model.config.model.tensor_max_value
+
+        if torch.cuda.is_available():
+            self._device = torch.device('cuda')
+        else:
+            self._device = torch.device('cpu')
+        logger.info('load image mobile sr model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        img = LoadImage.convert_to_img(input)
+
+        if self.y_input:
+            img = sc.rgb2ycbcr(img)[:, :, 0:1]
+
+        img = np.ascontiguousarray(img.transpose((2, 0, 1)))
+        img = torch.from_numpy(img).to(self._device)
+
+        img = img.float()
+        if self.tensor_max_value == 1.0:
+            img /= 255.0
+
+        result = {'input': img.unsqueeze(0)}
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+
+        def set_phase(model, is_train):
+            if is_train:
+                model.train()
+            else:
+                model.eval()
+
+        is_train = False
+        set_phase(self.model, is_train)
+        with torch.no_grad():
+            output = self.model(input)  # output Tensor
+
+        return {'output_tensor': output['outputs']}
+
+    def postprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        output = input['output_tensor'].squeeze(0)
+        if self.tensor_max_value == 1.0:
+            output *= 255.0
+
+        output = output.clamp(0, 255).to(torch.uint8)
+        output = output.permute(1, 2, 0).contiguous().cpu().numpy()
+
+        return {OutputKeys.OUTPUT_IMG: output}
diff --git a/modelscope/pipelines/cv/motion_generation_pipeline.py b/modelscope/pipelines/cv/motion_generation_pipeline.py
new file mode 100644
index 00000000..0d8a21c9
--- /dev/null
+++ b/modelscope/pipelines/cv/motion_generation_pipeline.py
@@ -0,0 +1,128 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os.path as osp
+import tempfile
+from typing import Any, Dict
+
+import numpy as np
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.motion_generation import (ClassifierFreeSampleModel,
+                                                    create_model,
+                                                    load_model_wo_clip)
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.cv.motion_utils.motion_process import recover_from_ric
+from modelscope.utils.cv.motion_utils.plot_script import plot_3d_motion
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.motion_generation, module_name=Pipelines.motion_generattion)
+class MDMMotionGeneration(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create motion generation pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE)
+        logger.info(f'loading model from {model_path}')
+        config_path = osp.join(self.model, ModelFile.CONFIGURATION)
+        logger.info(f'loading config from {config_path}')
+        self.mean = np.load(osp.join(self.model, 'Mean.npy'))
+        self.std = np.load(osp.join(self.model, 'Std.npy'))
+        self.cfg = Config.from_file(config_path)
+        self.cfg.update({'smpl_data_path': osp.join(self.model, 'smpl')})
+        self.cfg.update(kwargs)
+        self.n_joints = 22
+        self.fps = 20
+        self.n_frames = 120
+        self.mdm, self.diffusion = create_model(self.cfg)
+        state_dict = torch.load(model_path, map_location='cpu')
+        load_model_wo_clip(self.mdm, state_dict)
+        self.mdm = ClassifierFreeSampleModel(self.mdm)
+        self.mdm.to(self.device)
+        self.mdm.eval()
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        if isinstance(input, str):
+            input_text = input
+        else:
+            raise TypeError(f'input should be a str,'
+                            f'  but got {type(input)}')
+        result = {'input_text': input_text}
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        texts = [input['input_text']]
+        model_kwargs = {
+            'y': {
+                'mask': torch.ones(1, 1, 1, self.n_frames) > 0,
+                'lengths': torch.tensor([self.n_frames]),
+                'tokens': None,
+                'text': texts,
+                'scale': torch.ones(1, device=self.device) * 2.5
+            }
+        }
+        sample_fn = self.diffusion.p_sample_loop
+        sample = sample_fn(
+            self.mdm,
+            (1, self.mdm.njoints, self.mdm.nfeats, self.n_frames),
+            clip_denoised=False,
+            model_kwargs=model_kwargs,
+            skip_timesteps=0,
+            init_image=None,
+            progress=True,
+            dump_steps=None,
+            noise=None,
+            const_noise=False,
+        )
+        sample = (sample.cpu().permute(0, 2, 3, 1) * self.std
+                  + self.mean).float()
+        sample = recover_from_ric(sample, self.n_joints)
+        sample = sample.view(-1, *sample.shape[2:]).permute(0, 2, 3, 1)
+
+        sample = self.mdm.rot2xyz(
+            x=sample,
+            mask=None,
+            pose_rep='xyz',
+            glob=True,
+            translation=True,
+            jointstype='smpl',
+            vertstrans=True,
+            betas=None,
+            beta=0,
+            glob_rot=None,
+            get_rotations_back=False)
+        motion = sample.cpu().numpy()
+        motion = motion[0].transpose(2, 0, 1)
+        out = {OutputKeys.KEYPOINTS: motion, 'text': input['input_text']}
+        return out
+
+    def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        output_video_path = kwargs.get(
+            'output_video',
+            tempfile.NamedTemporaryFile(suffix='.mp4').name)
+        kinematic_chain = [[0, 2, 5, 8, 11], [0, 1, 4, 7, 10],
+                           [0, 3, 6, 9, 12, 15], [9, 14, 17, 19, 21],
+                           [9, 13, 16, 18, 20]]
+        if output_video_path is not None:
+            plot_3d_motion(
+                output_video_path,
+                kinematic_chain,
+                inputs[OutputKeys.KEYPOINTS],
+                inputs.pop('text'),
+                dataset='humanml',
+                fps=20)
+        inputs.update({OutputKeys.OUTPUT_VIDEO: output_video_path})
+        return inputs
diff --git a/modelscope/pipelines/cv/nerf_recon_acc_pipeline.py b/modelscope/pipelines/cv/nerf_recon_acc_pipeline.py
new file mode 100644
index 00000000..27bbb2e3
--- /dev/null
+++ b/modelscope/pipelines/cv/nerf_recon_acc_pipeline.py
@@ -0,0 +1,63 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.nerf_recon_acc import NeRFReconPreprocessor
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Model, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.nerf_recon_acc, module_name=Pipelines.nerf_recon_acc)
+class NeRFReconAccPipeline(Pipeline):
+    """ NeRF reconstruction acceleration pipeline
+    Example:
+
+    ```python
+    >>> from modelscope.pipelines import pipeline
+    >>> nerf_recon_acc = pipeline(Tasks.nerf_recon_acc,
+                'damo/cv_nerf-3d-reconstruction-accelerate_damo')
+    >>> nerf_recon_acc({
+            'video_input_path': 'input.mp4', # input video path (str)
+            'data_dir': '/data/lego', # data dir path (str)
+        })
+       {
+        "output": 'render.mp4' # saved path of render video (str)
+        }
+    >>> #
+    ```
+    """
+
+    def __init__(self, model, data_type='colmap', use_mask=True, **kwargs):
+        """
+        use `model` to create a image sky change pipeline for image editing
+        Args:
+            model (`str` or `Model`): model_id on modelscope hub
+            preprocessor(`Preprocessor`, *optional*,  defaults to None): `NeRFReconPreprocessor`.
+        """
+        super().__init__(model=model, **kwargs)
+        if not isinstance(self.model, Model):
+            logger.error('model object is not initialized.')
+            raise Exception('model object is not initialized.')
+        self.data_type = data_type
+        if self.data_type != 'blender' and self.data_type != 'colmap':
+            raise Exception('data type {} is not support currently'.format(
+                self.data_type))
+        self.use_mask = use_mask
+
+        self.preprocessor = NeRFReconPreprocessor(
+            data_type=self.data_type, use_mask=self.use_mask)
+        logger.info('load model done')
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        data_dir = input['data_dir']
+        result = self.model.nerf_reconstruction(data_dir)
+        return {OutputKeys.OUTPUT_VIDEO: result}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/cv/object_detection_3d_pipeline.py b/modelscope/pipelines/cv/object_detection_3d_pipeline.py
new file mode 100644
index 00000000..3374a002
--- /dev/null
+++ b/modelscope/pipelines/cv/object_detection_3d_pipeline.py
@@ -0,0 +1,142 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import os.path as osp
+from tempfile import TemporaryDirectory
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import PIL
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.object_detection_3d.depe import DepeDetect
+from modelscope.models.cv.object_detection_3d.depe.result_vis import \
+    plot_result
+from modelscope.msdatasets import MsDataset
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.config import Config
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.object_detection_3d, module_name=Pipelines.object_detection_3d_depe)
+class ObjectDetection3DPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a 3d object detection pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+
+        Example:
+            >>> import cv2
+            >>> from modelscope.pipelines import pipeline
+            >>> from modelscope.msdatasets import MsDataset
+            >>> ms_ds_nuscenes = MsDataset.load('nuScenes_mini', namespace='shaoxuan')
+            >>> data_path = ms_ds_nuscenes.config_kwargs['split_config']
+            >>> val_dir = data_path['validation']
+            >>> val_root = val_dir + '/' + os.listdir(val_dir)[0] + '/'
+            >>> depe = pipeline('object-detection-3d', model='damo/cv_object-detection-3d_depe')
+            >>> input_dict = {'data_root': val_root, 'sample_idx': 0}
+            >>> result = depe(input_dict)
+            >>> cv2.imwrite('result.jpg', result['output_img'])
+        """
+        super().__init__(model=model, **kwargs)
+        config_path = osp.join(model, 'mmcv_depe.py')
+        self.cfg = Config.from_file(config_path)
+        if torch.cuda.is_available():
+            self.device = torch.device('cuda')
+        else:
+            self.device = torch.device('cpu')
+        self.detector = DepeDetect(model).to(self.device)
+
+    def __call__(self, input, **kwargs):
+        """
+        Detect 3D objects in images from multi-cameras that passed as inputs
+
+        Args:
+            input (`Dict[str, Any]`):
+                A dictionary of input consist 2 keys:
+                - `data_root` is the path of input data in nuScenes format,
+                you can create your own data according steps from model-card,
+                if `data_root` is False, a default input data from
+                nuScenes-mini validation set will be used, which includes 81
+                samples from 2 scenes.
+                - `sample_idx` is the index of sample to be inferenced, the
+                value should in range of sample number in input data.
+
+        Return:
+            A dictionary of result consist 1 keys:
+            - `output_img` plots all detection results in one image.
+
+        """
+        return super().__call__(input, **kwargs)
+
+    def get_default_data(self):
+        ms_ds_nuscenes = MsDataset.load('nuScenes_mini', namespace='shaoxuan')
+        data_path = ms_ds_nuscenes.config_kwargs['split_config']
+        val_dir = data_path['validation']
+        val_root = val_dir + '/' + os.listdir(val_dir)[0] + '/'
+        return val_root
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        assert 'sample_idx' in input
+        idx = input['sample_idx']
+        if isinstance(input['sample_idx'], str):
+            input['sample_idx'] = int(input['sample_idx'])
+        data_root = input.get('data_root', False)
+        if data_root is False:
+            data_root = self.get_default_data()
+            logger.info(f'Note: forward using default data in: {data_root}')
+        try:
+            if not os.path.exists('/data/Dataset'):
+                os.system('mkdir -p /data/Dataset')
+            os.system(f'ln -snf {data_root} /data/Dataset/nuScenes')
+        except Exception as e:
+            raise RuntimeError(
+                f'exception:{e}, please make sure to have permission create and write in: /data/Dataset'
+            )
+        # build the dataloader
+        from mmdet3d.datasets import build_dataloader, build_dataset
+        self.cfg.data.test.idx_range = (idx, idx + 1)
+        self.cfg.data.test.test_mode = True
+        self.dataset = build_dataset(self.cfg.data.test)
+        data_loader = build_dataloader(
+            self.dataset,
+            samples_per_gpu=1,
+            workers_per_gpu=4,
+            dist=False,
+            shuffle=False)
+        result = next(iter(data_loader))
+        if 'img_metas' in result:
+            from mmcv.parallel import scatter
+            if next(self.detector.parameters()).is_cuda:
+                # scatter to specified GPU
+                result = scatter(
+                    result, [next(self.detector.parameters()).device.index])[0]
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        with torch.no_grad():
+            result = self.detector(**input)
+        return result
+
+    def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        save_path = kwargs.get('save_path', None)
+        if save_path is None:
+            save_path = TemporaryDirectory().name
+        else:
+            if not os.path.exists(save_path):
+                os.makedirs(save_path)
+        file_path = osp.join(save_path, 'pts_bbox', 'results_nusc.json')
+        kwargs_format = {'jsonfile_prefix': save_path}
+        self.dataset.format_results(inputs, **kwargs_format)
+        logger.info(f'Done, results saved into: {file_path}')
+        result_img = plot_result(file_path, vis_thred=0.3)[0]
+        return {OutputKeys.OUTPUT_IMG: result_img.astype(np.uint8)}
diff --git a/modelscope/pipelines/cv/ocr_detection_pipeline.py b/modelscope/pipelines/cv/ocr_detection_pipeline.py
index 682b05c4..ed198b66 100644
--- a/modelscope/pipelines/cv/ocr_detection_pipeline.py
+++ b/modelscope/pipelines/cv/ocr_detection_pipeline.py
@@ -12,14 +12,14 @@ from modelscope.metainfo import Pipelines
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
-from modelscope.pipelines.cv.ocr_utils.model_vlpt import VLPTModel
+from modelscope.pipelines.cv.ocr_utils.model_vlpt import DBModel, VLPTModel
 from modelscope.preprocessors import LoadImage
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.device import device_placement
 from modelscope.utils.logger import get_logger
-from .ocr_utils import (SegLinkDetector, cal_width, combine_segments_python,
-                        decode_segments_links_python, nms_python,
-                        polygons_from_bitmap, rboxes_to_polygons)
+from .ocr_utils import (SegLinkDetector, boxes_from_bitmap, cal_width,
+                        combine_segments_python, decode_segments_links_python,
+                        nms_python, polygons_from_bitmap, rboxes_to_polygons)
 
 if tf.__version__ >= '2.0':
     import tf_slim as slim
@@ -48,6 +48,33 @@ tf.app.flags.DEFINE_float('link_threshold', 0.6,
 @PIPELINES.register_module(
     Tasks.ocr_detection, module_name=Pipelines.ocr_detection)
 class OCRDetectionPipeline(Pipeline):
+    """ OCR Recognition Pipeline.
+
+    Example:
+
+    ```python
+    >>> from modelscope.pipelines import pipeline
+
+    >>> ocr_detection = pipeline('ocr_detection', model='damo/cv_resnet18_ocr-detection-line-level_damo')
+    >>> result = ocr_detection('https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/ocr_detection.jpg')
+
+        {'polygons': array([[220,  14, 780,  14, 780,  64, 220,  64],
+       [196, 369, 604, 370, 604, 425, 196, 425],
+       [ 21, 730, 425, 731, 425, 787,  21, 786],
+       [421, 731, 782, 731, 782, 789, 421, 789],
+       [  0, 121, 109,   0, 147,  35,  26, 159],
+       [697, 160, 773, 160, 773, 197, 697, 198],
+       [547, 205, 623, 205, 623, 244, 547, 244],
+       [548, 161, 623, 161, 623, 199, 547, 199],
+       [698, 206, 772, 206, 772, 244, 698, 244]])}
+    ```
+    note:
+    model = damo/cv_resnet18_ocr-detection-line-level_damo, for general text line detection, based on SegLink++.
+    model = damo/cv_resnet18_ocr-detection-word-level_damo, for general text word detection, based on SegLink++.
+    model = damo/cv_resnet50_ocr-detection-vlpt, for toaltext dataset, based on VLPT_pretrained DBNet.
+    model = damo/cv_resnet18_ocr-detection-db-line-level_damo, for general text line detection, based on DBNet.
+
+    """
 
     def __init__(self, model: str, **kwargs):
         """
@@ -57,6 +84,7 @@ class OCRDetectionPipeline(Pipeline):
         """
         super().__init__(model=model, **kwargs)
         if 'vlpt' in self.model:
+            # for model cv_resnet50_ocr-detection-vlpt
             model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE)
             logger.info(f'loading model from {model_path}')
 
@@ -71,7 +99,24 @@ class OCRDetectionPipeline(Pipeline):
                 self.infer_model.load_state_dict(checkpoint['state_dict'])
             else:
                 self.infer_model.load_state_dict(checkpoint)
+        elif 'db' in self.model:
+            # for model cv_resnet18_ocr-detection-db-line-level_damo (original dbnet)
+            model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE)
+            logger.info(f'loading model from {model_path}')
+
+            self.thresh = 0.2
+            self.image_short_side = 736
+            self.device = torch.device(
+                'cuda' if torch.cuda.is_available() else 'cpu')
+            self.infer_model = DBModel().to(self.device)
+            self.infer_model.eval()
+            checkpoint = torch.load(model_path, map_location=self.device)
+            if 'state_dict' in checkpoint:
+                self.infer_model.load_state_dict(checkpoint['state_dict'])
+            else:
+                self.infer_model.load_state_dict(checkpoint)
         else:
+            # for model seglink++
             tf.reset_default_graph()
             model_path = osp.join(
                 osp.join(self.model, ModelFile.TF_CHECKPOINT_FOLDER),
@@ -147,9 +192,8 @@ class OCRDetectionPipeline(Pipeline):
                         model_loader.restore(sess, model_path)
 
     def preprocess(self, input: Input) -> Dict[str, Any]:
-        if 'vlpt' in self.model:
+        if 'vlpt' in self.model or 'db' in self.model:
             img = LoadImage.convert_to_ndarray(input)[:, :, ::-1]
-
             height, width, _ = img.shape
             if height < width:
                 new_height = self.image_short_side
@@ -160,7 +204,6 @@ class OCRDetectionPipeline(Pipeline):
                 new_height = int(
                     math.ceil(new_width / width * height / 32) * 32)
             resized_img = cv2.resize(img, (new_width, new_height))
-
             resized_img = resized_img - np.array([123.68, 116.78, 103.94],
                                                  dtype=np.float32)
             resized_img /= 255.
@@ -192,7 +235,7 @@ class OCRDetectionPipeline(Pipeline):
             return result
 
     def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
-        if 'vlpt' in self.model:
+        if 'vlpt' in self.model or 'db' in self.model:
             pred = self.infer_model(input['img'])
             return {'results': pred, 'org_shape': input['org_shape']}
         else:
@@ -213,6 +256,15 @@ class OCRDetectionPipeline(Pipeline):
                                                  height)
             result = {OutputKeys.POLYGONS: np.array(boxes)}
             return result
+        elif 'db' in self.model:
+            pred = inputs['results'][0]
+            height, width = inputs['org_shape']
+            segmentation = pred > self.thresh
+
+            boxes, scores = boxes_from_bitmap(pred, segmentation, width,
+                                              height)
+            result = {OutputKeys.POLYGONS: np.array(boxes)}
+            return result
         else:
             rboxes = inputs['combined_rboxes'][0]
             count = inputs['combined_counts'][0]
diff --git a/modelscope/pipelines/cv/ocr_recognition_pipeline.py b/modelscope/pipelines/cv/ocr_recognition_pipeline.py
index d90f8db6..f5b2f667 100644
--- a/modelscope/pipelines/cv/ocr_recognition_pipeline.py
+++ b/modelscope/pipelines/cv/ocr_recognition_pipeline.py
@@ -1,133 +1,74 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import math
-import os.path as osp
-from typing import Any, Dict
-
-import cv2
-import numpy as np
-import PIL
-import torch
-
 from modelscope.metainfo import Pipelines
+from modelscope.models.cv.ocr_recognition import OCRRecognition
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
-from modelscope.pipelines.cv.ocr_utils.model_convnext_transformer import \
-    OCRRecModel
-from modelscope.preprocessors import load_image
-from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.constant import Tasks
 from modelscope.utils.logger import get_logger
 
 logger = get_logger()
 
-# constant
-NUM_CLASSES = 7644
-IMG_HEIGHT = 32
-IMG_WIDTH = 300
-PRED_LENTH = 75
-PRED_PAD = 6
-
 
 @PIPELINES.register_module(
     Tasks.ocr_recognition, module_name=Pipelines.ocr_recognition)
 class OCRRecognitionPipeline(Pipeline):
+    """ OCR Recognition Pipeline.
+
+    Example:
+
+    ```python
+    >>> from modelscope.pipelines import pipeline
+
+    >>> ocr_recognition = pipeline('ocr-recognition', 'damo/cv_crnn_ocr-recognition-general_damo')
+    >>> ocr_recognition("http://duguang-labelling.oss-cn-shanghai.aliyuncs.com"
+        "/mass_img_tmp_20220922/ocr_recognition_handwritten.jpg")
+
+        {'text': '电子元器件提供BOM配单'}
+    ```
+    """
 
     def __init__(self, model: str, **kwargs):
         """
+        use `model` to create a ocr recognition pipeline for prediction
         Args:
-            model: model id on modelscope hub.
+            model: model id on modelscope hub or `OCRRecognition` Model.
+            preprocessor: `OCRRecognitionPreprocessor`.
         """
+        assert isinstance(model, str), 'model must be a single str'
         super().__init__(model=model, **kwargs)
-        model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE)
-        label_path = osp.join(self.model, 'label_dict.txt')
-        logger.info(f'loading model from {model_path}')
+        logger.info(f'loading model from dir {model}')
+        self.ocr_recognizer = self.model.to(self.device)
+        self.ocr_recognizer.eval()
+        logger.info('loading model done')
 
-        self.device = torch.device(
-            'cuda' if torch.cuda.is_available() else 'cpu')
-        self.infer_model = OCRRecModel(NUM_CLASSES).to(self.device)
-        self.infer_model.eval()
-        self.infer_model.load_state_dict(
-            torch.load(model_path, map_location=self.device))
-        self.labelMapping = dict()
-        with open(label_path, 'r', encoding='utf-8') as f:
-            lines = f.readlines()
-            cnt = 2
-            for line in lines:
-                line = line.strip('\n')
-                self.labelMapping[cnt] = line
-                cnt += 1
+    def __call__(self, input, **kwargs):
+        """
+        Recognize text sequence in the text image.
 
-    def preprocess(self, input: Input) -> Dict[str, Any]:
-        if isinstance(input, str):
-            img = np.array(load_image(input).convert('L'))
-        elif isinstance(input, PIL.Image.Image):
-            img = np.array(input.convert('L'))
-        elif isinstance(input, np.ndarray):
-            if len(input.shape) == 3:
-                img = cv2.cvtColor(input, cv2.COLOR_RGB2GRAY)
-        else:
-            raise TypeError(f'input should be either str, PIL.Image,'
-                            f' np.array, but got {type(input)}')
-        data = []
-        img_h, img_w = img.shape
-        wh_ratio = img_w / img_h
-        true_w = int(IMG_HEIGHT * wh_ratio)
-        split_batch_cnt = 1
-        if true_w < IMG_WIDTH * 1.2:
-            img = cv2.resize(img, (min(true_w, IMG_WIDTH), IMG_HEIGHT))
-        else:
-            split_batch_cnt = math.ceil((true_w - 48) * 1.0 / 252)
-            img = cv2.resize(img, (true_w, IMG_HEIGHT))
+        Args:
+            input (`Image`):
+                The pipeline handles three types of images:
 
-        if split_batch_cnt == 1:
-            mask = np.zeros((IMG_HEIGHT, IMG_WIDTH))
-            mask[:, :img.shape[1]] = img
-            data.append(mask)
-        else:
-            for idx in range(split_batch_cnt):
-                mask = np.zeros((IMG_HEIGHT, IMG_WIDTH))
-                left = (PRED_LENTH * 4 - PRED_PAD * 4) * idx
-                trunk_img = img[:, left:min(left + PRED_LENTH * 4, true_w)]
-                mask[:, :trunk_img.shape[1]] = trunk_img
-                data.append(mask)
+                - A string containing an HTTP link pointing to an image
+                - A string containing a local path to an image
+                - An image loaded in PIL or opencv directly
 
-        data = torch.FloatTensor(data).view(
-            len(data), 1, IMG_HEIGHT, IMG_WIDTH) / 255.
-        data = data.to(self.device)
+                The pipeline currently supports single image input.
 
-        result = {'img': data}
+        Return:
+            A text sequence (string) of the input text image.
+        """
+        return super().__call__(input, **kwargs)
 
-        return result
+    def preprocess(self, inputs):
+        outputs = self.preprocessor(inputs)
+        return outputs
 
-    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
-        pred = self.infer_model(input['img'])
-        return {'results': pred}
+    def forward(self, inputs):
+        outputs = self.ocr_recognizer(inputs)
+        return outputs
 
-    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
-        preds = inputs['results']
-        batchSize, length = preds.shape
-        pred_idx = []
-        if batchSize == 1:
-            pred_idx = preds[0].cpu().data.tolist()
-        else:
-            for idx in range(batchSize):
-                if idx == 0:
-                    pred_idx.extend(preds[idx].cpu().data[:PRED_LENTH
-                                                          - PRED_PAD].tolist())
-                elif idx == batchSize - 1:
-                    pred_idx.extend(preds[idx].cpu().data[PRED_PAD:].tolist())
-                else:
-                    pred_idx.extend(preds[idx].cpu().data[PRED_PAD:PRED_LENTH
-                                                          - PRED_PAD].tolist())
-
-        # ctc decoder
-        last_p = 0
-        str_pred = []
-        for p in pred_idx:
-            if p != last_p and p != 0:
-                str_pred.append(self.labelMapping[p])
-            last_p = p
-
-        final_str = ''.join(str_pred)
-        result = {OutputKeys.TEXT: final_str}
-        return result
+    def postprocess(self, inputs):
+        outputs = {OutputKeys.TEXT: inputs}
+        return outputs
diff --git a/modelscope/pipelines/cv/ocr_utils/__init__.py b/modelscope/pipelines/cv/ocr_utils/__init__.py
index 979ea82c..2d40201a 100644
--- a/modelscope/pipelines/cv/ocr_utils/__init__.py
+++ b/modelscope/pipelines/cv/ocr_utils/__init__.py
@@ -6,14 +6,15 @@ from modelscope.utils.import_utils import LazyImportModule
 if TYPE_CHECKING:
     from .model_resnet_mutex_v4_linewithchar import SegLinkDetector
     from .ops import decode_segments_links_python, combine_segments_python
-    from .utils import rboxes_to_polygons, cal_width, nms_python, polygons_from_bitmap
+    from .utils import (rboxes_to_polygons, cal_width, nms_python,
+                        polygons_from_bitmap, rboxes_from_bitmap)
 else:
     _import_structure = {
         'model_resnet_mutex_v4_linewithchar': ['SegLinkDetector'],
         'ops': ['decode_segments_links_python', 'combine_segments_python'],
         'utils': [
             'rboxes_to_polygons', 'cal_width', 'nms_python',
-            'polygons_from_bitmap'
+            'polygons_from_bitmap', 'boxes_from_bitmap'
         ]
     }
 
diff --git a/modelscope/pipelines/cv/ocr_utils/model_vlpt.py b/modelscope/pipelines/cv/ocr_utils/model_vlpt.py
index 19ac9807..a84502cb 100644
--- a/modelscope/pipelines/cv/ocr_utils/model_vlpt.py
+++ b/modelscope/pipelines/cv/ocr_utils/model_vlpt.py
@@ -429,3 +429,15 @@ class VLPTModel(nn.Module):
 
     def forward(self, x):
         return self.decoder(self.backbone(x))
+
+
+class DBModel(nn.Module):
+
+    def __init__(self, *args, **kwargs):
+        super(DBModel, self).__init__()
+        self.backbone = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
+        self.decoder = SegDetector(
+            in_channels=[64, 128, 256, 512], adaptive=True, k=50, **kwargs)
+
+    def forward(self, x):
+        return self.decoder(self.backbone(x))
diff --git a/modelscope/pipelines/cv/ocr_utils/ops.py b/modelscope/pipelines/cv/ocr_utils/ops.py
index a36838a6..73b58c38 100644
--- a/modelscope/pipelines/cv/ocr_utils/ops.py
+++ b/modelscope/pipelines/cv/ocr_utils/ops.py
@@ -68,7 +68,7 @@ def _nn_variable(name, shape, init_method, collection=None, **kwargs):
     shape: variable shape
     init_method: 'zero', 'kaiming', 'xavier', or (mean, std)
     collection: if not none, add variable to this collection
-    kwargs: extra paramters passed to tf.get_variable
+    kwargs: extra parameters passed to tf.get_variable
   RETURN
     var: a new or existing variable
   """
diff --git a/modelscope/pipelines/cv/ocr_utils/utils.py b/modelscope/pipelines/cv/ocr_utils/utils.py
index b024844d..6de22b3f 100644
--- a/modelscope/pipelines/cv/ocr_utils/utils.py
+++ b/modelscope/pipelines/cv/ocr_utils/utils.py
@@ -164,13 +164,59 @@ def polygons_from_bitmap(pred, _bitmap, dest_width, dest_height):
     return boxes, scores
 
 
+def boxes_from_bitmap(pred, _bitmap, dest_width, dest_height):
+    """
+    _bitmap: single map with shape (1, H, W),
+        whose values are binarized as {0, 1}
+    """
+
+    assert _bitmap.size(0) == 1
+    bitmap = _bitmap.cpu().numpy()[0]
+    pred = pred.cpu().detach().numpy()[0]
+    height, width = bitmap.shape
+    boxes = []
+    scores = []
+
+    contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8),
+                                   cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
+
+    for contour in contours[:100]:
+        points, sside = get_mini_boxes(contour)
+        if sside < 3:
+            continue
+        points = np.array(points)
+
+        score = box_score_fast(pred, points.reshape(-1, 2))
+        if 0.3 > score:
+            continue
+
+        box = unclip(points, unclip_ratio=1.5).reshape(-1, 1, 2)
+        box, sside = get_mini_boxes(box)
+
+        if sside < 3 + 2:
+            continue
+
+        box = np.array(box).astype(np.int32)
+        if not isinstance(dest_width, int):
+            dest_width = dest_width.item()
+            dest_height = dest_height.item()
+
+        box[:, 0] = np.clip(
+            np.round(box[:, 0] / width * dest_width), 0, dest_width)
+        box[:, 1] = np.clip(
+            np.round(box[:, 1] / height * dest_height), 0, dest_height)
+        boxes.append(box.reshape(-1).tolist())
+        scores.append(score)
+    return boxes, scores
+
+
 def box_score_fast(bitmap, _box):
     h, w = bitmap.shape[:2]
     box = _box.copy()
-    xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int), 0, w - 1)
-    xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int), 0, w - 1)
-    ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int), 0, h - 1)
-    ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int), 0, h - 1)
+    xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int32), 0, w - 1)
+    xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int32), 0, w - 1)
+    ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int32), 0, h - 1)
+    ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int32), 0, h - 1)
 
     mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
     box[:, 0] = box[:, 0] - xmin
diff --git a/modelscope/pipelines/cv/panorama_depth_estimation_pipeline.py b/modelscope/pipelines/cv/panorama_depth_estimation_pipeline.py
index a1973285..c53ef981 100644
--- a/modelscope/pipelines/cv/panorama_depth_estimation_pipeline.py
+++ b/modelscope/pipelines/cv/panorama_depth_estimation_pipeline.py
@@ -25,8 +25,8 @@ class PanoramaDepthEstimationPipeline(Pipeline):
     """ This pipeline will estimation the depth panoramic image from one rgb panoramic image.
         The input panoramic image should be equirectanlar, in the size of 512x1024.
 
-    Example:
-    '''python
+    Examples:
+
     >>> import cv2
     >>> from modelscope.outputs import OutputKeys
     >>> from modelscope.pipelines import pipeline
@@ -40,7 +40,6 @@ class PanoramaDepthEstimationPipeline(Pipeline):
     >>> result = estimator(input_location)
     >>> depth_vis = result[OutputKeys.DEPTHS_COLOR]
     >>> cv2.imwrite('result.jpg', depth_vis)
-    '''
     """
 
     def __init__(self, model: str, **kwargs):
diff --git a/modelscope/pipelines/cv/realtime_object_detection_pipeline.py b/modelscope/pipelines/cv/realtime_object_detection_pipeline.py
deleted file mode 100644
index 9f558f88..00000000
--- a/modelscope/pipelines/cv/realtime_object_detection_pipeline.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import os.path as osp
-from typing import Any, Dict, List, Union
-
-import cv2
-import json
-import numpy as np
-import torch
-from PIL import Image
-from torchvision import transforms
-
-from modelscope.metainfo import Pipelines
-from modelscope.models.cv.realtime_object_detection import RealtimeDetector
-from modelscope.outputs import OutputKeys
-from modelscope.pipelines import pipeline
-from modelscope.pipelines.base import Input, Model, Pipeline, Tensor
-from modelscope.pipelines.builder import PIPELINES
-from modelscope.preprocessors import load_image
-from modelscope.utils.constant import ModelFile, Tasks
-from modelscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-@PIPELINES.register_module(
-    Tasks.image_object_detection,
-    module_name=Pipelines.realtime_object_detection)
-class RealtimeObjectDetectionPipeline(Pipeline):
-
-    def __init__(self, model: str, **kwargs):
-        super().__init__(model=model, **kwargs)
-        self.model = RealtimeDetector(model)
-
-    def preprocess(self, input: Input) -> Dict[Tensor, Union[str, np.ndarray]]:
-        output = self.model.preprocess(input)
-        return {'pre_output': output}
-
-    def forward(self, input: Tensor) -> Dict[Tensor, Dict[str, np.ndarray]]:
-        pre_output = input['pre_output']
-        forward_output = self.model(pre_output)
-        return {'forward_output': forward_output}
-
-    def postprocess(self, input: Dict[Tensor, Dict[str, np.ndarray]],
-                    **kwargs) -> str:
-        forward_output = input['forward_output']
-        bboxes, scores, labels = forward_output
-        return {
-            OutputKeys.BOXES: bboxes,
-            OutputKeys.SCORES: scores,
-            OutputKeys.LABELS: labels,
-        }
diff --git a/modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py b/modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py
index 073fad66..ed2c0d35 100644
--- a/modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py
+++ b/modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py
@@ -10,8 +10,7 @@ from PIL import Image
 from torchvision import transforms
 
 from modelscope.metainfo import Pipelines
-from modelscope.models.cv.realtime_object_detection import \
-    RealtimeVideoDetector
+from modelscope.models.cv.stream_yolo import RealtimeVideoDetector
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Input, Model, Pipeline, Tensor
diff --git a/modelscope/pipelines/cv/video_colorization_pipeline.py b/modelscope/pipelines/cv/video_colorization_pipeline.py
new file mode 100644
index 00000000..0d38331d
--- /dev/null
+++ b/modelscope/pipelines/cv/video_colorization_pipeline.py
@@ -0,0 +1,164 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import subprocess
+import tempfile
+from typing import Any, Dict
+
+import cv2
+import numpy as np
+import PIL
+import torch
+from torchvision import models, transforms
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.image_colorization import (DynamicUnetDeep,
+                                                     DynamicUnetWide, NormType)
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors.cv import VideoReader
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.video_colorization, module_name=Pipelines.video_colorization)
+class VideoColorizationPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a video colorization pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        self.cut = 8
+        self.size = 512
+        if torch.cuda.is_available():
+            self.device = torch.device('cuda')
+        else:
+            self.device = torch.device('cpu')
+
+        self.orig_img = None
+        self.model_type = 'stable'
+        self.norm = transforms.Compose([
+            transforms.ToTensor(),
+            transforms.Normalize(
+                mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        ])
+        self.denorm = transforms.Normalize(
+            mean=[-0.485 / 0.229, -0.456 / 0.224, -0.406 / 0.225],
+            std=[1 / 0.229, 1 / 0.224, 1 / 0.225])
+
+        if self.model_type == 'stable':
+            body = models.resnet101(pretrained=True)
+            body = torch.nn.Sequential(*list(body.children())[:self.cut])
+            self.model = DynamicUnetWide(
+                body,
+                n_classes=3,
+                blur=True,
+                blur_final=True,
+                self_attention=True,
+                y_range=(-3.0, 3.0),
+                norm_type=NormType.Spectral,
+                last_cross=True,
+                bottle=False,
+                nf_factor=2,
+            ).to(self.device)
+        else:
+            body = models.resnet34(pretrained=True)
+            body = torch.nn.Sequential(*list(body.children())[:self.cut])
+            self.model = DynamicUnetDeep(
+                body,
+                n_classes=3,
+                blur=True,
+                blur_final=True,
+                self_attention=True,
+                y_range=(-3.0, 3.0),
+                norm_type=NormType.Spectral,
+                last_cross=True,
+                bottle=False,
+                nf_factor=1.5,
+            ).to(self.device)
+
+        model_path = f'{model}/{ModelFile.TORCH_MODEL_FILE}'
+        self.model.load_state_dict(
+            torch.load(model_path, map_location=torch.device('cpu'))['model'],
+            strict=True)
+
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        # input is a video file
+        video_reader = VideoReader(input)
+        inputs = []
+        for frame in video_reader:
+            inputs.append(frame)
+        fps = video_reader.fps
+
+        self.orig_inputs = inputs.copy()
+        self.height, self.width = inputs[0].shape[:2]
+        if self.width * self.height < 100000:
+            self.size = 256
+
+        for i, img in enumerate(inputs):
+            img = PIL.Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
+            img = img.resize((self.size, self.size),
+                             resample=PIL.Image.BILINEAR)
+            img = self.norm(img).unsqueeze(0)
+            inputs[i] = img
+
+        return {'video': inputs, 'fps': fps}
+
+    def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        self.model.eval()
+        outputs = []
+        with torch.no_grad():
+            for i, img in enumerate(inputs['video']):
+                img = img.to(self.device)
+                out = self.model(img)[0]
+
+                out = self.denorm(out)
+                out = out.float().clamp(min=0, max=1)
+                out_img = (out.permute(1, 2, 0).flip(2).cpu().numpy()
+                           * 255).astype(np.uint8)
+
+                color_np = cv2.resize(out_img, (self.width, self.height))
+                orig_np = np.asarray(self.orig_inputs[i])
+                color_yuv = cv2.cvtColor(color_np, cv2.COLOR_BGR2YUV)
+                orig_yuv = cv2.cvtColor(orig_np, cv2.COLOR_BGR2YUV)
+
+                hires = np.copy(orig_yuv)
+                hires[:, :, 1:3] = color_yuv[:, :, 1:3]
+                out_img = cv2.cvtColor(hires, cv2.COLOR_YUV2BGR)
+
+                outputs.append(out_img)
+
+        return {'output': outputs, 'fps': inputs['fps']}
+
+    def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        output_video_path = kwargs.get('output_video', None)
+        demo_service = kwargs.get('demo_service', True)
+        if output_video_path is None:
+            output_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name
+
+        h, w = inputs['output'][0].shape[:2]
+        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        video_writer = cv2.VideoWriter(output_video_path, fourcc,
+                                       inputs['fps'], (w, h))
+        for i in range(len(inputs['output'])):
+            img = inputs['output'][i]
+            video_writer.write(img)
+        video_writer.release()
+
+        if demo_service:
+            assert os.system(
+                'ffmpeg -version') == 0, 'ffmpeg is not installed correctly!'
+            output_video_path_for_web = output_video_path[:-4] + '_web.mp4'
+            convert_cmd = f'ffmpeg -i {output_video_path} -vcodec h264 -crf 5 {output_video_path_for_web}'
+            subprocess.call(convert_cmd, shell=True)
+            return {OutputKeys.OUTPUT_VIDEO: output_video_path_for_web}
+        else:
+            return {OutputKeys.OUTPUT_VIDEO: output_video_path}
diff --git a/modelscope/pipelines/cv/video_deinterlace_pipeline.py b/modelscope/pipelines/cv/video_deinterlace_pipeline.py
new file mode 100644
index 00000000..e30a6678
--- /dev/null
+++ b/modelscope/pipelines/cv/video_deinterlace_pipeline.py
@@ -0,0 +1,186 @@
+# The implementation here is modified based on RealBasicVSR,
+# originally Apache 2.0 License and publicly avaialbe at
+# https://github.com/ckkelvinchan/RealBasicVSR/blob/master/inference_realbasicvsr.py
+import math
+import os
+import subprocess
+import tempfile
+from typing import Any, Dict, Optional, Union
+
+import cv2
+import numpy as np
+import torch
+from torchvision.utils import make_grid
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.video_deinterlace.UNet_for_video_deinterlace import \
+    UNetForVideoDeinterlace
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors.cv import VideoReader
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+VIDEO_EXTENSIONS = ('.mp4', '.mov')
+
+logger = get_logger()
+
+
+def tensor2img(tensor, out_type=np.uint8, min_max=(0, 1)):
+    """Convert torch Tensors into image numpy arrays.
+    After clamping to (min, max), image values will be normalized to [0, 1].
+    For different tensor shapes, this function will have different behaviors:
+        1. 4D mini-batch Tensor of shape (N x 3/1 x H x W):
+            Use `make_grid` to stitch images in the batch dimension, and then
+            convert it to numpy array.
+        2. 3D Tensor of shape (3/1 x H x W) and 2D Tensor of shape (H x W):
+            Directly change to numpy array.
+    Note that the image channel in input tensors should be RGB order. This
+    function will convert it to cv2 convention, i.e., (H x W x C) with BGR
+    order.
+    Args:
+        tensor (Tensor | list[Tensor]): Input tensors.
+        out_type (numpy type): Output types. If ``np.uint8``, transform outputs
+            to uint8 type with range [0, 255]; otherwise, float type with
+            range [0, 1]. Default: ``np.uint8``.
+        min_max (tuple): min and max values for clamp.
+    Returns:
+        (Tensor | list[Tensor]): 3D ndarray of shape (H x W x C) or 2D ndarray
+        of shape (H x W).
+    """
+    condition = torch.is_tensor(tensor) or (isinstance(tensor, list) and all(
+        torch.is_tensor(t) for t in tensor))
+    if not condition:
+        raise TypeError(
+            f'tensor or list of tensors expected, got {type(tensor)}')
+
+    if torch.is_tensor(tensor):
+        tensor = [tensor]
+    result = []
+    for _tensor in tensor:
+        # Squeeze two times so that:
+        # 1. (1, 1, h, w) -> (h, w) or
+        # 3. (1, 3, h, w) -> (3, h, w) or
+        # 2. (n>1, 3/1, h, w) -> (n>1, 3/1, h, w)
+        _tensor = _tensor.squeeze(0).squeeze(0)
+        _tensor = _tensor.float().detach().cpu().clamp_(*min_max)
+        _tensor = (_tensor - min_max[0]) / (min_max[1] - min_max[0])
+        n_dim = _tensor.dim()
+        if n_dim == 4:
+            img_np = make_grid(
+                _tensor, nrow=int(math.sqrt(_tensor.size(0))),
+                normalize=False).numpy()
+            img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0))
+        elif n_dim == 3:
+            img_np = _tensor.numpy()
+            img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0))
+        elif n_dim == 2:
+            img_np = _tensor.numpy()
+        else:
+            raise ValueError('Only support 4D, 3D or 2D tensor. '
+                             f'But received with dimension: {n_dim}')
+        if out_type == np.uint8:
+            # Unlike MATLAB, numpy.unit8() WILL NOT round by default.
+            img_np = (img_np * 255.0).round()
+        img_np = img_np.astype(out_type)
+        result.append(img_np)
+    result = result[0] if len(result) == 1 else result
+    return result
+
+
+@PIPELINES.register_module(
+    Tasks.video_deinterlace, module_name=Pipelines.video_deinterlace)
+class VideoDeinterlacePipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[UNetForVideoDeinterlace, str],
+                 preprocessor=None,
+                 **kwargs):
+        """The inference pipeline for all the video deinterlace sub-tasks.
+
+        Args:
+            model (`str` or `Model` or module instance): A model instance or a model local dir
+                or a model id in the model hub.
+            preprocessor (`Preprocessor`, `optional`): A Preprocessor instance.
+            kwargs (dict, `optional`):
+                Extra kwargs passed into the preprocessor's constructor.
+
+        Example:
+            >>> from modelscope.pipelines import pipeline
+            >>> pipeline_ins = pipeline('video-deinterlace',
+                model='damo/cv_unet_video-deinterlace')
+            >>> input = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/videos/video_deinterlace_test.mp4'
+            >>> print(pipeline_ins(input)[OutputKeys.OUTPUT_VIDEO])
+        """
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        if torch.cuda.is_available():
+            self._device = torch.device('cuda')
+        else:
+            self._device = torch.device('cpu')
+
+        self.net = self.model.model
+        self.net.to(self._device)
+        self.net.eval()
+
+        logger.info('load video deinterlace model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        # input is a video file
+        video_reader = VideoReader(input)
+        inputs = []
+        for frame in video_reader:
+            inputs.append(np.flip(frame, axis=2))
+        fps = video_reader.fps
+
+        for i, img in enumerate(inputs):
+            img = torch.from_numpy(img / 255.).permute(2, 0, 1).float()
+            inputs[i] = img.unsqueeze(0)
+        inputs = torch.stack(inputs, dim=1)
+        return {'video': inputs, 'fps': fps}
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        inputs = input['video'][0]
+        frenet = self.net.frenet
+        enhnet = self.net.enhnet
+        with torch.no_grad():
+            outputs = []
+            frames = []
+            for i in range(0, inputs.size(0)):
+                frames.append(frenet(inputs[i:i + 1, ...].to(self._device)))
+                if i == 0:
+                    frames = [frames[-1]] * 2
+                    continue
+                outputs.append(enhnet(frames).cpu().unsqueeze(1))
+                frames = frames[1:]
+
+            frames.append(frames[-1])
+            outputs.append(enhnet(frames).cpu().unsqueeze(1))
+            outputs = torch.cat(outputs, dim=1)
+        return {'output': outputs, 'fps': input['fps']}
+
+    def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        output_video_path = kwargs.get('output_video', None)
+        demo_service = kwargs.get('demo_service', False)
+        if output_video_path is None:
+            output_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name
+
+        h, w = inputs['output'].shape[-2:]
+        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        video_writer = cv2.VideoWriter(output_video_path, fourcc,
+                                       inputs['fps'], (w, h))
+        for i in range(0, inputs['output'].size(1)):
+            img = tensor2img(inputs['output'][:, i, :, :, :])
+            video_writer.write(img.astype(np.uint8))
+        video_writer.release()
+
+        if demo_service:
+            assert os.system(
+                'ffmpeg -version'
+            ) == 0, 'ffmpeg is not installed correctly, please refer to https://trac.ffmpeg.org/wiki/CompilationGuide.'
+            output_video_path_for_web = output_video_path[:-4] + '_web.mp4'
+            convert_cmd = f'ffmpeg -i {output_video_path} -vcodec h264 -crf 5 {output_video_path_for_web}'
+            subprocess.call(convert_cmd, shell=True)
+            return {OutputKeys.OUTPUT_VIDEO: output_video_path_for_web}
+        else:
+            return {OutputKeys.OUTPUT_VIDEO: output_video_path}
diff --git a/modelscope/pipelines/cv/video_frame_interpolation_pipeline.py b/modelscope/pipelines/cv/video_frame_interpolation_pipeline.py
index d241b00a..f9c92769 100644
--- a/modelscope/pipelines/cv/video_frame_interpolation_pipeline.py
+++ b/modelscope/pipelines/cv/video_frame_interpolation_pipeline.py
@@ -496,8 +496,9 @@ __all__ = ['VideoFrameInterpolationPipeline']
     module_name=Pipelines.video_frame_interpolation)
 class VideoFrameInterpolationPipeline(Pipeline):
     """ Video Frame Interpolation Pipeline.
-    Example:
-    ```python
+
+    Examples:
+
     >>> from modelscope.pipelines import pipeline
     >>> from modelscope.utils.constant import Tasks
     >>> from modelscope.outputs import OutputKeys
@@ -507,7 +508,6 @@ class VideoFrameInterpolationPipeline(Pipeline):
     'damo/cv_raft_video-frame-interpolation')
     >>> result = video_frame_interpolation_pipeline(video)[OutputKeys.OUTPUT_VIDEO]
     >>> print('pipeline: the output video path is {}'.format(result))
-    ```
     """
 
     def __init__(self,
@@ -525,8 +525,11 @@ class VideoFrameInterpolationPipeline(Pipeline):
         logger.info('load video frame-interpolation done')
 
     def preprocess(self, input: Input, out_fps: float = 0) -> Dict[str, Any]:
-        # input is a video file
-        video_reader = VideoReader(input)
+        # Determine the input type
+        if isinstance(input, str):
+            video_reader = VideoReader(input)
+        elif isinstance(input, dict):
+            video_reader = VideoReader(input['video'])
         inputs = []
         for frame in video_reader:
             inputs.append(frame)
@@ -536,8 +539,15 @@ class VideoFrameInterpolationPipeline(Pipeline):
             img = torch.from_numpy(img.copy()).permute(2, 0, 1).float()
             inputs[i] = img.unsqueeze(0)
 
-        if out_fps == 0:
+        if isinstance(input, str):
             out_fps = 2 * fps
+        elif isinstance(input, dict):
+            if 'interp_ratio' in input:
+                out_fps = input['interp_ratio'] * fps
+            elif 'out_fps' in input:
+                out_fps = input['out_fps']
+            else:
+                out_fps = 2 * fps
         return {'video': inputs, 'fps': fps, 'out_fps': out_fps}
 
     def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
diff --git a/modelscope/pipelines/cv/video_human_matting_pipeline.py b/modelscope/pipelines/cv/video_human_matting_pipeline.py
index e9a05d84..98e0e809 100644
--- a/modelscope/pipelines/cv/video_human_matting_pipeline.py
+++ b/modelscope/pipelines/cv/video_human_matting_pipeline.py
@@ -5,6 +5,7 @@ from typing import Any, Dict
 import cv2
 import numpy as np
 import torch
+from moviepy.editor import ImageSequenceClip, VideoFileClip
 
 from modelscope.metainfo import Pipelines
 from modelscope.models.cv.video_human_matting import preprocess
@@ -22,10 +23,19 @@ logger = get_logger()
 class VideoHumanMattingPipeline(Pipeline):
 
     def __init__(self, model: str, **kwargs):
-        """
+        """ Video Human Matting Pipeline.
         use `model` to create a video human matting pipeline for prediction
-        Args:
-            model: model id on modelscope hub.
+
+        Example:
+
+        >>> from modelscope.pipelines import pipeline
+        >>> from modelscope.outputs import OutputKeys
+        >>> from modelscope.utils.constant import Tasks
+        >>> video_matting = pipeline(Tasks.video_human_matting, model='damo/cv_effnetv2_video-human-matting')
+        >>> result_status = video_matting({
+        'video_input_path':'https://modelscope.oss-cn-beijing.aliyuncs.com/test/videos/video_matting_test.mp4',
+        'output_path':'matting_out.mp4'})
+        >>> masks = result_status[OutputKeys.MASKS]
         """
         super().__init__(model=model, **kwargs)
         if torch.cuda.is_available():
@@ -37,16 +47,19 @@ class VideoHumanMattingPipeline(Pipeline):
     def preprocess(self, input) -> Input:
         return input
 
-    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+    def forward(self, input: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
         video_path = input['video_input_path']
-        out_path = input['output_path']
+        if 'output_path' in input:
+            out_path = input['output_path']
+        else:
+            out_path = 'output.mp4'
         video_input = cv2.VideoCapture(video_path)
         fps = video_input.get(cv2.CAP_PROP_FPS)
-        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
         success, frame = video_input.read()
         h, w = frame.shape[:2]
         scale = 512 / max(h, w)
-        video_save = cv2.VideoWriter(out_path, fourcc, fps, (w, h))
+        self.fps = fps
         masks = []
         rec = [None] * 4
         self.model = self.model.to(self.device)
@@ -58,20 +71,28 @@ class VideoHumanMattingPipeline(Pipeline):
                 frame_tensor = preprocess(frame)
                 pha, *rec = self.model.model(
                     frame_tensor.to(self.device), *rec, downsample_ratio=scale)
-                com = pha * 255
-                com = com.repeat(1, 3, 1, 1)
-                com = com[0].data.cpu().numpy().transpose(1, 2,
-                                                          0).astype(np.uint8)
-                video_save.write(com)
-                masks.append(com / 255)
+                mask = pha[0].data.cpu().numpy().transpose(1, 2, 0)
+                masks.append(mask)
                 success, frame = video_input.read()
         logger.info('matting process done')
         video_input.release()
-        video_save.release()
 
-        return {
-            OutputKeys.MASKS: masks,
+        return {OutputKeys.MASKS: masks, OutputKeys.OUTPUT_VIDEO: out_path}
+
+    def postprocess(self, inputs, **kwargs) -> Dict[str, Any]:
+        render = kwargs.get('render', False)
+        masks = inputs[OutputKeys.MASKS]
+        output_path = inputs[OutputKeys.OUTPUT_VIDEO]
+        frame_lst = []
+        for mask in masks:
+            com = (mask * 255).repeat(3, 2).astype(np.uint8)
+            frame_lst.append(com)
+        video = ImageSequenceClip(sequence=frame_lst, fps=self.fps)
+        video.write_videofile(output_path, fps=self.fps, audio=False)
+        del frame_lst
+
+        result = {
+            OutputKeys.MASKS: None if render else masks,
+            OutputKeys.OUTPUT_VIDEO: output_path
         }
-
-    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
-        return inputs
+        return result
diff --git a/modelscope/pipelines/cv/video_multi_object_tracking_pipeline.py b/modelscope/pipelines/cv/video_multi_object_tracking_pipeline.py
index 0f02413c..2d3a1ff4 100644
--- a/modelscope/pipelines/cv/video_multi_object_tracking_pipeline.py
+++ b/modelscope/pipelines/cv/video_multi_object_tracking_pipeline.py
@@ -47,10 +47,13 @@ class VideoMultiObjectTrackingPipeline(Pipeline):
         dataloader = LoadVideo(input, self.opt.img_size)
         self.tracker.set_buffer_len(dataloader.frame_rate)
 
-        results = []
+        output_boxes = []
+        output_labels = []
         output_timestamps = []
         frame_id = 0
         for i, (path, img, img0) in enumerate(dataloader):
+            output_boxex_cur = []
+            output_labels_cur = []
             output_timestamps.append(
                 timestamp_format(seconds=frame_id / dataloader.frame_rate))
             blob = torch.from_numpy(img).unsqueeze(0)
@@ -66,14 +69,20 @@ class VideoMultiObjectTrackingPipeline(Pipeline):
                         tlwh[0], tlwh[1], tlwh[0] + tlwh[2], tlwh[1] + tlwh[3]
                     ])
                     online_ids.append(tid)
-                results.append([
-                    frame_id + 1, tid, tlwh[0], tlwh[1], tlwh[0] + tlwh[2],
-                    tlwh[1] + tlwh[3]
+                output_boxex_cur.append([
+                    int(max(0, tlwh[0])),
+                    int(max(0, tlwh[1])),
+                    int(tlwh[0] + tlwh[2]),
+                    int(tlwh[1] + tlwh[3])
                 ])
+                output_labels_cur.append(tid)
+            output_boxes.append(output_boxex_cur)
+            output_labels.append(output_labels_cur)
             frame_id += 1
 
         return {
-            OutputKeys.BOXES: results,
+            OutputKeys.BOXES: output_boxes,
+            OutputKeys.LABELS: output_labels,
             OutputKeys.TIMESTAMPS: output_timestamps
         }
 
diff --git a/modelscope/pipelines/cv/video_panoptic_segmentation_pipeline.py b/modelscope/pipelines/cv/video_panoptic_segmentation_pipeline.py
new file mode 100644
index 00000000..53000fa6
--- /dev/null
+++ b/modelscope/pipelines/cv/video_panoptic_segmentation_pipeline.py
@@ -0,0 +1,138 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+import os.path as osp
+from typing import Any, Dict
+
+import cv2
+import mmcv
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.video_panoptic_segmentation.video_k_net import \
+    VideoKNet
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.video_panoptic_segmentation,
+    module_name=Pipelines.video_panoptic_segmentation)
+class VideoPanopticSegmentationPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a video panoptic segmentation pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, auto_collate=False, **kwargs)
+        logger.info(f'loading model from {model}')
+        model_path = osp.join(model, ModelFile.TORCH_MODEL_FILE)
+        config_path = osp.join(model, ModelFile.CONFIGURATION)
+        logger.info(f'loading config from {config_path}')
+        self.cfg = Config.from_file(config_path)
+        self.max_video_frames = kwargs.get('max_video_frames', 1000)
+
+        self.model = VideoKNet(model)
+        checkpoint = torch.load(
+            model_path, map_location=torch.device(self.device))
+        self.model.load_state_dict(checkpoint['state_dict'])
+        self.model = self.model.to(self.device).eval()
+        logger.info('load model done')
+
+        self.pad_size_divisor = 32
+        self.mean = np.array([123.675, 116.28, 103.53], np.float32)
+        self.std = np.array([58.395, 57.12, 57.375], np.float32)
+        self.to_rgb = False
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        if not isinstance(input, str):
+            raise TypeError(f'input should be a str,'
+                            f'  but got {type(input)}')
+        frames = []
+        img_metas = []
+        iids = []
+        cap = cv2.VideoCapture(input)
+        self.fps = cap.get(cv2.CAP_PROP_FPS)
+        self.frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
+        frame_idx = 0
+        while (cap.isOpened()):
+            ret, frame = cap.read()
+            if not ret:
+                break
+
+            if frame_idx > self.max_video_frames:
+                break
+
+            norm_frame = mmcv.imnormalize(frame, self.mean, self.std,
+                                          self.to_rgb)
+            pad_frame = mmcv.impad_to_multiple(
+                norm_frame, self.pad_size_divisor, pad_val=0)
+
+            img_meta = {}
+            img_meta['ori_shape'] = frame.shape
+            img_meta['img_shape'] = frame.shape
+            img_meta['pad_shape'] = pad_frame.shape
+            img_meta['batch_input_shape'] = pad_frame.shape[0:2]
+            img_meta['scale_factor'] = 1.0,
+            img_meta['flip'] = False
+            img_meta['flip_direction'] = None
+
+            frames.append(pad_frame)
+            img_metas.append([img_meta])
+            iids.append(frame_idx)
+
+            frame_idx += 1
+
+        result = {
+            'video_name': input,
+            'imgs': np.array(frames),
+            'img_metas': img_metas,
+            'iids': iids,
+        }
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        scores = []
+        labels = []
+        masks = []
+        boxes = []
+        track_ids = []
+        for ii in tqdm(range(len(input['iids']))):
+            img = input['imgs'][ii]
+            img_meta = input['img_metas'][ii]
+            iid = input['iids'][ii]
+
+            x = np.transpose(img, [2, 0, 1])
+            x = np.expand_dims(x, 0)
+            x = torch.from_numpy(x).to(self.device)
+            with torch.no_grad():
+                segm_results = self.model(x, img_meta, rescale=True, iid=iid)
+
+            _, _, _, vis_sem, vis_tracker, label, binary_mask, track_id, thing_bbox_for_tracking = segm_results
+            scores.append([0.99] * len(label))
+            labels.append(label)
+            masks.append(binary_mask)
+            boxes.append(thing_bbox_for_tracking)
+            track_ids.append(track_id)
+
+        output = {
+            'scores': scores,
+            'labels': labels,
+            'masks': masks,
+            'boxes': boxes,
+            'uuid': track_ids
+        }
+        return output
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/cv/video_stabilization_pipeline.py b/modelscope/pipelines/cv/video_stabilization_pipeline.py
index c0b50a04..bdf3d0fc 100644
--- a/modelscope/pipelines/cv/video_stabilization_pipeline.py
+++ b/modelscope/pipelines/cv/video_stabilization_pipeline.py
@@ -39,9 +39,8 @@ __all__ = ['VideoStabilizationPipeline']
 class VideoStabilizationPipeline(Pipeline):
     """  Video Stabilization Pipeline.
 
-    Example:
+    Examples:
 
-    ```python
     >>> import cv2
     >>> from modelscope.outputs import OutputKeys
     >>> from modelscope.pipelines import pipeline
@@ -51,7 +50,6 @@ class VideoStabilizationPipeline(Pipeline):
     >>> video_stabilization = pipeline(Tasks.video_stabilization, model='damo/cv_dut-raft_video-stabilization_base')
     >>> out_video_path = video_stabilization(test_video)[OutputKeys.OUTPUT_VIDEO]
     >>> print('Pipeline: the output video path is {}'.format(out_video_path))
-    ```
     """
 
     def __init__(self,
diff --git a/modelscope/pipelines/cv/video_super_resolution_pipeline.py b/modelscope/pipelines/cv/video_super_resolution_pipeline.py
index 87b73346..717ece43 100644
--- a/modelscope/pipelines/cv/video_super_resolution_pipeline.py
+++ b/modelscope/pipelines/cv/video_super_resolution_pipeline.py
@@ -13,8 +13,8 @@ import torch
 from torchvision.utils import make_grid
 
 from modelscope.metainfo import Pipelines
-from modelscope.models.cv.video_super_resolution import \
-    RealBasicVSRNetForVideoSR
+from modelscope.models.cv.video_super_resolution import (
+    MSRResNetLiteModel, RealBasicVSRNetForVideoSR)
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
@@ -94,7 +94,8 @@ def tensor2img(tensor, out_type=np.uint8, min_max=(0, 1)):
 class VideoSuperResolutionPipeline(Pipeline):
 
     def __init__(self,
-                 model: Union[RealBasicVSRNetForVideoSR, str],
+                 model: Union[RealBasicVSRNetForVideoSR, MSRResNetLiteModel,
+                              str],
                  preprocessor=None,
                  **kwargs):
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
diff --git a/modelscope/pipelines/cv/vision_efficient_tuning_pipeline.py b/modelscope/pipelines/cv/vision_efficient_tuning_pipeline.py
new file mode 100644
index 00000000..2e3c45cc
--- /dev/null
+++ b/modelscope/pipelines/cv/vision_efficient_tuning_pipeline.py
@@ -0,0 +1,74 @@
+# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+from typing import Any, Dict
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torchvision.transforms as transforms
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.vision_efficient_tuning,
+    module_name=Pipelines.vision_efficient_tuning)
+class VisionEfficientTuningPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a vision efficient tuning pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        Example:
+            >>> from modelscope.pipelines import pipeline
+            >>> petl_pipeline = pipeline('vision-efficient-tuning',
+                'damo/cv_vitb16_classification_vision-efficient-tuning-adapter')
+            >>> result = petl_pipeline(
+                'data/test/images/vision_efficient_tuning_test_1.png')
+            >>> print(f'Output: {result}.')
+        """
+        super().__init__(model=model, **kwargs)
+
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.model = self.model.to(self.device)
+        self.model.eval()
+        self.transform = transforms.Compose([
+            transforms.Resize(224),
+            transforms.ToTensor(),
+            transforms.Normalize(
+                mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        ])
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        img = LoadImage.convert_to_img(input)
+        data = self.transform(img).unsqueeze(0).to(self.device)
+        return data
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        with torch.no_grad():
+            results = self.model(input)
+            return results
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        scores = F.softmax(inputs, dim=1).cpu().numpy()
+        pred_scores = np.sort(scores, axis=1)[0][::-1][:5]
+        pred_labels = np.argsort(scores, axis=1)[0][::-1][:5]
+
+        result = {
+            'pred_score': [score for score in pred_scores],
+            'pred_class': [self.model.CLASSES[label] for label in pred_labels]
+        }
+
+        outputs = {
+            OutputKeys.SCORES: result['pred_score'],
+            OutputKeys.LABELS: result['pred_class']
+        }
+        return outputs
diff --git a/modelscope/pipelines/multi_modal/diffusers_wrapped/stable_diffusion/chinese_stable_diffusion_pipeline.py b/modelscope/pipelines/multi_modal/diffusers_wrapped/stable_diffusion/chinese_stable_diffusion_pipeline.py
index fa8e1f50..d1627962 100644
--- a/modelscope/pipelines/multi_modal/diffusers_wrapped/stable_diffusion/chinese_stable_diffusion_pipeline.py
+++ b/modelscope/pipelines/multi_modal/diffusers_wrapped/stable_diffusion/chinese_stable_diffusion_pipeline.py
@@ -6,7 +6,7 @@
 # and publicly available at
 # https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
 
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Optional, Union
 
 import cv2
 import numpy as np
@@ -136,8 +136,15 @@ class _DiffuersChineseStableDiffusionPipeline(StableDiffusionPipeline):
             feature_extractor=feature_extractor,
             requires_safety_checker=requires_safety_checker)
 
-    def _encode_prompt(self, prompt, device, num_images_per_prompt,
-                       do_classifier_free_guidance, negative_prompt):
+    def _encode_prompt(
+            self,
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt=None,
+            prompt_embeds: Optional[torch.FloatTensor] = None,
+            negative_prompt_embeds: Optional[torch.FloatTensor] = None):
         r"""
         Encodes the prompt into text encoder hidden states.
 
@@ -153,27 +160,43 @@ class _DiffuersChineseStableDiffusionPipeline(StableDiffusionPipeline):
             negative_prompt (`str` or `List[str]`):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                 if `guidance_scale` is less than `1`).
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
         """
-        batch_size = len(prompt) if isinstance(prompt, list) else 1
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
 
-        text_inputs = self.tokenizer(
-            text=prompt,
-            padding='max_length',
-            truncation=True,
-            max_length=52,
-            return_tensors='pt')
-        text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
-        text_embeddings = self.text_encoder(**text_inputs)
-        text_embeddings = text_embeddings[0]
+        if prompt_embeds is None:
+            text_inputs = self.tokenizer(
+                text=prompt,
+                padding='max_length',
+                truncation=True,
+                max_length=52,
+                return_tensors='pt')
+            text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
+            prompt_embeds = self.text_encoder(**text_inputs)
+            prompt_embeds = prompt_embeds[0]
+
+        prompt_embeds = prompt_embeds.to(
+            dtype=self.text_encoder.dtype, device=device)
 
         # duplicate text embeddings for each generation per prompt, using mps friendly method
-        bs_embed, seq_len, _ = text_embeddings.shape
-        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
-        text_embeddings = text_embeddings.view(
-            bs_embed * num_images_per_prompt, seq_len, -1)
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt,
+                                           seq_len, -1)
 
         # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance:
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
             uncond_tokens: List[str]
             if negative_prompt is None:
                 uncond_tokens = [''] * batch_size
@@ -198,19 +221,24 @@ class _DiffuersChineseStableDiffusionPipeline(StableDiffusionPipeline):
                 max_length=52,
                 return_tensors='pt')
             uncond_input = {k: v.to(device) for k, v in uncond_input.items()}
-            uncond_embeddings = self.text_encoder(**uncond_input)
-            uncond_embeddings = uncond_embeddings[0]
+            negative_prompt_embeds = self.text_encoder(**uncond_input)
+            negative_prompt_embeds = negative_prompt_embeds[0]
 
+        if do_classifier_free_guidance:
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.repeat(
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(
+                dtype=self.text_encoder.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(
                 1, num_images_per_prompt, 1)
-            uncond_embeddings = uncond_embeddings.view(
+            negative_prompt_embeds = negative_prompt_embeds.view(
                 batch_size * num_images_per_prompt, seq_len, -1)
 
             # For classifier free guidance, we need to do two forward passes.
             # Here we concatenate the unconditional and text embeddings into a single batch
             # to avoid doing two forward passes
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
 
-        return text_embeddings
+        return prompt_embeds
diff --git a/modelscope/pipelines/multi_modal/document_vl_embedding_pipeline.py b/modelscope/pipelines/multi_modal/document_vl_embedding_pipeline.py
index 754d2d2b..05e08f70 100644
--- a/modelscope/pipelines/multi_modal/document_vl_embedding_pipeline.py
+++ b/modelscope/pipelines/multi_modal/document_vl_embedding_pipeline.py
@@ -30,8 +30,8 @@ class DocumentVLEmbeddingPipeline(Pipeline):
             model: model id on modelscope hub.
             preprocessor: type `Preprocessor`. If None, `VLDocPreprocessor` is used.
 
-        Example:
-        ```python
+        Examples:
+
         >>> from modelscope.models import Model
         >>> from modelscope.pipelines import pipeline
         >>> model = Model.from_pretrained(
@@ -42,7 +42,6 @@ class DocumentVLEmbeddingPipeline(Pipeline):
                 'ocr_info_paths': ['data/demo.json']
             }
         >>> result = doc_VL_emb_pipeline(inp)
-        ```
         """
 
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
diff --git a/modelscope/pipelines/multi_modal/gridvlp_pipeline.py b/modelscope/pipelines/multi_modal/gridvlp_pipeline.py
new file mode 100644
index 00000000..1b3a71a6
--- /dev/null
+++ b/modelscope/pipelines/multi_modal/gridvlp_pipeline.py
@@ -0,0 +1,276 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os.path as osp
+import time
+import traceback
+from typing import Any, Dict, Optional
+
+import json
+import numpy as np
+import torch
+from PIL import Image
+from transformers import BertTokenizer
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.metainfo import Pipelines
+from modelscope.pipelines import Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors.image import load_image
+from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Frameworks,
+                                       Invoke, Tasks)
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def cost(end, begin):
+    return '{:.2f}ms'.format((end - begin) * 1000)
+
+
+class Config:
+    SCALE = 1 / 255.0
+    MEAN = np.require([0.485, 0.456, 0.406], dtype=np.float32)[:, np.newaxis,
+                                                               np.newaxis]
+    STD = np.require([0.229, 0.224, 0.225], dtype=np.float32)[:, np.newaxis,
+                                                              np.newaxis]
+
+    # RESIZE_HEIGHT = int(224*1.14)
+    RESIZE_HEIGHT = int(256)
+    # RESIZE_WIDTH = int(224*1.14)
+    RESIZE_WIDTH = int(256)
+    CROP_SIZE = 224
+
+
+def pre_processor(img):
+    img = img.convert('RGB')
+
+    w, h = img.size
+    if (w <= h and w == Config.RESIZE_WIDTH) \
+            or (h <= w and h == Config.RESIZE_WIDTH):
+        img = img
+    if w < h:
+        ow = Config.RESIZE_WIDTH
+        oh = int(Config.RESIZE_WIDTH * h / w)
+        img = img.resize((ow, oh), Image.BILINEAR)
+    else:
+        oh = Config.RESIZE_WIDTH
+        ow = int(Config.RESIZE_WIDTH * w / h)
+        img = img.resize((ow, oh), Image.BILINEAR)
+    w, h = img.size
+    crop_top = int(round((h - Config.CROP_SIZE) / 2.))
+    crop_left = int(round((w - Config.CROP_SIZE) / 2.))
+    img = img.crop((crop_left, crop_top, crop_left + Config.CROP_SIZE,
+                    crop_top + Config.CROP_SIZE))
+    _img = np.array(img, dtype=np.float32)
+    _img = np.require(_img.transpose((2, 0, 1)), dtype=np.float32)
+    _img *= Config.SCALE
+    _img -= Config.MEAN
+    _img /= Config.STD
+    return _img
+
+
+class GridVlpPipeline(Pipeline):
+    """ Pipeline for gridvlp, including classification and embedding."""
+
+    def __init__(self, model_name_or_path: str, **kwargs):
+        """ Pipeline for gridvlp, including classification and embedding.
+        Args:
+            model: path to local model directory.
+        """
+        # download model from modelscope to local model dir
+        logger.info(f'load checkpoint from modelscope {model_name_or_path}')
+        if osp.exists(model_name_or_path):
+            local_model_dir = model_name_or_path
+        else:
+            invoked_by = '%s/%s' % (Invoke.KEY, Invoke.PIPELINE)
+            local_model_dir = snapshot_download(
+                model_name_or_path,
+                DEFAULT_MODEL_REVISION,
+                user_agent=invoked_by)
+        self.local_model_dir = local_model_dir
+
+        # load model from cpu and torch jit model
+        logger.info(f'load model from {local_model_dir}')
+        self.model = torch.jit.load(
+            osp.join(local_model_dir, 'pytorch_model.pt'))
+        self.framework = Frameworks.torch
+        self.device_name = 'cpu'
+        self._model_prepare = True
+        self._auto_collate = False
+
+        # load tokenizer
+        logger.info(f'load tokenizer from {local_model_dir}')
+        self.tokenizer = BertTokenizer.from_pretrained(local_model_dir)
+
+    def preprocess(self, inputs: Dict[str, Any], max_seq_length=49):
+        # fetch input params
+        image = inputs.get('image', '')
+        text = inputs.get('text', '')
+
+        s1 = time.time()
+
+        # download image and preprocess
+        try:
+            # load PIL image
+            img = load_image(image)
+            s2 = time.time()
+
+            # image preprocess
+            image_data = pre_processor(img)
+            s3 = time.time()
+
+        except Exception:
+            image_data = np.zeros((3, 224, 224), dtype=np.float32)
+            s2 = time.time()
+            s3 = time.time()
+            logger.info(traceback.print_exc())
+
+        # text process
+        if text is None or text.isspace() or not text.strip():
+            logger.info('text is empty!')
+            text = ''
+        inputs = self.tokenizer(
+            text,
+            padding='max_length',
+            truncation=True,
+            max_length=max_seq_length)
+
+        s4 = time.time()
+
+        logger.info(f'example. text: {text} image: {image}')
+        logger.info(
+            f'preprocess. Img_Download:{cost(s2, s1)}, Img_Pre:{cost(s3, s2)}, Txt_Pre:{cost(s4, s3)}'
+        )
+
+        input_dict = {
+            'image': image_data,
+            'input_ids': inputs['input_ids'],
+            'input_mask': inputs['attention_mask'],
+            'segment_ids': inputs['token_type_ids']
+        }
+        return input_dict
+
+
+@PIPELINES.register_module(
+    Tasks.visual_question_answering,
+    module_name=Pipelines.gridvlp_multi_modal_classification)
+class GridVlpClassificationPipeline(GridVlpPipeline):
+    """ Pipeline for gridvlp classification, including cate classfication and
+    brand classification.
+
+    Example:
+
+    ```python
+    >>> from modelscope.pipelines.multi_modal.gridvlp_pipeline import \
+    GridVlpClassificationPipeline
+
+    >>> pipeline = GridVlpClassificationPipeline('rgtjf1/multi-modal_gridvlp_classification_chinese-base-ecom-cate')
+    >>> output = pipeline({'text': '女装快干弹力轻型短裤448575',\
+        'image':'https://yejiabo-public.oss-cn-zhangjiakou.aliyuncs.com/alinlp/clothes.png'})
+    >>> output['text'][0]
+    {'label': {'cate_name': '休闲裤', 'cate_path': '女装>>裤子>>休闲裤>>休闲裤'}, 'score': 0.4146, 'rank': 0}
+
+    ```
+    """
+
+    def __init__(self, model_name_or_path: str, **kwargs):
+        """ Pipeline for gridvlp classification, including cate classfication and
+    brand classification.
+        Args:
+            model: path to local model directory.
+        """
+        super().__init__(model_name_or_path, **kwargs)
+
+        # load label mapping
+        logger.info(f'load label mapping from {self.local_model_dir}')
+        self.label_mapping = json.load(
+            open(osp.join(self.local_model_dir, 'label_mapping.json')))
+
+    def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        s4 = time.time()
+
+        box_tensor = torch.zeros(1, dtype=torch.float32)
+
+        output = self.model(
+            torch.tensor(inputs['image']).unsqueeze(0),
+            box_tensor.unsqueeze(0),
+            torch.tensor(inputs['input_ids'], dtype=torch.long).unsqueeze(0),
+            torch.tensor(inputs['input_mask'], dtype=torch.long).unsqueeze(0),
+            torch.tensor(inputs['segment_ids'], dtype=torch.long).unsqueeze(0))
+        output = output[0].detach().numpy()
+
+        s5 = time.time()
+
+        logger.info(f'forward. Infer:{cost(s5, s4)}')
+
+        # 返回结果
+        return output
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        s5 = time.time()
+        output = inputs
+        index = np.argsort(-output)
+        out_sort = output[index]
+
+        top_k = []
+        for i in range(min(10, len(self.label_mapping))):
+            label = self.label_mapping[str(index[i])]
+            top_k.append({
+                'label': label,
+                'score': round(float(out_sort[i]), 4),
+                'rank': i
+            })
+
+        s6 = time.time()
+        logger.info(f'postprocess. Post: {cost(s6, s5)}')
+        return {'text': top_k}
+
+
+@PIPELINES.register_module(
+    Tasks.multi_modal_embedding,
+    module_name=Pipelines.gridvlp_multi_modal_embedding)
+class GridVlpEmbeddingPipeline(GridVlpPipeline):
+    """ Pipeline for gridvlp embedding. These only generate unified multi-modal
+    embeddings and output it in `text_embedding` or `img_embedding`.
+
+    Example:
+
+    ```python
+    >>> from modelscope.pipelines.multi_modal.gridvlp_pipeline import \
+    GridVlpEmbeddingPipeline
+
+    >>> pipeline = GridVlpEmbeddingPipeline('rgtjf1/multi-modal_gridvlp_classification_chinese-base-ecom-embedding')
+    >>> outputs = pipeline({'text': '女装快干弹力轻型短裤448575',\
+        'image':'https://yejiabo-public.oss-cn-zhangjiakou.aliyuncs.com/alinlp/clothes.png'})
+    >>> outputs["text_embedding"].shape
+    (768,)
+
+    ```
+    """
+
+    def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        s4 = time.time()
+
+        box_tensor = torch.zeros(1, dtype=torch.float32)
+
+        output = self.model(
+            torch.tensor(inputs['image']).unsqueeze(0),
+            box_tensor.unsqueeze(0),
+            torch.tensor(inputs['input_ids'], dtype=torch.long).unsqueeze(0),
+            torch.tensor(inputs['input_mask'], dtype=torch.long).unsqueeze(0),
+            torch.tensor(inputs['segment_ids'], dtype=torch.long).unsqueeze(0))
+        s5 = time.time()
+
+        output = output[0].detach().numpy()
+
+        s6 = time.time()
+        logger.info(f'forward. Infer:{cost(s5, s4)}, Post: {cost(s6, s5)}')
+        # 返回结果
+        return output
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        outputs = {
+            'img_embedding': inputs,
+            'text_embedding': inputs,
+        }
+        return outputs
diff --git a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py
index 0a16c58f..fbab88fd 100644
--- a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py
+++ b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py
@@ -10,7 +10,7 @@ from modelscope.pipelines.builder import PIPELINES
 from modelscope.pipelines.util import batch_process
 from modelscope.preprocessors import (MPlugPreprocessor, OfaPreprocessor,
                                       Preprocessor)
-from modelscope.utils.constant import Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -31,7 +31,10 @@ class ImageCaptioningPipeline(Pipeline):
         """
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
         self.model.eval()
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
         if preprocessor is None:
+
             if isinstance(self.model, OfaForAllTasks):
                 self.preprocessor = OfaPreprocessor(self.model.model_dir)
             elif isinstance(self.model, MPlugForAllTasks):
diff --git a/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py b/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py
index 09be8265..d1037b24 100644
--- a/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py
+++ b/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py
@@ -8,7 +8,7 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import MPlugPreprocessor, Preprocessor
-from modelscope.utils.constant import Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -30,6 +30,8 @@ class ImageTextRetrievalPipeline(Pipeline):
         """
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
         self.model.eval()
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
         if preprocessor is None:
             self.preprocessor = MPlugPreprocessor(self.model.model_dir)
 
diff --git a/modelscope/pipelines/multi_modal/mgeo_ranking_pipeline.py b/modelscope/pipelines/multi_modal/mgeo_ranking_pipeline.py
index da6c0d2f..a959ee09 100644
--- a/modelscope/pipelines/multi_modal/mgeo_ranking_pipeline.py
+++ b/modelscope/pipelines/multi_modal/mgeo_ranking_pipeline.py
@@ -11,7 +11,7 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import Preprocessor
-from modelscope.utils.constant import Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 
 __all__ = ['MGeoRankingPipeline']
 
@@ -46,6 +46,8 @@ class MGeoRankingPipeline(Pipeline):
             device=device,
             auto_collate=auto_collate)
 
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
         if preprocessor is None:
             self.preprocessor = Preprocessor.from_pretrained(
                 self.model.model_dir,
diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py
index 8fcf3e3f..b4ea93be 100644
--- a/modelscope/pipelines/nlp/__init__.py
+++ b/modelscope/pipelines/nlp/__init__.py
@@ -17,6 +17,7 @@ if TYPE_CHECKING:
     from .feature_extraction_pipeline import FeatureExtractionPipeline
     from .fill_mask_pipeline import FillMaskPipeline
     from .information_extraction_pipeline import InformationExtractionPipeline
+    from .interactive_translation_pipeline import InteractiveTranslationPipeline
     from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline
     from .text_ranking_pipeline import TextRankingPipeline
     from .sentence_embedding_pipeline import SentenceEmbeddingPipeline
@@ -25,6 +26,7 @@ if TYPE_CHECKING:
     from .translation_quality_estimation_pipeline import TranslationQualityEstimationPipeline
     from .text_error_correction_pipeline import TextErrorCorrectionPipeline
     from .text_generation_pipeline import TextGenerationPipeline, TextGenerationT5Pipeline
+    from .fid_dialogue_pipeline import FidDialoguePipeline
     from .token_classification_pipeline import TokenClassificationPipeline
     from .translation_pipeline import TranslationPipeline
     from .word_segmentation_pipeline import WordSegmentationPipeline, WordSegmentationThaiPipeline
@@ -34,6 +36,10 @@ if TYPE_CHECKING:
     from .codegeex_code_generation_pipeline import CodeGeeXCodeGenerationPipeline
     from .translation_evaluation_pipeline import TranslationEvaluationPipeline
     from .user_satisfaction_estimation_pipeline import UserSatisfactionEstimationPipeline
+    from .siamese_uie_pipeline import SiameseUiePipeline
+    from .document_grounded_dialog_generate_pipeline import DocumentGroundedDialogGeneratePipeline
+    from .document_grounded_dialog_retrieval_pipeline import DocumentGroundedDialogRetrievalPipeline
+    from .document_grounded_dialog_rerank_pipeline import DocumentGroundedDialogRerankPipeline
 
 else:
     _import_structure = {
@@ -44,7 +50,7 @@ else:
         ['DialogIntentPredictionPipeline'],
         'dialog_modeling_pipeline': ['DialogModelingPipeline'],
         'dialog_state_tracking_pipeline': ['DialogStateTrackingPipeline'],
-        'domain_classification_pipeline':
+        'fasttext_text_classification_pipeline':
         ['FasttextSequenceClassificationPipeline'],
         'document_segmentation_pipeline': ['DocumentSegmentationPipeline'],
         'extractive_summarization_pipeline':
@@ -53,6 +59,7 @@ else:
         'feature_extraction_pipeline': ['FeatureExtractionPipeline'],
         'fill_mask_pipeline': ['FillMaskPipeline'],
         'information_extraction_pipeline': ['InformationExtractionPipeline'],
+        'interactive_translation_pipeline': ['InteractiveTranslationPipeline'],
         'named_entity_recognition_pipeline': [
             'NamedEntityRecognitionPipeline',
         ],
@@ -65,7 +72,7 @@ else:
         'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'],
         'text_generation_pipeline':
         ['TextGenerationPipeline', 'TextGenerationT5Pipeline'],
-        'text2text_generation_pipeline': ['Text2TextGenerationPipeline'],
+        'fid_dialogue_pipeline': ['FidDialoguePipeline'],
         'token_classification_pipeline': ['TokenClassificationPipeline'],
         'translation_pipeline': ['TranslationPipeline'],
         'translation_quality_estimation_pipeline':
@@ -81,7 +88,17 @@ else:
         ['CodeGeeXCodeGenerationPipeline'],
         'translation_evaluation_pipeline': ['TranslationEvaluationPipeline'],
         'user_satisfaction_estimation_pipeline':
-        ['UserSatisfactionEstimationPipeline']
+        ['UserSatisfactionEstimationPipeline'],
+        'siamese_uie_pipeline': ['SiameseUiePipeline'],
+        'document_grounded_dialog_generate_pipeline': [
+            'DocumentGroundedDialogGeneratePipeline'
+        ],
+        'document_grounded_dialog_rerank_pipeline': [
+            'DocumentGroundedDialogRerankPipeline'
+        ],
+        'document_grounded_dialog_retrieval_pipeline': [
+            'DocumentGroundedDialogRetrievalPipeline'
+        ]
     }
 
     import sys
diff --git a/modelscope/pipelines/nlp/document_grounded_dialog_generate_pipeline.py b/modelscope/pipelines/nlp/document_grounded_dialog_generate_pipeline.py
new file mode 100644
index 00000000..5fc1a193
--- /dev/null
+++ b/modelscope/pipelines/nlp/document_grounded_dialog_generate_pipeline.py
@@ -0,0 +1,68 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict, Union
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.models.nlp import DocumentGroundedDialogGenerateModel
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import DocumentGroundedDialogGeneratePreprocessor
+from modelscope.utils.constant import Tasks
+
+__all__ = ['DocumentGroundedDialogGeneratePipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.document_grounded_dialog_generate,
+    module_name=Pipelines.document_grounded_dialog_generate)
+class DocumentGroundedDialogGeneratePipeline(Pipeline):
+
+    def __init__(
+            self,
+            model: Union[DocumentGroundedDialogGenerateModel, str],
+            preprocessor: DocumentGroundedDialogGeneratePreprocessor = None,
+            config_file: str = None,
+            device: str = 'gpu',
+            auto_collate=True,
+            **kwargs):
+        """The Generate pipeline for document grounded dialog
+
+        Args:
+            model: A model instance or a model local dir or a model id in the model hub.
+            preprocessor: A preprocessor instance.
+            config_file: Path to config file.
+            device: Device to run the model.
+            auto_collate: Apply auto collate.
+            **kwargs: The preprocessor kwargs passed into the preprocessor's constructor.
+
+        Examples:
+            >>> from modelscope.pipelines import pipeline
+            >>> pipe_ins = pipeline('document-grounded-dialog-generate', model='damo/nlp_convai_generate')
+        """
+        super().__init__(
+            model=model,
+            preprocessor=preprocessor,
+            config_file=config_file,
+            device=device,
+            auto_collate=auto_collate)
+
+        if preprocessor is None:
+            self.preprocessor = DocumentGroundedDialogGeneratePreprocessor(
+                self.model.model_dir, **kwargs)
+
+    def forward(self, inputs: Union[list, Dict[str, Any]],
+                **forward_params) -> Dict[str, Any]:
+        return {'generated_ids': self.model.generate(inputs)}
+
+    def postprocess(self, inputs: Union[list, Dict[str, Any]],
+                    **postprocess_params) -> Dict[str, Any]:
+        predictions = self.preprocessor.generation_tokenizer.batch_decode(
+            inputs['generated_ids'],
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False)
+        return {OutputKeys.TEXT: predictions}
+
+    def _collate_fn(self, data):
+        return data
diff --git a/modelscope/pipelines/nlp/document_grounded_dialog_rerank_pipeline.py b/modelscope/pipelines/nlp/document_grounded_dialog_rerank_pipeline.py
new file mode 100644
index 00000000..d72366e9
--- /dev/null
+++ b/modelscope/pipelines/nlp/document_grounded_dialog_rerank_pipeline.py
@@ -0,0 +1,754 @@
+import os
+import pprint
+import random
+import re
+import sys
+import time
+from collections import OrderedDict, defaultdict
+from typing import Any, Dict, Iterable, List, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import transformers
+import ujson as json
+from torch import nn
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.models.nlp import DocumentGroundedDialogRerankModel
+from modelscope.models.nlp.ponet.configuration import PoNetConfig
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import DocumentGroundedDialogRerankPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['DocumentGroundedDialogRerankPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.document_grounded_dialog_rerank,
+    module_name=Pipelines.document_grounded_dialog_rerank)
+class DocumentGroundedDialogRerankPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[DocumentGroundedDialogRerankModel, str],
+                 preprocessor: DocumentGroundedDialogRerankPreprocessor = None,
+                 config_file: str = None,
+                 device: str = 'cuda',
+                 auto_collate=True,
+                 seed: int = 88,
+                 **kwarg):
+        """The Rerank pipeline for document grounded dialog
+
+        Args:
+            model: A model instance or a model local dir or a model id in the model hub.
+            preprocessor: A preprocessor instance.
+            config_file: Path to config file.
+            device: Device to run the model.
+            auto_collate: Apply auto collate.
+            seed: Random seeds of random parameters.
+            **kwargs: The preprocessor kwargs passed into the preprocessor's constructor.
+
+        Examples:
+            >>> from modelscope.pipelines import pipeline
+            >>> pipe_ins = pipeline('document_grounded_dialog_rerank', model='damo/nlp_convai_rerank')
+        """
+        super().__init__(
+            model=model,
+            preprocessor=preprocessor,
+            config_file=config_file,
+            device=device,
+            auto_collate=auto_collate,
+            seed=seed)
+        self.model = model
+        self.preprocessor = preprocessor
+        self.device = device
+        if kwarg['model_resize']:
+            self.model.resize_token_embeddings(
+                len(self.preprocessor.tokenizer))
+        self.model.to(self.device)
+        self.model.eval()
+        self.args = kwarg
+        # self.model_cfg = self.model.model_cfg
+        set_seed(seed)
+
+    def one_instance(self, input_ids, attention_mask):
+        all_probs = []
+        for start_ndx in range(0, len(input_ids), self.args['max_batch_size']):
+            probs = F.softmax(
+                self.model({
+                    'input_ids':
+                    input_ids[start_ndx:start_ndx
+                              + self.args['max_batch_size']],
+                    'attention_mask':
+                    attention_mask[start_ndx:start_ndx
+                                   + self.args['max_batch_size']]
+                }).logits.detach().cpu(),
+                dim=-1)[:, 1].numpy().tolist()
+            all_probs.extend(probs)
+        return all_probs
+
+    def forward(self, dataset: Union[list, Dict[str, Any]],
+                **forward_params) -> Dict[str, Any]:
+        report = Reporting()
+        self.guess = []
+        with torch.no_grad():
+            for jobj in dataset:
+                inst_id = jobj['id']
+                probs = self.one_instance(jobj['input_ids'],
+                                          jobj['attention_mask'])
+                passages = jobj['passages']
+                query = jobj['query']
+                scored_pids = [(p['pid'], prob)
+                               for p, prob in zip(passages, probs)]
+                scored_pids.sort(key=lambda x: x[1], reverse=True)
+                wids = to_distinct_doc_ids([
+                    pid for pid, prob in scored_pids
+                ])  # convert to Wikipedia document ids
+                pred_record = {
+                    'id':
+                    inst_id,
+                    'input':
+                    query,
+                    'scored_pids':
+                    scored_pids,
+                    'output': [{
+                        'answer':
+                        '',
+                        'provenance': [{
+                            'wikipedia_id': wid
+                        } for wid in wids]
+                    }]
+                }
+                if self.args['include_passages']:
+                    pred_record['passages'] = passages
+
+                if report.is_time():
+                    print(
+                        f'Finished {report.check_count}; {report.check_count / report.elapsed_seconds()} per second.'
+                    )
+                self.guess.append(pred_record)
+        # if args['kilt_data']:
+        #     evaluate(dataset, args['output'])
+
+    def postprocess(self, inputs: list):
+        return {OutputKeys.OUTPUT: inputs}
+
+
+class Reporting:
+
+    def __init__(self,
+                 *,
+                 recency_weight=0.001,
+                 report_interval_secs=300,
+                 check_every=1,
+                 gather_samples: Iterable = (),
+                 num_samples=10000):
+        """The Reporting to print parameter status
+
+        Args:
+            recency_weight: when computing the moving average, how much weight to give to the current sample.
+            report_interval_secs: how many seconds between returning true for is_time.
+            check_every: how often to check the time, when calling is_time.
+            gather_samples: keep the last num_samples of the listed names (gathered from moving_averages).
+            num_samples: how many samples to keep.
+        """
+        self.check_count = 0
+        self.check_every = check_every
+        self.start_time = time.time()
+        self.last_time = self.start_time
+        self.report_interval_secs = report_interval_secs
+        # For tracking moving averages of various values
+        self.names = None
+        self.averages = None
+        self.counts = None
+        self.recency_weight = recency_weight
+        self.per_value_recency_weight = dict()
+        self.report_count = 0
+        self._prev_check_count = 0
+        self.sample_names = list(gather_samples)
+        if len(self.sample_names) > 0:
+            self.sample_values = np.zeros(
+                (len(self.sample_names), num_samples), dtype=np.float32)
+            self.sample_ndxs = np.zeros(len(self.sample_names), dtype=np.int32)
+        else:
+            self.sample_values = None
+            self.sample_ndxs = None
+
+    def reset(self):
+        self.check_count = 0
+        self.start_time = time.time()
+        self.last_time = self.start_time
+        self.report_count = 0
+        self._prev_check_count = 0
+        if len(self.sample_names) > 0:
+            self.sample_values[:, :] = 0
+            self.sample_ndxs[:] = 0
+        if self.counts is not None:
+            self.counts[:] = 0
+            self.averages[:] = 0
+
+    def is_time(self):
+        self.check_count += 1
+        if self.check_count % self.check_every == 0:
+            elapsed = time.time() - self.last_time
+            if elapsed >= self.report_interval_secs:
+                # check the time more or less often
+                if self.check_every > 1 and self.check_count - self._prev_check_count < 5 * self.check_every:
+                    self.check_every //= 2
+                elif self.check_count - self._prev_check_count > 50 * self.check_every:
+                    self.check_every *= 2
+                self.last_time = time.time()
+                self.report_count += 1
+                self._prev_check_count = self.check_count
+                return True
+        return False
+
+    def moving_averages(self, **values):
+        # create entries in avgs and counts when needed
+        # update the avgs and counts
+        if self.names is None:
+            self.names = list(values.keys())
+            self.averages = np.zeros(len(self.names))
+            self.counts = np.zeros(len(self.names))
+        for name in values.keys():
+            if name not in self.names:
+                self.names.append(name)
+        if self.averages.shape[0] < len(self.names):
+            old_len = self.averages.shape[0]
+            self.averages = np.resize(self.averages, len(self.names))
+            self.averages[old_len:] = 0
+            self.counts = np.resize(self.counts, len(self.names))
+            self.counts[old_len:] = 0
+        for ndx, name in enumerate(self.names):
+            if name in values:
+                self.counts[ndx] += 1
+                # support per-name recency_weight
+                if name in self.per_value_recency_weight:
+                    rweight = max(self.per_value_recency_weight[name],
+                                  1.0 / self.counts[ndx])
+                else:
+                    rweight = max(self.recency_weight, 1.0 / self.counts[ndx])
+                self.averages[ndx] = rweight * values[name] + (
+                    1.0 - rweight) * self.averages[ndx]
+        for ndx, name in enumerate(self.sample_names):
+            if name in values:
+                self.sample_values[self.sample_ndxs[ndx]] = values[name]
+                self.sample_ndxs[ndx] = (self.sample_ndxs[ndx]
+                                         + 1) % self.sample_values.shape[1]
+
+    def get_samples(self, name):
+        for ndx, n in enumerate(self.sample_names):
+            if n == name:
+                count = self.get_count(name)
+                if count is None:
+                    count = 0
+                return self.sample_values[ndx, 0:count]  # NOTE: not in order
+        return None
+
+    def get_moving_average(self, name):
+        if self.names is None:
+            return None
+        for ndx, n in enumerate(self.names):
+            if n == name:
+                return self.averages[ndx]
+        return None
+
+    def get_count(self, name):
+        if self.names is None:
+            return None
+        for ndx, n in enumerate(self.names):
+            if n == name:
+                return self.counts[ndx]
+        return None
+
+    def elapsed_seconds(self) -> float:
+        return time.time() - self.start_time
+
+    def elapsed_time_str(self) -> str:
+        return time_str(self.elapsed_seconds())
+
+    def progress_str(self, instance_name='instance'):
+        return f'On {instance_name} {self.check_count}, ' \
+               f'{self.check_count / self.elapsed_seconds()} {instance_name}s per second.'
+
+    def display(self, *, prefix=''):
+        # display the moving averages
+        logger.info('==========================================')
+        if self.names is not None:
+            for n, v in zip(self.names, self.averages):
+                logger.info(f'{prefix}{n} = {v}')
+
+    def display_warn(self, *, prefix=''):
+        # display the moving averages
+        logger.info('==========================================')
+        if self.names is not None:
+            for n, v in zip(self.names, self.averages):
+                logger.warning(f'{prefix}{n} = {v}')
+
+
+def _remove_duplicates(obj):
+    obj_tmp = []
+    for o in obj:
+        if o not in obj_tmp:
+            obj_tmp.append(o)
+    return obj_tmp
+
+
+def _get_ids_list(datapoint, rank_keys, verbose=False):
+    # collect all gold ids
+    ids_list = []
+    for output in datapoint['output']:
+        current_ids_list = []
+        if 'provenance' in output:
+            for provenance in output['provenance']:
+                if any(rank_key not in provenance for rank_key in rank_keys):
+                    missing = set(rank_keys) - set(list(
+                        provenance.keys())).intersection(set(rank_keys))
+                    if verbose:
+                        print(
+                            f'WARNING: missing key(s) {missing} in provenance, unable to compute retrieval for those.'
+                        )
+                else:
+                    current_ids_list.append('+'.join([
+                        str(provenance[rank_key]).strip()
+                        for rank_key in rank_keys
+                    ]))
+        ids_list.append(
+            _remove_duplicates(current_ids_list))  # remove duplicates
+
+    # consider only unique ids
+    return ids_list
+
+
+def _computeRprec(guess_ids, gold_ids):
+    R = len(gold_ids)
+    num = 0
+
+    for prediction in guess_ids[:R]:
+        if str(prediction).strip() in gold_ids:
+            num += 1
+
+    Rprec = num / R if R > 0 else 0
+    return Rprec
+
+
+# 1. Precision computation
+def _precision_at_k(rank, k):
+    # precision @ k
+    p = rank[:k].count(True) / k
+
+    return p
+
+
+# 2. Recall computation
+def _recall_at_k(rank, num_distinct_evidence_sets, k):
+    r = rank[:k].count(True) / num_distinct_evidence_sets
+
+    return r
+
+
+# 3. Success rate computation
+def _success_rate_at_k(rank, k):
+    # success rate @ k
+    p = int(True in rank[:k])
+
+    return p
+
+
+def get_rank(guess_item, gold_item, k, rank_keys, verbose=False):
+    """
+    The main idea is to consider each evidence set as a single point in the rank.
+    The score in the rank for an evidence set is given by the lowest scored evidence in the set.
+    """
+
+    assert k > 0, 'k must be a positive integer grater than 0.'
+
+    rank = []
+    num_distinct_evidence_sets = 0
+
+    guess_ids = _get_ids_list(guess_item, rank_keys)[0]
+
+    if guess_ids and len(guess_ids) > 0:
+
+        # 1. collect evidence sets and their sizes
+        evidence_sets = []
+        e_size = defaultdict(int)
+        for output in gold_item['output']:
+            if 'provenance' in output:
+                e_set = {
+                    '+'.join([
+                        str(provenance[rank_key]).strip()
+                        for rank_key in rank_keys
+                    ])
+                    for provenance in output['provenance']
+                }
+                if e_set not in evidence_sets:  # no duplicate evidence set
+                    evidence_sets.append(e_set)
+                    e_size[len(e_set)] += 1
+        num_distinct_evidence_sets = len(evidence_sets)
+
+        # 2. check what's the minimum number of predicted pages needed to get a robust P/R@k
+        min_prediction_size = 0
+        c = 0
+        for size, freq in sorted(e_size.items(), reverse=True):
+            for _ in range(freq):
+                min_prediction_size += size
+                c += 1
+                if c == k:
+                    break
+            if c == k:
+                break
+        # if the number of evidence sets is smaller than k
+        min_prediction_size += k - c
+
+        if verbose and len(guess_ids) < min_prediction_size:
+            print(
+                f'WARNING: you should provide at least {min_prediction_size} provenance items '
+                f'for a robust recall@{k} computation (you provided {len(guess_ids)} item(s)).'
+            )
+
+        # 3. rank by gruping pages in each evidence set (each evidence set count as 1),
+        # the position in the rank of each evidence set is given by the last page in guess_ids
+        # non evidence pages counts as 1
+        rank = []
+        for guess_id in guess_ids:
+            guess_id = str(guess_id).strip()
+            found = False
+            for idx, e_set in enumerate(evidence_sets):
+
+                e_set_id = f'evidence_set:{idx}'
+
+                if guess_id in e_set:
+                    found = True
+
+                    # remove from the rank previous points referring to this evidence set
+                    if e_set_id in rank:
+                        rank.remove(e_set_id)
+
+                    # remove the guess_id from the evidence set
+                    e_set.remove(guess_id)
+
+                    if len(e_set) == 0:
+                        # it was the last evidence, it counts as true in the rank
+                        rank.append(True)
+                    else:
+                        # add a point for this partial evidence set
+                        rank.append(e_set_id)
+
+            if not found:
+                rank.append(False)
+
+    return rank, num_distinct_evidence_sets
+
+
+def set_seed(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+
+
+def load_data(filename):
+    data = []
+    file_in = open(filename, 'r')
+    lines = file_in.readlines()
+    for line in lines:
+        data.append(json.loads(line))
+    return data
+
+
+def rprecision(guess_item, gold_item, rank_keys):
+    gold_ids_list = _get_ids_list(gold_item, rank_keys)
+    guess_ids = _get_ids_list(guess_item, rank_keys)[0]
+    Rprec_vector = []
+    for gold_ids in gold_ids_list:
+        Rprec = _computeRprec(guess_ids, gold_ids)
+        Rprec_vector.append(Rprec)
+    return max(Rprec_vector)
+
+
+def get_ranking_metrics(guess_item, gold_item, ks, rank_keys):
+    Rprec = 0
+    P_at_k = {'precision@{}'.format(k): 0 for k in sorted(ks) if k > 0}
+    R_at_k = {'recall@{}'.format(k): 0 for k in sorted(ks) if k > 1}
+    S_at_k = {'success_rate@{}'.format(k): 0 for k in sorted(ks) if k > 1}
+
+    assert (
+        'output' in guess_item and len(guess_item['output']) == 1
+    ), f"guess should provide exactly one output for {guess_item['id']}"
+
+    Rprec = rprecision(guess_item, gold_item, rank_keys=rank_keys)
+    for k in ks:
+
+        # 0. get rank
+        rank, num_distinct_evidence_sets = get_rank(
+            guess_item, gold_item, k, rank_keys=rank_keys)
+
+        if num_distinct_evidence_sets > 0:
+            # 1. precision
+            P_at_k['precision@{}'.format(k)] = _precision_at_k(rank, k)
+
+            # 2. recall
+            R_at_k['recall@{}'.format(k)] = _recall_at_k(
+                rank, num_distinct_evidence_sets, k)
+
+            # 3. success rate
+            S_at_k['success_rate@{}'.format(k)] = _success_rate_at_k(rank, k)
+
+        # else:
+        #     print(
+        #         "WARNING: the number of distinct evidence sets is 0 for {}".format(
+        #             gold_item
+        #         )
+        #     )
+
+    return {'Rprec': Rprec, **P_at_k, **R_at_k, **S_at_k}
+
+
+def compute(gold_dataset, guess_dataset, ks, rank_keys):
+    ks = sorted([int(x) for x in ks])
+
+    result = OrderedDict()
+    result['Rprec'] = 0.0
+    for k in ks:
+        if k > 0:
+            result['precision@{}'.format(k)] = 0.0
+            # if k > 1:
+            result['recall@{}'.format(k)] = 0.0
+            result['success_rate@{}'.format(k)] = 0.0
+
+    assert len(guess_dataset) == len(
+        gold_dataset), 'different size gold: {} guess: {}'.format(
+            len(guess_dataset), len(gold_dataset))
+
+    for gold, guess in zip(guess_dataset, gold_dataset):
+        assert (str(gold['id']).strip() == str(
+            guess['id']).strip()), 'Items must have same order with same IDs'
+
+    for guess_item, gold_item in zip(guess_dataset, gold_dataset):
+        ranking_metrics = get_ranking_metrics(guess_item, gold_item, ks,
+                                              rank_keys)
+        result['Rprec'] += ranking_metrics['Rprec']
+        for k in ks:
+            if k > 0:
+                result['precision@{}'.format(k)] += ranking_metrics[
+                    'precision@{}'.format(k)]
+                result['recall@{}'.format(k)] += ranking_metrics[
+                    'recall@{}'.format(k)]
+                result['success_rate@{}'.format(k)] += ranking_metrics[
+                    'success_rate@{}'.format(k)]
+
+    if len(guess_dataset) > 0:
+        result['Rprec'] /= len(guess_dataset)
+        for k in ks:
+            if k > 0:
+                result['precision@{}'.format(k)] /= len(guess_dataset)
+                # if k > 1:
+                result['recall@{}'.format(k)] /= len(guess_dataset)
+                result['success_rate@{}'.format(k)] /= len(guess_dataset)
+
+    return result
+
+
+def to_distinct_doc_ids(passage_ids):
+    doc_ids = []
+    for pid in passage_ids:
+        # MARK
+        doc_id = pid
+        if doc_id not in doc_ids:
+            doc_ids.append(doc_id)
+    return doc_ids
+
+
+def validate_input(gold_records, guess_records):
+    if len(gold_records) != len(guess_records):
+        print('WARNING: DIFFERENT SIZE gold: {} guess: {}'.format(
+            len(gold_records), len(guess_records)))
+
+    # align order
+    gold_ids = []
+    for gold in gold_records:
+        assert str(
+            gold['id']).strip() not in gold_ids, 'Gold IDs should be unique'
+        gold_ids.append(str(gold['id']).strip())
+
+    id2guess_record = {}
+    for guess in guess_records:
+        assert (str(guess['id']).strip()
+                not in id2guess_record), 'Prediction IDs should be unique'
+        id2guess_record[str(guess['id']).strip()] = guess
+
+    guess_records = []
+    for id in gold_ids:
+        if id in id2guess_record:
+            guess_records.append(id2guess_record[id])
+        else:
+            raise ValueError(
+                'ERROR: no prediction provided for id: {}'.format(id))
+
+    return gold_records, guess_records
+
+
+# utility to get gold answers
+def get_gold_answers(gold):
+    ground_truths = set()
+    for item in gold['output']:
+        if 'answer' in item and item['answer'] and len(
+                item['answer'].strip()) > 0:
+            ground_truths.add(item['answer'].strip())
+    return ground_truths
+
+
+# utility to get max
+def _metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
+    scores_for_ground_truths = []
+    for ground_truth in ground_truths:
+        score = metric_fn(prediction, ground_truth)
+        scores_for_ground_truths.append(score)
+    if scores_for_ground_truths:
+        return max(scores_for_ground_truths)
+    else:
+        return 0
+
+
+def _calculate_metrics(gold_records, guess_records):
+    assert len(gold_records) == len(
+        guess_records), 'different size gold: {} guess: {}'.format(
+            len(gold_records), len(guess_records))
+
+    total_count = 0
+
+    # downstream metrics
+    accuracy = 0
+    normalized_em = 0
+    normalized_f1 = 0
+    rougel = 0
+
+    # kilt metrics
+    kilt_accuracy = 0
+    kilt_em = 0
+    kilt_f1 = 0
+    kilt_rougel = 0
+
+    for guess_item, gold_item in zip(guess_records, gold_records):
+
+        # check ids
+        assert (str(gold_item['id']).strip() == str(guess_item['id']).strip()
+                ), 'Items must have same order with same IDs'
+
+        total_count += 1
+        # check if each output of guess file exist in set of candidate answers
+        gold_candidate_answers = get_gold_answers(gold_item)
+
+        conditions = (len(guess_item['output'])
+                      == 1) and ('answer' in guess_item['output'][0])
+        assert (
+            conditions
+        ), f"you should provide exactly one valid answer for {guess_item['id']}"
+        guess_answer = str(guess_item['output'][0]['answer']).strip()
+
+        if len(guess_answer) == 0:
+            # empty answer
+            continue
+
+        # 0. accuracy = strict exact match
+        local_accuracy = 0
+        if guess_answer in gold_candidate_answers:
+            local_accuracy = 1
+        accuracy += local_accuracy
+
+        # 1. normalized exact match
+        local_em = _metric_max_over_ground_truths(_exact_match_score,
+                                                  guess_answer,
+                                                  gold_candidate_answers)
+        normalized_em += local_em
+
+        # 2. normalized f1
+        local_f1 = _metric_max_over_ground_truths(_f1_score, guess_answer,
+                                                  gold_candidate_answers)
+        normalized_f1 += local_f1
+
+        # 3. rougel
+        local_rougel = _metric_max_over_ground_truths(_rougel_score,
+                                                      guess_answer,
+                                                      gold_candidate_answers)
+        rougel += local_rougel
+
+        # KILT-metrics
+        Rprec = rprecision(guess_item, gold_item, rank_keys=['wikipedia_id'])
+        if Rprec == 1:
+            # 1. KILT-AC
+            kilt_accuracy += local_accuracy
+
+            # 2. KILT-EM
+            kilt_em += local_em
+
+            # 3. KILT-F1
+            kilt_f1 += local_f1
+
+            # 4. KILT-RL
+            kilt_rougel += local_rougel
+
+    if total_count > 0:
+        accuracy /= total_count
+        normalized_em /= total_count
+        normalized_f1 /= total_count
+        rougel /= total_count
+        kilt_accuracy /= total_count
+        kilt_em /= total_count
+        kilt_f1 /= total_count
+        kilt_rougel /= total_count
+
+    return {
+        'kilt': {
+            'KILT-accuracy': kilt_accuracy,
+            'KILT-em': kilt_em,
+            'KILT-f1': kilt_f1,
+            'KILT-rougel': kilt_rougel,
+        },
+        'downstream': {
+            'accuracy': accuracy,
+            'em': normalized_em,
+            'f1': normalized_f1,
+            'rougel': rougel,
+        },
+    }
+
+
+def evaluate(gold, guess):
+    pp = pprint.PrettyPrinter(indent=4)
+
+    gold_records = gold
+    guess_records = load_data(guess)
+
+    # 0. validate input
+    gold_records, guess_records = validate_input(gold_records, guess_records)
+
+    # 1. downstream + kilt
+    result = _calculate_metrics(gold_records, guess_records)
+
+    # 2. retrieval performance
+    retrieval_results = compute(
+        gold_records,
+        guess_records,
+        ks=[1, 5, 10, 100],
+        rank_keys=['wikipedia_id'])
+    result['retrieval'] = {
+        'Rprec': retrieval_results['Rprec'],
+        'recall@1': retrieval_results['recall@1'],
+        'recall@5': retrieval_results['recall@5'],
+        'recall@10': retrieval_results['recall@10'],
+        'recall@100': retrieval_results['recall@100'],
+    }
+
+    pp.pprint(result)
+    return result
+
+
+if __name__ == '__main__':
+    main()
diff --git a/modelscope/pipelines/nlp/document_grounded_dialog_retrieval_pipeline.py b/modelscope/pipelines/nlp/document_grounded_dialog_retrieval_pipeline.py
new file mode 100644
index 00000000..e3461b09
--- /dev/null
+++ b/modelscope/pipelines/nlp/document_grounded_dialog_retrieval_pipeline.py
@@ -0,0 +1,128 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path
+from typing import Any, Dict, List, Union
+
+import faiss
+import json
+import numpy as np
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.models.nlp import DocumentGroundedDialogRetrievalModel
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import \
+    DocumentGroundedDialogRetrievalPreprocessor
+from modelscope.utils.constant import ModeKeys, Tasks
+
+__all__ = ['DocumentGroundedDialogRetrievalPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.document_grounded_dialog_retrieval,
+    module_name=Pipelines.document_grounded_dialog_retrieval)
+class DocumentGroundedDialogRetrievalPipeline(Pipeline):
+
+    def __init__(
+            self,
+            model: Union[DocumentGroundedDialogRetrievalModel, str],
+            preprocessor: DocumentGroundedDialogRetrievalPreprocessor = None,
+            config_file: str = None,
+            device: str = 'gpu',
+            auto_collate=True,
+            index_path: str = None,
+            per_gpu_batch_size: int = 32,
+            **kwargs):
+        """The Retrieval pipeline for document grounded dialog.
+        Args:
+            model: A model instance or a model local dir or a model id in the model hub.
+            preprocessor: A preprocessor instance.
+            config_file: Path to config file.
+            device: Device to run the model.
+            auto_collate: Apply auto collate.
+            index_path: Index file path.
+            per_gpu_batch_size: Batch size per GPU to run the code.
+            **kwargs: The preprocessor kwargs passed into the preprocessor's constructor.
+
+        Examples:
+            >>> from modelscope.pipelines import pipeline
+            >>> pipe_ins = pipeline('document-grounded-dialog-retrieval', model='damo/nlp_convai_retrieval')
+
+        """
+        super().__init__(
+            model=model,
+            preprocessor=preprocessor,
+            config_file=config_file,
+            device=device,
+            auto_collate=auto_collate)
+
+        if preprocessor is None:
+            self.preprocessor = DocumentGroundedDialogRetrievalPreprocessor(
+                self.model.model_dir, **kwargs)
+        self.per_gpu_batch_size = per_gpu_batch_size
+        self.passages_index = []
+        self.passages = []
+        self.index = None
+        self.load_index(index_path)
+
+    def forward(self, inputs: Union[list, Dict[str, Any]],
+                **forward_params) -> Dict[str, Any]:
+        query_vector = self.model.encode_query(
+            inputs).detach().cpu().numpy().astype('float32')
+        D, Index = self.index.search(query_vector, 20)
+        return {'retrieved_ids': Index.tolist()}
+
+    def postprocess(self, inputs: Union[list, Dict[str, Any]],
+                    **postprocess_params) -> Dict[str, Any]:
+        predictions = [[self.passages[x] for x in retrieved_ids]
+                       for retrieved_ids in inputs['retrieved_ids']]
+        return {OutputKeys.OUTPUT: predictions}
+
+    def _collate_fn(self, data):
+        return data
+
+    def load_index(self, index_path: str = None):
+        if not index_path:
+            index_path = os.path.join(self.model.model_dir,
+                                      'passages_index.json')
+        with open(index_path) as f:
+            passage_index = json.load(f)
+        self.passages_index = passage_index
+        self.passages = [x['passage'] for x in passage_index]
+        all_ctx_vector = np.array([x['vector']
+                                   for x in passage_index]).astype('float32')
+        index = faiss.IndexFlatIP(all_ctx_vector.shape[-1])
+        index.add(all_ctx_vector)
+        self.index = index
+
+    def save_index(self, index_path: str = None):
+        if not index_path:
+            index_path = os.path.join(self.model.model_dir,
+                                      'passages_index.json')
+        with open(index_path, 'w') as f:
+            json.dump(self.passage_index, f, ensure_ascii=False, indent=4)
+
+    def add_passage(self, passages: List[str]):
+        all_ctx_vector = []
+        for mini_batch in range(0, len(passages), self.per_gpu_batch_size):
+            context = passages[mini_batch:mini_batch + self.per_gpu_batch_size]
+            processed = self.preprocessor({'context': context},
+                                          invoke_mode=ModeKeys.INFERENCE,
+                                          input_type='context')
+            sub_ctx_vector = self.model.encode_context(
+                processed).detach().cpu().numpy()
+            all_ctx_vector.append(sub_ctx_vector)
+        all_ctx_vector = np.concatenate(all_ctx_vector, axis=0)
+        all_ctx_vector = np.array(all_ctx_vector).astype('float32')
+        for passage, vector in zip(passages, all_ctx_vector):
+            self.passages_index.append({
+                'passage': passage,
+                'vector': vector.tolist()
+            })
+        self.passages = [x['passage'] for x in self.passage_index]
+        all_ctx_vector = np.array([x['vector'] for x in self.passage_index
+                                   ]).astype('float32')
+        index = faiss.IndexFlatIP(all_ctx_vector.shape[-1])
+        index.add(all_ctx_vector)
+        self.index = index
diff --git a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py
index 5675144a..0b2ba199 100644
--- a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py
+++ b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py
@@ -8,7 +8,7 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import Preprocessor
-from modelscope.utils.constant import Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 
 __all__ = ['FaqQuestionAnsweringPipeline']
 
@@ -38,9 +38,13 @@ class FaqQuestionAnsweringPipeline(Pipeline):
             config_file=config_file,
             device=device,
             auto_collate=auto_collate)
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
         if preprocessor is None:
             self.preprocessor = Preprocessor.from_pretrained(
                 self.model.model_dir, **kwargs)
+        if hasattr(self.model, 'eval'):
+            self.model.eval()
 
     def _sanitize_parameters(self, **pipeline_parameters):
         return pipeline_parameters, pipeline_parameters, pipeline_parameters
diff --git a/modelscope/pipelines/nlp/feature_extraction_pipeline.py b/modelscope/pipelines/nlp/feature_extraction_pipeline.py
index 2ea264f0..6131fa61 100644
--- a/modelscope/pipelines/nlp/feature_extraction_pipeline.py
+++ b/modelscope/pipelines/nlp/feature_extraction_pipeline.py
@@ -40,7 +40,7 @@ class FeatureExtractionPipeline(Pipeline):
             kwargs (dict, `optional`):
                 Extra kwargs passed into the preprocessor's constructor.
 
-            Example:
+        Examples:
             >>> from modelscope.pipelines import pipeline
             >>> pipe_ins = pipeline('feature_extraction', model='damo/nlp_structbert_feature-extraction_english-large')
             >>> input = 'Everything you love is treasure'
@@ -55,6 +55,8 @@ class FeatureExtractionPipeline(Pipeline):
             device=device,
             auto_collate=auto_collate)
 
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
         if preprocessor is None:
             self.preprocessor = Preprocessor.from_pretrained(
                 self.model.model_dir,
diff --git a/modelscope/pipelines/nlp/fid_dialogue_pipeline.py b/modelscope/pipelines/nlp/fid_dialogue_pipeline.py
new file mode 100644
index 00000000..2880b4a2
--- /dev/null
+++ b/modelscope/pipelines/nlp/fid_dialogue_pipeline.py
@@ -0,0 +1,176 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import re
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.base import Model
+from modelscope.outputs import OutputKeys, TokenGeneratorOutput
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import Preprocessor
+from modelscope.utils.constant import ModelFile, Tasks
+
+context_template = '假设我和你正在进行对话，请你给我得体、准确、友好的回复。以下是我们的对话内容。{context}'
+history_template = '假设我和你正在进行对话，请你给我得体、准确、友好的回复。以下是我们的对话内容。{context}' \
+                   '#以下是在此之前我们的对话内容，可作为回复时的参考。{history}'
+knowledge_template = '假设我和你正在进行对话，请你给我得体、准确、友好的回复。以下是我们的对话内容。{context}' \
+                     '#以下是和对话相关的知识，请你参考该知识进行回复。{knowledge}'
+user_profile_template = '假设我和你正在进行对话，请你给我得体、准确、友好的回复。以下是我们的对话内容。{context}' \
+                        '#假设以下是你对我所了解的信息，请你参考该信息并避免你的回复和该信息矛盾，信息如下：{user_profile}'
+bot_profile_template = '假设我和你正在进行对话，请你给我得体、准确、友好的回复。以下是我们的对话内容。{context}' \
+                       '#假设以下是你的人物设定，请你参考该信息并避免你的回复和该信息矛盾，信息如下：{bot_profile}'
+
+__all__ = ['FidDialoguePipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.fid_dialogue, module_name=Pipelines.fid_dialogue)
+class FidDialoguePipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 config_file: str = None,
+                 device: str = 'gpu',
+                 auto_collate=True,
+                 **kwargs):
+        """Use `model` and `preprocessor` to create a fid-dialogue pipeline for prediction.
+
+        Args:
+            model (str or Model): Supply either a local model dir which supported the text generation task,
+            or a model id from the model hub, or a torch model instance.
+            preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for
+            the model if supplied.
+            kwargs (dict, `optional`):
+                Extra kwargs passed into the preprocessor's constructor.
+            Examples:
+                >>> from modelscope.pipelines import pipeline
+                >>> from modelscope.utils.constant import Tasks
+                >>> pipeline_ins = pipeline(Tasks.fid_dialogue, model='damo/plug-dialogue', model_revision='v1.0.1')
+                >>> input = {
+                >>>    "history": "你好[SEP]你好，我是小达，很高兴认识你！[SEP]李白是谁",
+                >>>    "bot_profile": "我是小达;我是女生;我是单身;我今年21岁;我生日是2001年11月11日",
+                >>>    "knowledge": "唐代诗人李白（701年—762年12月）,字太白,号青莲居士,又号“谪仙人”[SEP]李白（公元701年—公元762年），字太白",
+                >>>    "user_profile": "你是小明"
+                >>> }
+                >>> result = pipeline_ins(input)
+                >>> print(result)
+        """
+        super().__init__(
+            model=model,
+            preprocessor=preprocessor,
+            config_file=config_file,
+            device=device,
+            auto_collate=auto_collate,
+            **kwargs)
+
+        if preprocessor is None:
+            self.preprocessor_tokenizer = Preprocessor.from_pretrained(
+                self.model.model_dir, **kwargs)
+
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
+        self.model = self.model.to(self.device)
+        self.model.eval()
+
+        self.SEP = '[SEP]'
+
+    def forward(self, inputs: Dict[str, Any], **forward_params):
+        with torch.no_grad():
+            return self.model.generate(inputs, **forward_params)
+
+    def preprocess(self, inputs: Dict[str, Any],
+                   **preprocess_params) -> Dict[str, Any]:
+        # init params
+        max_encoder_length = 300
+        if 'max_encoder_length' in preprocess_params:
+            max_encoder_length = preprocess_params.pop('max_encoder_length')
+        # get raw data
+        history = inputs['history'] if 'history' in inputs else ''
+        if len(history) <= 0:
+            raise Exception('history is necessary!')
+        knowledge = inputs['knowledge'] if 'knowledge' in inputs else ''
+        user_profile = inputs[
+            'user_profile'] if 'user_profile' in inputs else ''
+        bot_profile = inputs['bot_profile'] if 'bot_profile' in inputs else ''
+        # parse raw data
+        history = history.split(self.SEP)
+        context = history[-3:]
+        context = self.process_context(context)
+        history = history[:-3]
+        history = self.process_history(history)
+        knowledge = knowledge.split(self.SEP)
+
+        model_input = []
+        if history and len(history) > 0:
+            model_input.append(
+                history_template.format(context=context, history=history))
+        if knowledge and len(knowledge) > 0:
+            for know in knowledge:
+                model_input.append(
+                    knowledge_template.format(context=context, knowledge=know))
+        if user_profile and len(user_profile) > 0:
+            model_input.append(
+                user_profile_template.format(
+                    context=context, user_profile=user_profile))
+        if bot_profile and len(bot_profile) > 0:
+            model_input.append(
+                bot_profile_template.format(
+                    context=context, bot_profile=bot_profile))
+
+        if not model_input:
+            model_input.append(context_template.format(context=context))
+
+        for i in range(len(model_input)):
+            model_input[i] = re.sub('[ \t]+', '▂', model_input[i])
+
+        # tokenization
+        input_ids = self.preprocessor_tokenizer(
+            {'src_txt': model_input},
+            padding=True,
+            truncation=True,
+            max_length=max_encoder_length,
+            return_tensors='pt')['input_ids'].unsqueeze(0).to(self.device)
+        input_dict = {
+            'input_ids':
+            input_ids.to(torch.int64).to(self.device),
+            'attention_mask': (input_ids != 0).to(torch.int64).to(self.device),
+            'token_type_ids':
+            torch.zeros(input_ids.shape).to(torch.int64).to(self.device)
+        }
+
+        return input_dict
+
+    def process_context(self, context_list):
+        subject = '我'
+        for i in range(len(context_list) - 1, -1, -1):
+            if len(context_list[i]) > 0 and context_list[i][
+                    -1] not in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~、。，？！；：“”（）【】《》〈〉……':
+                context_list[i] = context_list[i] + '。'
+            context_list[i] = subject + '：' + context_list[i]
+            subject = '你' if subject == '我' else '我'
+        return ''.join(context_list)
+
+    def process_history(self, history_list):
+        subject = '你'
+        for i in range(len(history_list) - 1, -1, -1):
+            if len(history_list[i]) > 0 and history_list[i][
+                    -1] not in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~、。，？！；：“”（）【】《》〈〉……':
+                history_list[i] = history_list[i] + '。'
+            history_list[i] = subject + '：' + history_list[i]
+            subject = '你' if subject == '我' else '我'
+        return ''.join(history_list)
+
+    def postprocess(self, inputs: TokenGeneratorOutput,
+                    **postprocess_params) -> Dict[str, Any]:
+
+        if torch.cuda.is_available():
+            hypotheses = inputs.sequences.detach().cpu().tolist()
+
+        response = self.preprocessor_tokenizer.decode(
+            hypotheses[0], skip_special_tokens=True)
+        response = response.replace(' ', '')
+        return {OutputKeys.TEXT: response}
diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py
index af731d00..dc12efa7 100644
--- a/modelscope/pipelines/nlp/fill_mask_pipeline.py
+++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py
@@ -10,7 +10,7 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline, Tensor
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import Preprocessor
-from modelscope.utils.constant import Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 
 __all__ = ['FillMaskPipeline']
 
@@ -38,21 +38,24 @@ class FillMaskPipeline(Pipeline):
             kwargs (dict, `optional`):
                 Extra kwargs passed into the preprocessor's constructor.
 
-            Example1:
-            >>> from modelscope.pipelines import pipeline
-            >>> pipeline_ins = pipeline('fill-mask', model='damo/nlp_structbert_fill-mask_english-large')
-            >>> input = 'Everything in [MASK] you call reality is really [MASK] a reflection of your [MASK].'
-            >>> print(pipeline_ins(input))
-            Example2:
-            >>> from modelscope.pipelines import pipeline
-            >>> pipeline_ins = pipeline('fill-mask', model='damo/nlp_ponet_fill-mask_english-base')
-            >>> input = 'Everything in [MASK] you call reality is really [MASK] a reflection of your [MASK].'
-            >>> print(pipeline_ins(input))
+        Examples:
 
-            NOTE2: Please pay attention to the model's special tokens.
-            If bert based model(bert, structbert, etc.) is used, the mask token is '[MASK]'.
-            If the xlm-roberta(xlm-roberta, veco, etc.) based model is used, the mask token is '<mask>'.
-            To view other examples plese check tests/pipelines/test_fill_mask.py.
+        >>> from modelscope.pipelines import pipeline
+        >>> pipeline_ins = pipeline('fill-mask', model='damo/nlp_structbert_fill-mask_english-large')
+        >>> input = 'Everything in [MASK] you call reality is really [MASK] a reflection of your [MASK].'
+        >>> print(pipeline_ins(input))
+
+        Examples:
+
+        >>> from modelscope.pipelines import pipeline
+        >>> pipeline_ins = pipeline('fill-mask', model='damo/nlp_ponet_fill-mask_english-base')
+        >>> input = 'Everything in [MASK] you call reality is really [MASK] a reflection of your [MASK].'
+        >>> print(pipeline_ins(input))
+
+        NOTE2: Please pay attention to the model's special tokens.
+        If bert based model(bert, structbert, etc.) is used, the mask token is '[MASK]'.
+        If the xlm-roberta(xlm-roberta, veco, etc.) based model is used, the mask token is '<mask>'.
+        To view other examples plese check tests/pipelines/test_fill_mask.py.
         """
         super().__init__(
             model=model,
@@ -61,6 +64,8 @@ class FillMaskPipeline(Pipeline):
             device=device,
             auto_collate=auto_collate)
 
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
         if preprocessor is None:
             self.preprocessor = Preprocessor.from_pretrained(
                 self.model.model_dir,
diff --git a/modelscope/pipelines/nlp/information_extraction_pipeline.py b/modelscope/pipelines/nlp/information_extraction_pipeline.py
index 0c726c9a..0c2ddf87 100644
--- a/modelscope/pipelines/nlp/information_extraction_pipeline.py
+++ b/modelscope/pipelines/nlp/information_extraction_pipeline.py
@@ -9,7 +9,7 @@ from modelscope.models import Model
 from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import Preprocessor
-from modelscope.utils.constant import Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 
 __all__ = ['InformationExtractionPipeline']
 
@@ -45,6 +45,9 @@ class InformationExtractionPipeline(Pipeline):
             device=device,
             auto_collate=auto_collate)
 
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
+
         if self.preprocessor is None:
             self.preprocessor = Preprocessor.from_pretrained(
                 self.model.model_dir,
diff --git a/modelscope/pipelines/nlp/interactive_translation_pipeline.py b/modelscope/pipelines/nlp/interactive_translation_pipeline.py
new file mode 100644
index 00000000..86df09bb
--- /dev/null
+++ b/modelscope/pipelines/nlp/interactive_translation_pipeline.py
@@ -0,0 +1,170 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os.path as osp
+from typing import Any, Dict
+
+import jieba
+import numpy as np
+import tensorflow as tf
+from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer
+from subword_nmt import apply_bpe
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.base import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.pipelines.nlp.translation_pipeline import TranslationPipeline
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+if tf.__version__ >= '2.0':
+    tf = tf.compat.v1
+    tf.disable_eager_execution()
+
+logger = get_logger()
+
+__all__ = ['InteractiveTranslationPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.translation, module_name=Pipelines.interactive_translation)
+class InteractiveTranslationPipeline(TranslationPipeline):
+
+    def __init__(self, model: Model, **kwargs):
+        """Build a interactive translation pipeline with a model dir or a model id in the model hub.
+
+        Args:
+            model (`str` or `Model` or module instance): A model instance or a model local dir
+                or a model id in the model hub.
+
+        Example:
+            >>> from modelscope.pipelines import pipeline
+            >>> pipeline_ins = pipeline(task=Tasks.translation,
+                model='damo/nlp_imt_translation_zh2en')
+            >>> input_sequence = 'Elon Musk, co-founder and chief executive officer of Tesla Motors.'
+            >>> input_prefix = "特斯拉汽车公司"
+            >>> print(pipeline_ins(input_sequence + "<PREFIX_SPLIT>" + input_prefix))
+        """
+        super().__init__(model=model, **kwargs)
+        model = self.model.model_dir
+        tf.reset_default_graph()
+        model_path = osp.join(
+            osp.join(model, ModelFile.TF_CHECKPOINT_FOLDER), 'ckpt-0')
+
+        self._trg_vocab = dict([
+            (w.strip(), i) for i, w in enumerate(open(self._trg_vocab_path))
+        ])
+        self._len_tgt_vocab = len(self._trg_rvocab)
+
+        self.input_wids = tf.placeholder(
+            dtype=tf.int64, shape=[None, None], name='input_wids')
+
+        self.prefix_wids = tf.placeholder(
+            dtype=tf.int64, shape=[None, None], name='prefix_wids')
+
+        self.prefix_hit = tf.placeholder(
+            dtype=tf.bool, shape=[None, None], name='prefix_hit')
+
+        self.output = {}
+
+        # preprocess
+        if self._tgt_lang == 'zh':
+            self._tgt_tok = jieba
+        else:
+            self._tgt_punct_normalizer = MosesPunctNormalizer(
+                lang=self._tgt_lang)
+            self._tgt_tok = MosesTokenizer(lang=self._tgt_lang)
+
+        # model
+        output = self.model(self.input_wids, None, self.prefix_wids,
+                            self.prefix_hit)
+        self.output.update(output)
+
+        tf_config = tf.ConfigProto(allow_soft_placement=True)
+        tf_config.gpu_options.allow_growth = True
+        self._session = tf.Session(config=tf_config)
+
+        with self._session.as_default() as sess:
+            logger.info(f'loading model from {model_path}')
+            # load model
+            model_loader = tf.train.Saver(tf.global_variables())
+            model_loader.restore(sess, model_path)
+
+    def preprocess(self, input: str) -> Dict[str, Any]:
+        input_src, prefix = input.split('<PREFIX_SPLIT>', 1)
+        if self._src_lang == 'zh':
+            input_tok = self._tok.cut(input_src)
+            input_tok = ' '.join(list(input_tok))
+        else:
+            input_src = self._punct_normalizer.normalize(input_src)
+            input_tok = self._tok.tokenize(
+                input_src, return_str=True, aggressive_dash_splits=True)
+
+        if self._tgt_lang == 'zh':
+            prefix = self._tgt_tok.lcut(prefix)
+            prefix_tok = ' '.join(list(prefix)[:-1])
+        else:
+            prefix = self._tgt_punct_normalizer.normalize(prefix)
+            prefix = self._tgt_tok.tokenize(
+                prefix, return_str=True, aggressive_dash_splits=True).split()
+            prefix_tok = ' '.join(prefix[:-1])
+
+        if len(list(prefix)) > 0:
+            subword = list(prefix)[-1]
+        else:
+            subword = ''
+
+        input_bpe = self._bpe.process_line(input_tok)
+        prefix_bpe = self._bpe.process_line(prefix_tok)
+        input_ids = np.array([[
+            self._src_vocab[w]
+            if w in self._src_vocab else self.cfg['model']['src_vocab_size']
+            for w in input_bpe.strip().split()
+        ]])
+
+        prefix_ids = np.array([[
+            self._trg_vocab[w]
+            if w in self._trg_vocab else self.cfg['model']['trg_vocab_size']
+            for w in prefix_bpe.strip().split()
+        ]])
+
+        prefix_hit = [[0] * (self._len_tgt_vocab + 1)]
+
+        if subword != '':
+            hit_state = False
+            for i, w in self._trg_rvocab.items():
+                if w.startswith(subword):
+                    prefix_hit[0][i] = 1
+                    hit_state = True
+            if hit_state is False:
+                prefix_hit = [[1] * (self._len_tgt_vocab + 1)]
+        result = {
+            'input_ids': input_ids,
+            'prefix_ids': prefix_ids,
+            'prefix_hit': np.array(prefix_hit)
+        }
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        with self._session.as_default():
+            feed_dict = {
+                self.input_wids: input['input_ids'],
+                self.prefix_wids: input['prefix_ids'],
+                self.prefix_hit: input['prefix_hit']
+            }
+            sess_outputs = self._session.run(self.output, feed_dict=feed_dict)
+            return sess_outputs
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        output_seqs = inputs['output_seqs'][0]
+        wids = list(output_seqs[0]) + [0]
+        wids = wids[:wids.index(0)]
+        translation_out = ' '.join([
+            self._trg_rvocab[wid] if wid in self._trg_rvocab else '<unk>'
+            for wid in wids
+        ]).replace('@@ ', '').replace('@@', '')
+        translation_out = self._detok.detokenize(translation_out.split())
+        result = {OutputKeys.TRANSLATION: translation_out}
+        return result
diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
index 5901ab36..8a25c415 100644
--- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
+++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
@@ -7,7 +7,7 @@ from modelscope.models import Model
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.pipelines.nlp import TokenClassificationPipeline
 from modelscope.preprocessors import Preprocessor
-from modelscope.utils.constant import Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 
 __all__ = ['NamedEntityRecognitionPipeline']
 
@@ -37,10 +37,11 @@ class NamedEntityRecognitionPipeline(TokenClassificationPipeline):
             model (str or Model): Supply either a local model dir which supported NER task, or a
             model id from the model hub, or a torch model instance.
             preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for
-            the model if supplied.
+                the model if supplied.
             kwargs (dict, `optional`):
                 Extra kwargs passed into the preprocessor's constructor.
-            Example:
+
+        Examples:
             >>> from modelscope.pipelines import pipeline
             >>> pipeline_ins = pipeline(task='named-entity-recognition',
             >>>        model='damo/nlp_raner_named-entity-recognition_chinese-base-news')
@@ -55,6 +56,10 @@ class NamedEntityRecognitionPipeline(TokenClassificationPipeline):
             config_file=config_file,
             device=device,
             auto_collate=auto_collate)
+
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
+
         if preprocessor is None:
             self.preprocessor = Preprocessor.from_pretrained(
                 self.model.model_dir,
diff --git a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py
index 424a9abc..7aaa073b 100644
--- a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py
+++ b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py
@@ -3,6 +3,7 @@
 from typing import Any, Dict, Optional, Union
 
 import numpy as np
+import torch
 
 from modelscope.metainfo import Pipelines
 from modelscope.models import Model
@@ -10,7 +11,7 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import Preprocessor
-from modelscope.utils.constant import Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 
 __all__ = ['SentenceEmbeddingPipeline']
 
@@ -42,6 +43,10 @@ class SentenceEmbeddingPipeline(Pipeline):
             config_file=config_file,
             device=device,
             auto_collate=auto_collate)
+
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
+
         if preprocessor is None:
             self.preprocessor = Preprocessor.from_pretrained(
                 self.model.model_dir,
@@ -61,11 +66,17 @@ class SentenceEmbeddingPipeline(Pipeline):
         Returns:
             Dict[str, Any]: the predicted text representation
         """
-        embs = inputs['last_hidden_state'][:, 0].cpu().numpy()
-        num_sent = embs.shape[0]
-        if num_sent >= 2:
-            scores = np.dot(embs[0:1, ], np.transpose(embs[1:, ],
-                                                      (1, 0))).tolist()[0]
+        embeddings = inputs['query_embeddings']
+        doc_embeddings = inputs['doc_embeddings']
+        if doc_embeddings is not None:
+            embeddings = torch.cat((embeddings, doc_embeddings), dim=0)
+        embeddings = embeddings.detach().cpu().numpy()
+        if doc_embeddings is not None:
+            scores = np.dot(embeddings[0:1, ],
+                            np.transpose(embeddings[1:, ], (1, 0))).tolist()[0]
         else:
             scores = []
-        return {OutputKeys.TEXT_EMBEDDING: embs, OutputKeys.SCORES: scores}
+        return {
+            OutputKeys.TEXT_EMBEDDING: embeddings,
+            OutputKeys.SCORES: scores
+        }
diff --git a/modelscope/pipelines/nlp/siamese_uie_pipeline.py b/modelscope/pipelines/nlp/siamese_uie_pipeline.py
new file mode 100644
index 00000000..c9f86893
--- /dev/null
+++ b/modelscope/pipelines/nlp/siamese_uie_pipeline.py
@@ -0,0 +1,317 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import logging
+import os
+import pathlib
+from copy import deepcopy
+from math import ceil
+from time import time
+from typing import Any, Dict, Generator, List, Mapping, Optional, Union
+
+import json
+import torch
+from scipy.special import softmax
+from torch.cuda.amp import autocast
+from tqdm import tqdm
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.msdatasets import MsDataset
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import Preprocessor, SiameseUiePreprocessor
+from modelscope.utils.constant import Tasks
+
+Input = Union[str, tuple, MsDataset, 'Image.Image', 'numpy.ndarray']
+
+logger = logging.getLogger(__name__)
+
+os.environ['TOKENIZERS_PARALLELISM'] = 'true'
+
+__all__ = ['SiameseUiePipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.siamese_uie, module_name=Pipelines.siamese_uie)
+class SiameseUiePipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 config_file: str = None,
+                 device: str = 'cpu',
+                 auto_collate=True,
+                 **kwargs):
+        """Use `model` and `preprocessor` to create a generation pipeline for prediction.
+
+        Args:
+            model (str or Model): Supply either a local model dir which supported the text generation task,
+            or a model id from the model hub, or a torch model instance.
+            preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for
+            the model if supplied.
+            kwargs (dict, `optional`):
+                Extra kwargs passed into the preprocessor's constructor.
+
+        Examples:
+            >>> from modelscope.pipelines import pipeline
+            >>> pipeline_ins = pipeline(Tasks.siamese_uie,
+            >>>    model='damo/nlp_structbert_siamese-uie_chinese-base')
+            >>> sentence = '1944年毕业于北大的名古屋铁道会长谷口清太郎等人在日本积极筹资，共筹款2.7亿日元，参加捐款的日本企业有69家。'
+            >>> print(pipeline_ins(sentence, schema={'人物': None, '地理位置': None, '组织机构': None}))
+
+            To view other examples plese check tests/pipelines/test_siamese_uie.py.
+        """
+        super().__init__(
+            model=model,
+            preprocessor=preprocessor,
+            config_file=config_file,
+            device=device,
+            auto_collate=auto_collate)
+
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
+
+        if self.preprocessor is None:
+            self.preprocessor = Preprocessor.from_pretrained(
+                self.model.model_dir, **kwargs)
+        self.model.eval()
+        self.slide_len = 352
+        self.max_len = 384
+        self.hint_max_len = 128
+        self.inference_batch_size = 8
+        self.threshold = 0.5
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        pass
+
+    def __call__(self, input: Union[Input, List[Input]], *args,
+                 **kwargs) -> Union[Dict[str, Any], Generator]:
+        """
+        Args:
+            input(str): sentence to extract
+            schema: (dict or str) schema of uie task
+        Default Returns:
+            List[List]:  predicted info list i.e.
+            [[{'type': '人物', 'span': '谷口清太郎', 'offset': [18, 23]}],
+            [{'type': '地理位置', 'span': '日本', 'offset': [26, 28]}],
+            [{'type': '地理位置', 'span': '日本', 'offset': [48, 50]}],
+            [{'type': '组织机构', 'span': '北大', 'offset': [8, 10]}],
+            [{'type': '组织机构', 'span': '名古屋铁道', 'offset': [11, 16]}]]
+        """
+        # override __call__ because siamese-uie needs to recursively tokenize prompt
+        if 'batch_size' in kwargs:
+            batch_size = kwargs.pop('batch_size')
+            if batch_size and batch_size > 1:
+                raise Exception('This pipeline do not support batch inference')
+        # place model to cpu or gpu
+        if self.model:
+            if not self._model_prepare:
+                self.prepare_model()
+
+        # sanitize the parameters
+        text = input
+        schema = kwargs.pop('schema')
+        if type(schema) == str:
+            schema = json.loads(schema)
+        output_all_prefix = kwargs.pop('output_all_prefix', False)
+        tokenized_text = self.preprocessor([text])[0]
+        pred_info_list = []
+        prefix_info = []
+        self.forward(text, tokenized_text, prefix_info, schema, pred_info_list,
+                     output_all_prefix)
+        return {'output': pred_info_list}
+
+    def _pad(self, input_ids, pad_token_id):
+        input_ids[-1] += [pad_token_id] * (self.max_len - len(input_ids[-1]))
+        return input_ids
+
+    def tokenize_sample(self, text, tokenized_text, hints):
+        tokenized_hints = self.preprocessor(
+            hints, padding=True, truncation=True, max_length=self.hint_max_len)
+        tokenized_data = []
+        split_num = ceil(
+            (len(tokenized_text) - self.max_len)
+            / self.slide_len) + 1 if len(tokenized_text) > self.max_len else 1
+        token_ids = [
+            tokenized_text.ids[j * self.slide_len:j * self.slide_len
+                               + self.max_len] for j in range(split_num)
+        ]
+        attention_masks = [
+            tokenized_text.attention_mask[j * self.slide_len:j * self.slide_len
+                                          + self.max_len]
+            for j in range(split_num)
+        ]
+        if split_num > 1:
+            token_ids = self._pad(token_ids, 0)
+            attention_masks = self._pad(attention_masks, 0)
+        token_ids = torch.tensor(
+            token_ids, dtype=torch.long, device=self.device)
+        attention_masks = torch.tensor(
+            attention_masks, dtype=torch.long, device=self.device)
+        batch_num = token_ids.size(0) // self.inference_batch_size + 1
+        all_token_ids = torch.tensor_split(token_ids, batch_num)
+        all_attention_masks = torch.tensor_split(attention_masks, batch_num)
+        all_sequence_output = []
+        with torch.no_grad():
+            with autocast():
+                for token_ids, attention_masks in zip(all_token_ids,
+                                                      all_attention_masks):
+                    sequence_output = self.model.get_plm_sequence_output(
+                        token_ids, attention_masks)
+                    all_sequence_output.append(sequence_output)
+        all_sequence_output = torch.cat(all_sequence_output, dim=0)
+        all_attention_masks = torch.cat(all_attention_masks, dim=0)
+        for i in range(len(hints)):
+            hint = hints[i]
+            tokenized_hint = tokenized_hints[i]
+            for j in range(split_num):
+                a = j * self.slide_len
+                item = {
+                    'id': hint + '--' + text,
+                    'hint': hint,
+                    'text': text,
+                    'shift': a,
+                    'sequence_output': all_sequence_output[j],
+                    'hint_token_ids': tokenized_hint.ids,
+                    'attention_masks': all_attention_masks[j],
+                    'cross_attention_masks': tokenized_hint.attention_mask
+                }
+                tokenized_data.append(item)
+        # size #hint * #sub_text
+        return tokenized_data
+
+    def get_tokenized_data_and_data_loader(self, text, tokenized_text, hints):
+        tokenized_data = self.tokenize_sample(text, tokenized_text, hints)
+        sequence_output = torch.stack(
+            [item['sequence_output'] for item in tokenized_data])
+        attention_masks = torch.stack(
+            [item['attention_masks'] for item in tokenized_data])
+        hint_token_ids = torch.tensor(
+            [item['hint_token_ids'] for item in tokenized_data],
+            dtype=torch.long,
+            device=self.device)
+        cross_attention_masks = torch.tensor(
+            [item['cross_attention_masks'] for item in tokenized_data],
+            dtype=torch.long,
+            device=self.device)
+        # split to batchs
+        batch_num = sequence_output.size(0) // self.inference_batch_size + 1
+        sequence_output = torch.tensor_split(sequence_output, batch_num)
+        attention_masks = torch.tensor_split(attention_masks, batch_num)
+        hint_token_ids = torch.tensor_split(hint_token_ids, batch_num)
+        cross_attention_masks = torch.tensor_split(cross_attention_masks,
+                                                   batch_num)
+        return tokenized_data, (sequence_output, attention_masks,
+                                hint_token_ids, cross_attention_masks)
+
+    def get_entities(self, text, offsets, head_probs, tail_probs):
+        sample_entities = []
+        potential_heads = [
+            j for j in range(len(head_probs)) if head_probs[j] > self.threshold
+        ]
+        for ph in potential_heads:
+            for pt in range(ph, len(tail_probs)):
+                if tail_probs[pt] > self.threshold:
+                    char_head = offsets[ph][0]
+                    char_tail = offsets[pt][1]
+                    e = {
+                        'offset': [char_head, char_tail],
+                        'span': text[char_head:char_tail]
+                    }
+                    sample_entities.append(e)
+                    break
+        sample_entities = sorted(
+            sample_entities, key=lambda x: tuple(x['offset']))
+        return sample_entities
+
+    def get_prefix_infos(self, text, tokenized_text, prefix_info,
+                         schema_types):
+        hints = []
+        for st in schema_types:
+            hint = ''
+            for item in prefix_info:
+                hint += f'{item["type"]}: {item["span"]}, '
+            hint += f'{st}: '
+            # print('hint: ', hint)
+            hints.append(hint)
+        all_valid_tokenized_data, all_tensor_data = self.get_tokenized_data_and_data_loader(
+            text, tokenized_text, hints)
+        probs = []
+        last_uuid = None
+        all_pred_entities = []
+        all_head_probs = []
+        all_tail_probs = []
+        with torch.no_grad():
+            with autocast():
+                for batch_data in zip(*all_tensor_data):
+                    batch_head_probs, batch_tail_probs = self.model(
+                        *batch_data)
+                    batch_head_probs, batch_tail_probs = batch_head_probs.tolist(
+                    ), batch_tail_probs.tolist()  # (b, n, l)
+                    all_head_probs += batch_head_probs
+                    all_tail_probs += batch_tail_probs
+        all_valid_tokenized_data.append({'id': 'WhatADifferentUUiD'
+                                         })  # 需要加一个冗余数据，不然最后一个样本无法触发条件
+        all_head_probs.append(None)
+        all_tail_probs.append(None)
+        for tokenized_sample, head_probs, tail_probs in zip(
+                all_valid_tokenized_data, all_head_probs, all_tail_probs):
+            uuid = tokenized_sample['id']
+            prob = {
+                'shift': tokenized_sample.get('shift', 0),
+                'head': head_probs,  # (n, l)
+                'tail': tail_probs
+            }
+            if last_uuid is not None and uuid != last_uuid:
+                len_tokens = len(tokenized_text.offsets)
+                head_probs = [-1] * len_tokens  # (n, l)
+                tail_probs = [-1] * len_tokens
+                for prob_tmp in probs:
+                    shift = prob_tmp['shift']
+                    head = prob_tmp['head']
+                    tail = prob_tmp['tail']
+                    len_sub = len(head)
+                    for j in range(len_sub):
+                        if j + shift < len_tokens:
+                            head_probs[j + shift] = head[j] if head_probs[
+                                j + shift] == -1 else (head_probs[j + shift]
+                                                       + head[j]) / 2
+                            tail_probs[j + shift] = tail[j] if tail_probs[
+                                j + shift] == -1 else (tail_probs[j + shift]
+                                                       + tail[j]) / 2
+                # print('head_probs', head_probs)
+                # print('head_probs', tail_probs)
+                offsets = tokenized_text.offsets
+                pred_entities = self.get_entities(text, offsets, head_probs,
+                                                  tail_probs)
+                all_pred_entities.append(pred_entities)
+                probs = []
+            probs.append(prob)
+            last_uuid = uuid
+        next_prefix_infos = []
+        for st, pred_entities in zip(schema_types, all_pred_entities):
+            for e in pred_entities:
+                pi = deepcopy(prefix_info)
+                item = {'type': st, 'span': e['span'], 'offset': e['offset']}
+                pi.append(item)
+                next_prefix_infos.append(pi)
+        return next_prefix_infos
+
+    def forward(self, text, tokenized_text, prefix_info, curr_schema_dict,
+                pred_info_list, output_all_prefix):
+        # print('prefix_info', prefix_info)
+        next_prefix_infos = self.get_prefix_infos(text, tokenized_text,
+                                                  prefix_info,
+                                                  curr_schema_dict)
+        # print('next_prefix_infos', next_prefix_infos)
+        for prefix_info in next_prefix_infos:
+            next_schema_dict = curr_schema_dict[prefix_info[-1]['type']]
+            if next_schema_dict is None:
+                pred_info_list.append(prefix_info)
+            else:
+                if output_all_prefix:
+                    pred_info_list.append(prefix_info)
+                self.forward(text, tokenized_text, prefix_info,
+                             next_schema_dict, pred_info_list,
+                             output_all_prefix)
diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py
index c5e0203f..365c6c6c 100644
--- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py
+++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py
@@ -53,6 +53,9 @@ class TableQuestionAnsweringPipeline(Pipeline):
             device=device,
             auto_collate=auto_collate)
 
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
+
         if preprocessor is None:
             self.preprocessor = TableQuestionAnsweringPreprocessor(
                 self.model.model_dir, **kwargs)
diff --git a/modelscope/pipelines/nlp/text_classification_pipeline.py b/modelscope/pipelines/nlp/text_classification_pipeline.py
index 9a3b6901..5b76a571 100644
--- a/modelscope/pipelines/nlp/text_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/text_classification_pipeline.py
@@ -11,7 +11,7 @@ from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.pipelines.util import batch_process
 from modelscope.preprocessors import Preprocessor
-from modelscope.utils.constant import Fields, Tasks
+from modelscope.utils.constant import Fields, ModelFile, Tasks
 from modelscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -49,7 +49,7 @@ class TextClassificationPipeline(Pipeline):
             kwargs (dict, `optional`):
                 Extra kwargs passed into the preprocessor's constructor.
 
-        Example:
+        Examples:
             >>> from modelscope.pipelines import pipeline
             >>> pipeline_ins = pipeline('text-classification',
                 model='damo/nlp_structbert_sentence-similarity_chinese-base')
@@ -63,6 +63,9 @@ class TextClassificationPipeline(Pipeline):
             device=device,
             auto_collate=auto_collate)
 
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
+
         if preprocessor is None:
             if self.model.__class__.__name__ == 'OfaForAllTasks':
                 self.preprocessor = Preprocessor.from_pretrained(
diff --git a/modelscope/pipelines/nlp/text_error_correction_pipeline.py b/modelscope/pipelines/nlp/text_error_correction_pipeline.py
index 39fcdcc1..9df31b3c 100644
--- a/modelscope/pipelines/nlp/text_error_correction_pipeline.py
+++ b/modelscope/pipelines/nlp/text_error_correction_pipeline.py
@@ -11,7 +11,7 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline, Tensor
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import Preprocessor
-from modelscope.utils.constant import Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 
 __all__ = ['TextErrorCorrectionPipeline']
 
@@ -27,19 +27,21 @@ class TextErrorCorrectionPipeline(Pipeline):
                  device: str = 'gpu',
                  auto_collate=True,
                  **kwargs):
-        """use `model` and `preprocessor` to create a nlp text correction pipeline.
+        """
+        Use `model` and `preprocessor` to create a nlp text correction pipeline.
 
         Args:
             model (BartForTextErrorCorrection): A model instance, or a model local dir, or a model id in the model hub.
             preprocessor (TextErrorCorrectionPreprocessor): An optional preprocessor instance.
             kwargs (dict, `optional`):
                 Extra kwargs passed into the preprocessor's constructor.
-        Example:
-        >>> from modelscope.pipelines import pipeline
-        >>> pipeline_ins = pipeline(
-        >>>    task='text-error-correction', model='damo/nlp_bart_text-error-correction_chinese')
-        >>> sentence1 = '随着中国经济突飞猛近，建造工业与日俱增'
-        >>> print(pipeline_ins(sentence1))
+
+        Examples:
+            >>> from modelscope.pipelines import pipeline
+            >>> pipeline_ins = pipeline(
+            >>>    task='text-error-correction', model='damo/nlp_bart_text-error-correction_chinese')
+            >>> sentence1 = '随着中国经济突飞猛近，建造工业与日俱增'
+            >>> print(pipeline_ins(sentence1))
 
         To view other examples plese check tests/pipelines/test_text_error_correction.py.
         """
@@ -49,6 +51,8 @@ class TextErrorCorrectionPipeline(Pipeline):
             config_file=config_file,
             device=device,
             auto_collate=auto_collate)
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
         if preprocessor is None:
             self.preprocessor = Preprocessor.from_pretrained(
                 self.model.model_dir, **kwargs)
@@ -64,16 +68,13 @@ class TextErrorCorrectionPipeline(Pipeline):
         """
         Args:
             inputs (Dict[str, Tensor])
-            Example:
+            Examples:
                 {
                     'predictions': Tensor([1377, 4959, 2785, 6392...]), # tokens need to be decode by tokenizer
                 }
         Returns:
-            Dict[str, str]
-            Example:
-            {
-                'output': '随着中国经济突飞猛进，建造工业与日俱增'
-            }
+            Dict[str, str]: which contains following:
+                - 'output': output str, for example '随着中国经济突飞猛进，建造工业与日俱增'
 
         """
 
diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py
index 16e871ab..61f3a421 100644
--- a/modelscope/pipelines/nlp/text_generation_pipeline.py
+++ b/modelscope/pipelines/nlp/text_generation_pipeline.py
@@ -13,7 +13,7 @@ from modelscope.pipelines.base import Pipeline, Tensor
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import Preprocessor
 from modelscope.utils.chinese_utils import remove_space_between_chinese_chars
-from modelscope.utils.constant import Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.hub import Config, read_config
 
 __all__ = ['TextGenerationPipeline', 'TextGenerationT5Pipeline']
@@ -41,7 +41,7 @@ class TextGenerationPipeline(Pipeline):
             kwargs (dict, `optional`):
                 Extra kwargs passed into the preprocessor's constructor.
 
-            Example:
+        Examples:
             >>> from modelscope.pipelines import pipeline
             >>> pipeline_ins = pipeline(task='text-generation',
             >>>    model='damo/nlp_palm2.0_text-generation_chinese-base')
@@ -60,6 +60,8 @@ class TextGenerationPipeline(Pipeline):
             device=device,
             auto_collate=auto_collate)
 
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
         if preprocessor is None:
             self.preprocessor = Preprocessor.from_pretrained(
                 self.model.model_dir, first_sequence=first_sequence, **kwargs)
diff --git a/modelscope/pipelines/nlp/text_ranking_pipeline.py b/modelscope/pipelines/nlp/text_ranking_pipeline.py
index dfd0d433..1b313acc 100644
--- a/modelscope/pipelines/nlp/text_ranking_pipeline.py
+++ b/modelscope/pipelines/nlp/text_ranking_pipeline.py
@@ -11,7 +11,7 @@ from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import (Preprocessor,
                                       TextRankingTransformersPreprocessor)
-from modelscope.utils.constant import Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 
 __all__ = ['TextRankingPipeline']
 
@@ -45,6 +45,9 @@ class TextRankingPipeline(Pipeline):
             device=device,
             auto_collate=auto_collate)
 
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
+
         if preprocessor is None:
             self.preprocessor = Preprocessor.from_pretrained(
                 self.model.model_dir,
diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py
index 9926ee78..daa4823c 100644
--- a/modelscope/pipelines/nlp/token_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/token_classification_pipeline.py
@@ -11,7 +11,7 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import Preprocessor
-from modelscope.utils.constant import Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.tensor_utils import (torch_nested_detach,
                                            torch_nested_numpify)
 
@@ -53,6 +53,9 @@ class TokenClassificationPipeline(Pipeline):
             device=device,
             auto_collate=auto_collate)
 
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
+
         if preprocessor is None:
             self.preprocessor = Preprocessor.from_pretrained(
                 self.model.model_dir,
diff --git a/modelscope/pipelines/nlp/translation_evaluation_pipeline.py b/modelscope/pipelines/nlp/translation_evaluation_pipeline.py
index 0bd1ce9c..1f8ba79a 100644
--- a/modelscope/pipelines/nlp/translation_evaluation_pipeline.py
+++ b/modelscope/pipelines/nlp/translation_evaluation_pipeline.py
@@ -32,6 +32,7 @@ class TranslationEvaluationPipeline(Pipeline):
                  model: InputModel,
                  preprocessor: Optional[Preprocessor] = None,
                  eval_mode: EvaluationMode = EvaluationMode.SRC_REF,
+                 device: str = 'gpu',
                  **kwargs):
         r"""Build a translation pipeline with a model dir or a model id in the model hub.
 
@@ -45,13 +46,16 @@ class TranslationEvaluationPipeline(Pipeline):
 
         self.eval_mode = eval_mode
         self.checking_eval_mode()
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
 
         self.preprocessor = TranslationEvaluationPreprocessor(
             self.model.model_dir,
             self.eval_mode) if preprocessor is None else preprocessor
 
         self.model.load_checkpoint(
-            osp.join(self.model.model_dir, ModelFile.TORCH_MODEL_BIN_FILE))
+            osp.join(self.model.model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
+            self.device)
         self.model.eval()
 
         return
diff --git a/modelscope/pipelines/nlp/translation_pipeline.py b/modelscope/pipelines/nlp/translation_pipeline.py
index 68a03631..8750cd3b 100644
--- a/modelscope/pipelines/nlp/translation_pipeline.py
+++ b/modelscope/pipelines/nlp/translation_pipeline.py
@@ -38,10 +38,13 @@ class TranslationPipeline(Pipeline):
             model: A Model instance.
         """
         super().__init__(model=model, **kwargs)
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
+
         model = self.model.model_dir
         tf.reset_default_graph()
 
-        model_path = osp.join(
+        self.model_path = osp.join(
             osp.join(model, ModelFile.TF_CHECKPOINT_FOLDER), 'ckpt-0')
 
         self.cfg = Config.from_file(osp.join(model, ModelFile.CONFIGURATION))
@@ -85,26 +88,39 @@ class TranslationPipeline(Pipeline):
         self.output.update(output)
 
         with self._session.as_default() as sess:
-            logger.info(f'loading model from {model_path}')
+            logger.info(f'loading model from {self.model_path}')
             # load model
-            model_loader = tf.train.Saver(tf.global_variables())
-            model_loader.restore(sess, model_path)
+            self.model_loader = tf.train.Saver(tf.global_variables())
+            self.model_loader.restore(sess, self.model_path)
 
     def preprocess(self, input: str) -> Dict[str, Any]:
-        if self._src_lang == 'zh':
-            input_tok = self._tok.cut(input)
-            input_tok = ' '.join(list(input_tok))
-        else:
-            input = self._punct_normalizer.normalize(input)
-            input_tok = self._tok.tokenize(
-                input, return_str=True, aggressive_dash_splits=True)
+        input = input.split('<SENT_SPLIT>')
 
-        input_bpe = self._bpe.process_line(input_tok)
+        if self._src_lang == 'zh':
+            input_tok = [self._tok.cut(item) for item in input]
+            input_tok = [' '.join(list(item)) for item in input_tok]
+        else:
+            input = [self._punct_normalizer.normalize(item) for item in input]
+            aggressive_dash_splits = True
+            if (self._src_lang in ['es', 'fr'] and self._tgt_lang == 'en') or (
+                    self._src_lang == 'en' and self._tgt_lang in ['es', 'fr']):
+                aggressive_dash_splits = False
+            input_tok = [
+                self._tok.tokenize(
+                    item,
+                    return_str=True,
+                    aggressive_dash_splits=aggressive_dash_splits)
+                for item in input
+            ]
+
+        input_bpe = [
+            self._bpe.process_line(item).strip().split() for item in input_tok
+        ]
+        MAX_LENGTH = max([len(item) for item in input_bpe])
         input_ids = np.array([[
-            self._src_vocab[w]
-            if w in self._src_vocab else self.cfg['model']['src_vocab_size']
-            for w in input_bpe.strip().split()
-        ]])
+            self._src_vocab[w] if w in self._src_vocab else
+            self.cfg['model']['src_vocab_size'] - 1 for w in item
+        ] + [0] * (MAX_LENGTH - len(item)) for item in input_bpe])
         result = {'input_ids': input_ids}
         return result
 
@@ -115,13 +131,18 @@ class TranslationPipeline(Pipeline):
             return sess_outputs
 
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
-        output_seqs = inputs['output_seqs'][0]
-        wids = list(output_seqs[0]) + [0]
-        wids = wids[:wids.index(0)]
-        translation_out = ' '.join([
-            self._trg_rvocab[wid] if wid in self._trg_rvocab else '<unk>'
-            for wid in wids
-        ]).replace('@@ ', '').replace('@@', '')
-        translation_out = self._detok.detokenize(translation_out.split())
+        x, y, z = inputs['output_seqs'].shape
+
+        translation_out = []
+        for i in range(x):
+            output_seqs = inputs['output_seqs'][i]
+            wids = list(output_seqs[0]) + [0]
+            wids = wids[:wids.index(0)]
+            translation = ' '.join([
+                self._trg_rvocab[wid] if wid in self._trg_rvocab else '<unk>'
+                for wid in wids
+            ]).replace('@@ ', '').replace('@@', '')
+            translation_out.append(self._detok.detokenize(translation.split()))
+        translation_out = '<SENT_SPLIT>'.join(translation_out)
         result = {OutputKeys.TRANSLATION: translation_out}
         return result
diff --git a/modelscope/pipelines/nlp/user_satisfaction_estimation_pipeline.py b/modelscope/pipelines/nlp/user_satisfaction_estimation_pipeline.py
index 6abfca8b..fc55dc84 100644
--- a/modelscope/pipelines/nlp/user_satisfaction_estimation_pipeline.py
+++ b/modelscope/pipelines/nlp/user_satisfaction_estimation_pipeline.py
@@ -36,7 +36,7 @@ class UserSatisfactionEstimationPipeline(Pipeline):
             device (str): device str, should be either cpu, cuda, gpu, gpu:X or cuda:X
             auto_collate (bool): automatically to convert data to tensor or not.
 
-            Example:
+        Examples:
             >>> from modelscope.pipelines import pipeline
             >>> pipeline_ins = pipeline('text-classification',
                 model='damo/nlp_user-satisfaction-estimation_chinese')
diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
index ee49d9a5..aa6cb824 100644
--- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py
+++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
@@ -13,7 +13,7 @@ from modelscope.pipelines.nlp import TokenClassificationPipeline
 from modelscope.preprocessors import (
     Preprocessor, TokenClassificationTransformersPreprocessor,
     WordSegmentationPreprocessorThai)
-from modelscope.utils.constant import Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.tensor_utils import (torch_nested_detach,
                                            torch_nested_numpify)
 
@@ -28,12 +28,12 @@ class WordSegmentationPipeline(TokenClassificationPipeline):
     NOTE: The preprocessor will first split the sentence into single characters,
     then feed them into the tokenizer with the parameter is_split_into_words=True.
 
-    Example:
-    >>> from modelscope.pipelines import pipeline
-    >>> pipeline_ins = pipeline(task='word-segmentation',
-    >>>    model='damo/nlp_structbert_word-segmentation_chinese-base')
-    >>> sentence1 = '今天天气不错，适合出去游玩'
-    >>> print(pipeline_ins(sentence1))
+    Examples:
+        >>> from modelscope.pipelines import pipeline
+        >>> pipeline_ins = pipeline(task='word-segmentation',
+        >>>    model='damo/nlp_structbert_word-segmentation_chinese-base')
+        >>> sentence1 = '今天天气不错，适合出去游玩'
+        >>> print(pipeline_ins(sentence1))
 
     To view other examples plese check tests/pipelines/test_word_segmentation.py.
     """
@@ -100,6 +100,10 @@ class WordSegmentationThaiPipeline(MultilingualWordSegmentationPipeline):
             config_file=config_file,
             device=device,
             auto_collate=auto_collate)
+
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
+
         if preprocessor is None:
             self.preprocessor = WordSegmentationPreprocessorThai(
                 self.model.model_dir,
diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
index 3db73d8b..5bc611fb 100644
--- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
@@ -11,7 +11,7 @@ from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import Preprocessor
-from modelscope.utils.constant import Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 
 __all__ = ['ZeroShotClassificationPipeline']
 
@@ -50,7 +50,7 @@ class ZeroShotClassificationPipeline(Pipeline):
             kwargs (dict, `optional`):
                 Extra kwargs passed into the preprocessor's constructor.
 
-            Example:
+        Examples:
             >>> from modelscope.pipelines import pipeline
             >>> pipeline_ins = pipeline(task='zero-shot-classification',
             >>>    model='damo/nlp_structbert_zero-shot-classification_chinese-base')
@@ -69,6 +69,10 @@ class ZeroShotClassificationPipeline(Pipeline):
             auto_collate=auto_collate)
         self.entailment_id = 0
         self.contradiction_id = 2
+
+        assert isinstance(self.model, Model), \
+            f'please check whether model config exists in {ModelFile.CONFIGURATION}'
+
         if preprocessor is None:
             sequence_length = kwargs.pop('sequence_length', 512)
             self.preprocessor = Preprocessor.from_pretrained(
diff --git a/modelscope/pipelines/science/protein_structure_pipeline.py b/modelscope/pipelines/science/protein_structure_pipeline.py
index f5056c9a..53622753 100644
--- a/modelscope/pipelines/science/protein_structure_pipeline.py
+++ b/modelscope/pipelines/science/protein_structure_pipeline.py
@@ -109,7 +109,7 @@ class ProteinStructurePipeline(Pipeline):
             preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for
             the model if supplied.
 
-            Example:
+        Examples:
             >>> from modelscope.pipelines import pipeline
             >>> pipeline_ins = pipeline(task='protein-structure',
             >>>    model='DPTech/uni-fold-monomer')
diff --git a/modelscope/pipelines/util.py b/modelscope/pipelines/util.py
index fbbf4084..a2a3ed2b 100644
--- a/modelscope/pipelines/util.py
+++ b/modelscope/pipelines/util.py
@@ -2,8 +2,6 @@
 import os.path as osp
 from typing import List, Optional, Union
 
-import torch
-
 from modelscope.hub.api import HubApi
 from modelscope.hub.file_download import model_file_download
 from modelscope.utils.config import Config
@@ -86,6 +84,7 @@ def is_model(path: Union[str, List]):
 
 
 def batch_process(model, data):
+    import torch
     if model.__class__.__name__ == 'OfaForAllTasks':
         # collate batch data due to the nested data structure
         assert isinstance(data, list)
diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index df081036..3a13828b 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -13,7 +13,8 @@ if TYPE_CHECKING:
                         ImageColorEnhanceFinetunePreprocessor,
                         ImageInstanceSegmentationPreprocessor,
                         ImageDenoisePreprocessor, ImageDeblurPreprocessor)
-    from .cv import (ImageClassificationMmcvPreprocessor)
+    from .cv import (ImageClassificationMmcvPreprocessor,
+                     ImageRestorationPreprocessor)
     from .kws import WavToLists
     from .tts import KanttsDataPreprocessor
     from .multi_modal import (OfaPreprocessor, MPlugPreprocessor,
@@ -39,7 +40,10 @@ if TYPE_CHECKING:
         TableQuestionAnsweringPreprocessor, NERPreprocessorViet,
         NERPreprocessorThai, WordSegmentationPreprocessorThai,
         TranslationEvaluationPreprocessor,
-        DialogueClassificationUsePreprocessor)
+        DialogueClassificationUsePreprocessor, SiameseUiePreprocessor,
+        DocumentGroundedDialogGeneratePreprocessor,
+        DocumentGroundedDialogRetrievalPreprocessor,
+        DocumentGroundedDialogRerankPreprocessor)
     from .video import ReadVideoData, MovieSceneSegmentationPreprocessor
 
 else:
@@ -55,7 +59,10 @@ else:
             'ImageInstanceSegmentationPreprocessor',
             'ImageDenoisePreprocessor', 'ImageDeblurPreprocessor'
         ],
-        'cv': ['ImageClassificationMmcvPreprocessor'],
+        'cv': [
+            'ImageClassificationMmcvPreprocessor',
+            'ImageRestorationPreprocessor'
+        ],
         'kws': ['WavToLists'],
         'tts': ['KanttsDataPreprocessor'],
         'multi_modal':
@@ -85,7 +92,11 @@ else:
             'ConversationalTextToSqlPreprocessor',
             'TableQuestionAnsweringPreprocessor',
             'TranslationEvaluationPreprocessor',
-            'DialogueClassificationUsePreprocessor'
+            'DialogueClassificationUsePreprocessor', 'SiameseUiePreprocessor',
+            'DialogueClassificationUsePreprocessor',
+            'DocumentGroundedDialogGeneratePreprocessor',
+            'DocumentGroundedDialogRetrievalPreprocessor',
+            'DocumentGroundedDialogRerankPreprocessor'
         ],
     }
 
diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py
index 4161c4b1..2db804eb 100644
--- a/modelscope/preprocessors/base.py
+++ b/modelscope/preprocessors/base.py
@@ -4,8 +4,7 @@ from abc import ABC, abstractmethod
 from copy import deepcopy
 from typing import Any, Callable, Dict, Optional, Sequence, Union
 
-from modelscope.metainfo import Models, Preprocessors
-from modelscope.utils.checkpoint import save_configuration
+from modelscope.metainfo import Models, Preprocessors, TaskModels
 from modelscope.utils.config import Config, ConfigDict
 from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Invoke,
                                        ModeKeys, Tasks)
@@ -106,6 +105,14 @@ PREPROCESSOR_MAP = {
     (Models.structbert, Tasks.word_segmentation):
     Preprocessors.token_cls_tokenizer,
 
+    # doc2bot
+    (Models.doc2bot, Tasks.document_grounded_dialog_generate):
+    Preprocessors.document_grounded_dialog_generate,
+    (Models.doc2bot, Tasks.document_grounded_dialog_rerank):
+    Preprocessors.document_grounded_dialog_rerank,
+    (Models.doc2bot, Tasks.document_grounded_dialog_retrieval):
+    Preprocessors.document_grounded_dialog_retrieval,
+
     # veco
     (Models.veco, Tasks.backbone):
     Preprocessors.sen_cls_tokenizer,
@@ -120,7 +127,7 @@ PREPROCESSOR_MAP = {
     (Models.veco, Tasks.sentence_similarity):
     Preprocessors.sen_cls_tokenizer,
 
-    # taskmodels
+    # ner models
     (Models.lcrf, Tasks.named_entity_recognition):
     Preprocessors.sequence_labeling_tokenizer,
     (Models.lcrf, Tasks.word_segmentation):
@@ -134,6 +141,26 @@ PREPROCESSOR_MAP = {
     (Models.tcrf, Tasks.named_entity_recognition):
     Preprocessors.sequence_labeling_tokenizer,
 
+    # task models
+    (TaskModels.token_classification, Tasks.token_classification):
+    Preprocessors.sequence_labeling_tokenizer,
+    (TaskModels.token_classification, Tasks.part_of_speech):
+    Preprocessors.sequence_labeling_tokenizer,
+    (TaskModels.token_classification, Tasks.named_entity_recognition):
+    Preprocessors.sequence_labeling_tokenizer,
+    (TaskModels.text_classification, Tasks.text_classification):
+    Preprocessors.sen_cls_tokenizer,
+    (TaskModels.fill_mask, Tasks.fill_mask):
+    Preprocessors.fill_mask,
+    (TaskModels.feature_extraction, Tasks.feature_extraction):
+    Preprocessors.feature_extraction,
+    (TaskModels.information_extraction, Tasks.information_extraction):
+    Preprocessors.re_tokenizer,
+    (TaskModels.text_ranking, Tasks.text_ranking):
+    Preprocessors.text_ranking,
+    (TaskModels.text_generation, Tasks.text_generation):
+    Preprocessors.text_gen_tokenizer,
+
     # cv
     (Models.tinynas_detection, Tasks.image_object_detection):
     Preprocessors.object_detection_tinynas_preprocessor,
@@ -275,7 +302,6 @@ class Preprocessor(ABC):
                 # TODO: for Sequence, need adapt to `mode` and `mode_dir` args,
                 # and add mode for Compose or other plans
                 raise NotImplementedError('Not supported yet!')
-            sub_cfg = deepcopy(sub_cfg)
 
             preprocessor = build_preprocessor(sub_cfg, field_name)
         else:
@@ -306,13 +332,13 @@ class Preprocessor(ABC):
         preprocessor.mode = preprocessor_mode
         sub_cfg.pop('model_dir', None)
         if not hasattr(preprocessor, 'cfg'):
-            preprocessor.cfg = sub_cfg
+            preprocessor.cfg = cfg
         return preprocessor
 
     def save_pretrained(self,
                         target_folder: Union[str, os.PathLike],
                         config: Optional[dict] = None,
-                        save_config_function: Callable = save_configuration):
+                        save_config_function: Callable = None):
         """Save the preprocessor, its configuration and other related files to a directory,
             so that it can be re-loaded
 
@@ -341,4 +367,7 @@ class Preprocessor(ABC):
                         'preprocessor']['val']:
                     config['preprocessor']['val']['mode'] = 'inference'
 
+            if save_config_function is None:
+                from modelscope.utils.checkpoint import save_configuration
+                save_config_function = save_configuration
             save_config_function(target_folder, config)
diff --git a/modelscope/preprocessors/cv/__init__.py b/modelscope/preprocessors/cv/__init__.py
index 21324ed7..f49cb722 100644
--- a/modelscope/preprocessors/cv/__init__.py
+++ b/modelscope/preprocessors/cv/__init__.py
@@ -8,11 +8,19 @@ if TYPE_CHECKING:
     from .video_stabilization import (stabilization_preprocessor)
     from .mmcls_preprocessor import ImageClassificationMmcvPreprocessor
 
+    from .image_quality_assessment_mos import ImageQualityAssessmentMosPreprocessor
+    from .image_restoration_preprocessor import ImageRestorationPreprocessor
+    from .bad_image_detecting_preprocessor import BadImageDetectingPreprocessor
+
 else:
     _import_structure = {
         'video_super_resolution': ['VideoReader'],
         'video_stabilization': ['stabilization_preprocessor'],
         'mmcls_preprocessor': ['ImageClassificationMmcvPreprocessor'],
+        'image_quality_assessment_mos':
+        ['ImageQualityAssessmentMosPreprocessor'],
+        'image_restoration_preprocessor': ['ImageRestorationPreprocessor'],
+        'bad_image_detecting_preprocessor': ['BadImageDetectingPreprocessor'],
     }
 
     import sys
diff --git a/modelscope/preprocessors/cv/bad_image_detecting_preprocessor.py b/modelscope/preprocessors/cv/bad_image_detecting_preprocessor.py
new file mode 100644
index 00000000..14db48ac
--- /dev/null
+++ b/modelscope/preprocessors/cv/bad_image_detecting_preprocessor.py
@@ -0,0 +1,35 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import math
+from typing import Any, Dict
+
+import torch
+import torch.nn.functional as F
+from numpy import ndarray
+from PIL import Image
+from torchvision import transforms
+
+from modelscope.metainfo import Preprocessors
+from modelscope.preprocessors.base import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
+from modelscope.utils.constant import Fields
+from modelscope.utils.type_assert import type_assert
+
+
+@PREPROCESSORS.register_module(
+    Fields.cv, module_name=Preprocessors.bad_image_detecting_preprocessor)
+class BadImageDetectingPreprocessor(Preprocessor):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.transform_input = transforms.Compose([
+            transforms.Resize(224),
+            transforms.CenterCrop(224),
+            transforms.ToTensor()
+        ])
+
+    @type_assert(object, object)
+    def __call__(self, data: ndarray) -> Dict[str, Any]:
+        image = Image.fromarray(data)
+        data = self.transform_input(image)
+        data = data.unsqueeze(0)
+        return {'input': data.float()}
diff --git a/modelscope/preprocessors/cv/image_quality_assessment_mos.py b/modelscope/preprocessors/cv/image_quality_assessment_mos.py
new file mode 100644
index 00000000..f752b97b
--- /dev/null
+++ b/modelscope/preprocessors/cv/image_quality_assessment_mos.py
@@ -0,0 +1,65 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import math
+from typing import Any, Dict, Union
+
+import cv2
+import numpy as np
+from torchvision import transforms
+
+from modelscope.metainfo import Preprocessors
+from modelscope.preprocessors import LoadImage
+from modelscope.preprocessors.base import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
+from modelscope.utils.constant import Fields
+from modelscope.utils.type_assert import type_assert
+
+
+@PREPROCESSORS.register_module(
+    Fields.cv,
+    module_name=Preprocessors.image_quality_assessment_mos_preprocessor)
+class ImageQualityAssessmentMosPreprocessor(Preprocessor):
+
+    def __init__(self, **kwargs):
+        """Preprocess the image for image quality assessment .
+        """
+        super().__init__(**kwargs)
+
+    def preprocessors(self, input):
+        img = LoadImage.convert_to_ndarray(input)
+        sub_img_dim = (720, 1280)
+        resize_dim = (1080, 1920)
+        h, w = img.shape[:2]
+
+        resize_h, resize_w = resize_dim
+        sub_img_h, sub_img_w = sub_img_dim
+        flag = False
+        if (w - h) * (resize_w - resize_h) < 0:
+            flag = True
+            resize_w, resize_h = resize_h, resize_w
+            sub_img_w, sub_img_h = sub_img_h, sub_img_w
+
+        # 注意只能等比例缩放
+        w_scale = resize_w / w
+        h_scale = resize_h / h
+        scale = max(h_scale, w_scale)
+        img = cv2.resize(
+            img, (int(math.ceil(scale * w)), int(math.ceil(scale * h))),
+            interpolation=cv2.INTER_CUBIC)
+        h, w = img.shape[:2]
+        h_i = (h - sub_img_h) // 2
+        w_i = (w - sub_img_w) // 2
+        img = img[h_i:h_i + sub_img_h, w_i:w_i + sub_img_w, :]
+
+        if flag:
+            img = np.rot90(img)
+        img = LoadImage.convert_to_img(img)
+        test_transforms = transforms.Compose([transforms.ToTensor()])
+        img = test_transforms(img)
+        return img
+
+    @type_assert(object, object)
+    def __call__(self, input) -> Dict[str, Any]:
+        data = self.preprocessors(input)
+        ret = {'input': data.unsqueeze(0)}
+        return ret
diff --git a/modelscope/preprocessors/cv/image_restoration_preprocessor.py b/modelscope/preprocessors/cv/image_restoration_preprocessor.py
new file mode 100644
index 00000000..846f1fbe
--- /dev/null
+++ b/modelscope/preprocessors/cv/image_restoration_preprocessor.py
@@ -0,0 +1,82 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import math
+from typing import Any, Dict
+
+import torch
+import torch.nn.functional as F
+from numpy import ndarray
+from PIL import Image
+from torchvision import transforms
+
+from modelscope.metainfo import Preprocessors
+from modelscope.preprocessors.base import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
+from modelscope.utils.constant import Fields
+from modelscope.utils.type_assert import type_assert
+
+
+@PREPROCESSORS.register_module(
+    Fields.cv, module_name=Preprocessors.image_demoire_preprocessor)
+class ImageRestorationPreprocessor(Preprocessor):
+
+    def __init__(self, pad_32, min_max_l, **kwargs):
+        super().__init__(**kwargs)
+
+        self.pad_32 = pad_32
+        self.min_max_l = min_max_l
+        self.transform_input = transforms.Compose([transforms.ToTensor()])
+
+    def img_pad_3(self, x, w_pad, h_pad, w_odd_pad, h_odd_pad):
+        x1 = F.pad(
+            x[0:1, ...], (w_pad, w_odd_pad, h_pad, h_odd_pad), value=0.3827)
+        x2 = F.pad(
+            x[1:2, ...], (w_pad, w_odd_pad, h_pad, h_odd_pad), value=0.4141)
+        x3 = F.pad(
+            x[2:3, ...], (w_pad, w_odd_pad, h_pad, h_odd_pad), value=0.3912)
+        y = torch.cat([x1, x2, x3], dim=0)
+        return y
+
+    @type_assert(object, object)
+    def __call__(self, data: ndarray) -> Dict[str, Any]:
+        image = Image.fromarray(data)
+        img_w, img_h = image.size
+        min_wh = min(img_w, img_h)
+        # set min_max_l is 3072 avoid gpu oom(16G)
+        if min_wh > self.min_max_l:
+            fscale = self.min_max_l / min_wh
+            img_w_n = int(img_w * fscale)
+            img_h_n = int(img_h * fscale)
+            img_w_n = math.ceil(img_w_n / 32) * 32
+            img_h_n = math.ceil(img_h_n / 32) * 32
+            image = image.resize((img_w_n, img_h_n))
+        data = self.transform_input(image)
+        h_pad = 0
+        h_odd_pad = 0
+        w_pad = 0
+        w_odd_pad = 0
+        if self.pad_32:
+            c, h, w = data.size()
+            # pad image such that the resolution is a multiple of 32
+            w_pad = (math.ceil(w / 32) * 32 - w) // 2
+            h_pad = (math.ceil(h / 32) * 32 - h) // 2
+            w_odd_pad = w_pad
+            h_odd_pad = h_pad
+            if w % 2 == 1:
+                w_odd_pad += 1
+            if h % 2 == 1:
+                h_odd_pad += 1
+            data = self.img_pad_3(
+                data,
+                w_pad=w_pad,
+                h_pad=h_pad,
+                w_odd_pad=w_odd_pad,
+                h_odd_pad=h_odd_pad)
+
+        data = data.unsqueeze(0)
+        return {
+            'img': data.float(),
+            'h_pad': h_pad,
+            'h_odd_pad': h_odd_pad,
+            'w_pad': w_pad,
+            'w_odd_pad': w_odd_pad
+        }
diff --git a/modelscope/preprocessors/cv/timer.py b/modelscope/preprocessors/cv/timer.py
index 90d56f9a..dce2e93f 100644
--- a/modelscope/preprocessors/cv/timer.py
+++ b/modelscope/preprocessors/cv/timer.py
@@ -13,7 +13,8 @@ class TimerError(Exception):
 
 class Timer:
     """A flexible Timer class.
-    :Example:
+    Example:
+
     >>> import time
     >>> import mmcv
     >>> with mmcv.Timer():
@@ -90,7 +91,8 @@ def check_time(timer_id):
     """Add check points in a single line.
     This method is suitable for running a task on a list of items. A timer will
     be registered when the method is called for the first time.
-    :Example:
+    Example:
+
     >>> import time
     >>> import mmcv
     >>> for i in range(1, 6):
diff --git a/modelscope/preprocessors/cv/video_super_resolution.py b/modelscope/preprocessors/cv/video_super_resolution.py
index f7fbbc32..2a7d02a3 100644
--- a/modelscope/preprocessors/cv/video_super_resolution.py
+++ b/modelscope/preprocessors/cv/video_super_resolution.py
@@ -49,7 +49,8 @@ class VideoReader:
     Cache is used when decoding videos. So if the same frame is visited for
     the second time, there is no need to decode again if it is stored in the
     cache.
-    :Example:
+    Example:
+
     >>> import mmcv
     >>> v = mmcv.VideoReader('sample.mp4')
     >>> len(v)  # get the total frame number with `len()`
diff --git a/modelscope/preprocessors/image.py b/modelscope/preprocessors/image.py
index aca3023f..666d2b29 100644
--- a/modelscope/preprocessors/image.py
+++ b/modelscope/preprocessors/image.py
@@ -1,6 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import io
-import os
 from typing import Any, Dict, Union
 
 import cv2
@@ -12,7 +11,6 @@ from PIL import Image, ImageOps
 from modelscope.fileio import File
 from modelscope.metainfo import Preprocessors
 from modelscope.utils.constant import Fields
-from modelscope.utils.hub import read_config
 from modelscope.utils.type_assert import type_assert
 from .base import Preprocessor
 from .builder import PREPROCESSORS
diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py
index 4add627e..23fa9f94 100644
--- a/modelscope/preprocessors/nlp/__init__.py
+++ b/modelscope/preprocessors/nlp/__init__.py
@@ -30,6 +30,10 @@ if TYPE_CHECKING:
     from .mglm_summarization_preprocessor import MGLMSummarizationPreprocessor
     from .translation_evaluation_preprocessor import TranslationEvaluationPreprocessor
     from .dialog_classification_use_preprocessor import DialogueClassificationUsePreprocessor
+    from .siamese_uie_preprocessor import SiameseUiePreprocessor
+    from .document_grounded_dialog_generate_preprocessor import DocumentGroundedDialogGeneratePreprocessor
+    from .document_grounded_dialog_retrieval_preprocessor import DocumentGroundedDialogRetrievalPreprocessor
+    from .document_grounded_dialog_retrieval_preprocessor import DocumentGroundedDialogRerankPreprocessor
 else:
     _import_structure = {
         'bert_seq_cls_tokenizer': ['Tokenize'],
@@ -83,7 +87,14 @@ else:
         'translation_evaluation_preprocessor':
         ['TranslationEvaluationPreprocessor'],
         'dialog_classification_use_preprocessor':
-        ['DialogueClassificationUsePreprocessor']
+        ['DialogueClassificationUsePreprocessor'],
+        'siamese_uie_preprocessor': ['SiameseUiePreprocessor'],
+        'document_grounded_dialog_generate_preprocessor':
+        ['DocumentGroundedDialogGeneratePreprocessor'],
+        'document_grounded_dialog_retrieval_preprocessor':
+        ['DocumentGroundedDialogRetrievalPreprocessor'],
+        'document_grounded_dialog_rerank_preprocessor':
+        ['DocumentGroundedDialogRerankPreprocessor']
     }
 
     import sys
diff --git a/modelscope/preprocessors/nlp/document_grounded_dialog_generate_preprocessor.py b/modelscope/preprocessors/nlp/document_grounded_dialog_generate_preprocessor.py
new file mode 100644
index 00000000..7b87eca3
--- /dev/null
+++ b/modelscope/preprocessors/nlp/document_grounded_dialog_generate_preprocessor.py
@@ -0,0 +1,101 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import Any, Dict
+
+import torch
+from transformers import MT5Tokenizer, XLMRobertaTokenizer
+
+from modelscope.metainfo import Preprocessors
+from modelscope.preprocessors import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import Fields, ModeKeys, ModelFile
+from modelscope.utils.type_assert import type_assert
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.document_grounded_dialog_generate)
+class DocumentGroundedDialogGeneratePreprocessor(Preprocessor):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """The preprocessor for DGDS generate task, based on transformers' tokenizer.
+
+        Args:
+            model_dir: The model dir containing the essential files to build the tokenizer.
+        """
+        super().__init__(*args, **kwargs)
+
+        self.model_dir: str = model_dir
+        self.config = Config.from_file(
+            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
+        self.device = 'cuda' \
+            if ('device' not in kwargs or kwargs['device'] == 'gpu') and torch.cuda.is_available() \
+            else 'cpu'
+
+        self.top_k = self.config['top_k']
+        self.query_sequence_length = self.config['query_sequence_length']
+        self.rerank_source_sequence_length = self.config[
+            'rerank_source_sequence_length']
+        self.source_sequence_length = self.config['source_sequence_length']
+        self.target_sequence_length = self.config['target_sequence_length']
+        self.rerank_tokenizer = XLMRobertaTokenizer.from_pretrained(
+            os.path.join(self.model_dir, 'rerank'))
+        self.generation_tokenizer = MT5Tokenizer.from_pretrained(
+            os.path.join(self.model_dir, 'generation'))
+
+    @type_assert(object, Dict)
+    def __call__(self,
+                 data: Dict[str, Any],
+                 invoke_mode=ModeKeys.INFERENCE,
+                 **preprocessor_param) -> Dict[str, Any]:
+        query, context, label = data['query'], data['context'], data.get(
+            'label', None)
+        query = [
+            self.generation_tokenizer.decode(
+                self.generation_tokenizer([x],
+                                          add_special_tokens=False,
+                                          return_tensors='pt')['input_ids'][0]
+                [:self.query_sequence_length]) for x in query
+        ]
+
+        querys = [x for x in query for i in range(self.top_k)]
+        contexts = [x for ctxs in context for x in ctxs[:self.top_k]]
+        assert len(querys) == len(contexts)
+        rerank_input_ids = self.rerank_tokenizer(
+            querys,
+            contexts,
+            add_special_tokens=True,
+            return_tensors='pt',
+            max_length=self.rerank_source_sequence_length,
+            padding='longest',
+            truncation=True)
+
+        generator_inputs = [
+            ' '.join([query[i], '<passage>', doc]) for i in range(len(query))
+            for doc in context[i][:self.top_k]
+        ]
+        inputs_tokenizer_outputs = self.generation_tokenizer.batch_encode_plus(
+            list(generator_inputs),
+            padding=True,
+            return_tensors='pt',
+            max_length=self.source_sequence_length,
+            truncation=True)
+
+        result = {
+            'rerank_input_ids': rerank_input_ids,
+            'input_ids': inputs_tokenizer_outputs.input_ids,
+            'attention_mask': inputs_tokenizer_outputs.attention_mask
+        }
+        if invoke_mode in (ModeKeys.TRAIN, ModeKeys.EVAL
+                           ) and invoke_mode != ModeKeys.INFERENCE:
+            result['label_ids'] = self.generation_tokenizer.batch_encode_plus(
+                list(label),
+                padding=True,
+                return_tensors='pt',
+                max_length=self.target_sequence_length,
+                truncation=True).input_ids
+
+        for k, v in result.items():
+            result[k] = v.to(self.device)
+
+        return result
diff --git a/modelscope/preprocessors/nlp/document_grounded_dialog_rerank_preprocessor.py b/modelscope/preprocessors/nlp/document_grounded_dialog_rerank_preprocessor.py
new file mode 100644
index 00000000..c1918a15
--- /dev/null
+++ b/modelscope/preprocessors/nlp/document_grounded_dialog_rerank_preprocessor.py
@@ -0,0 +1,111 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import copy
+import os
+from typing import Any, Dict
+
+import torch
+from transformers import XLMRobertaTokenizer
+
+from modelscope.metainfo import Preprocessors
+from modelscope.preprocessors import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
+from modelscope.utils.constant import Fields, ModeKeys, ModelFile
+from modelscope.utils.type_assert import type_assert
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.document_grounded_dialog_rerank)
+class DocumentGroundedDialogRerankPreprocessor(Preprocessor):
+
+    def __init__(self, model_dir: str, **kwargs):
+        """The preprocessor for DGDS rerank task, based on transformers' tokenizer.
+
+        Args:
+            model_dir: The model dir containing the essential files to build the tokenizer.
+        """
+        super().__init__()
+
+        self.model_dir = model_dir
+        self.device = 'cuda' \
+            if ('device' not in kwargs or kwargs['device'] == 'gpu') and torch.cuda.is_available() \
+            else 'cpu'
+        self.query_length = kwargs['query_length']
+        self.max_seq_length = kwargs['max_seq_length']
+        self.tokenizer = XLMRobertaTokenizer.from_pretrained(self.model_dir)
+        if kwargs['tokenizer_resize']:
+            special_tokens = [
+                '<last_turn>', '<user>', '<agent>', '<response>', '<passage>'
+            ]
+            self.tokenizer.add_tokens(special_tokens)
+
+    @type_assert(object, Dict)
+    def __call__(self, data: Dict[str, Any],
+                 **preprocessor_param) -> Dict[str, Any]:
+        if 'query' not in data:
+            query = data['input']
+            passages = data['passages']
+            ids = data['id']
+            output = data['output']
+            positive_pids = data['positive_pids']
+            preprocess_output_list = []
+            for index in range(len(query)):
+                now_query = query[index]
+                now_passages = eval(passages[index])
+                now_id = ids[index]
+                now_output = eval(output[index])
+                now_positive_pids = eval(positive_pids[index])
+                # query
+                query_ids = self.tokenizer(
+                    [now_query], add_special_tokens=False,
+                    return_tensors='pt')['input_ids'][0][:self.query_length]
+                now_query = self.tokenizer.decode(query_ids)
+                # passage
+                texts_b = []
+                for p in now_passages:
+                    texts_b.append(' '.join(
+                        [now_query, '<passage>', p['text']]))
+                passages_input = self.tokenizer(
+                    texts_b,
+                    add_special_tokens=True,
+                    return_tensors='pt',
+                    max_length=self.max_seq_length,
+                    padding='longest',
+                    truncation=True)
+                preprocess_output_list.append({
+                    'input_ids':
+                    passages_input['input_ids'].to(self.device),
+                    'attention_mask':
+                    passages_input['attention_mask'].to(self.device),
+                    'id':
+                    now_id,
+                    'output':
+                    now_output,
+                    'positive_pids':
+                    now_positive_pids,
+                    'passages':
+                    now_passages,
+                    'query':
+                    now_query
+                })
+            return preprocess_output_list
+        else:
+            query = data['query']
+            passages = data['passages']
+            # query
+            query_ids = self.tokenizer(
+                [query], add_special_tokens=False,
+                return_tensors='pt')['input_ids'][0][:self.query_length]
+            query = self.tokenizer.decode(query_ids)
+            # passage
+            texts_b = []
+            for p in passages:
+                texts_b.append(' '.join([query, '<passage>', p['text']]))
+            passages_input = self.tokenizer(
+                texts_b,
+                add_special_tokens=True,
+                return_tensors='pt',
+                max_length=self.max_seq_length,
+                padding='longest',
+                truncation=True)
+            result = {n: t.to(self.device) for n, t in passages_input.items()}
+        return result
diff --git a/modelscope/preprocessors/nlp/document_grounded_dialog_retrieval_preprocessor.py b/modelscope/preprocessors/nlp/document_grounded_dialog_retrieval_preprocessor.py
new file mode 100644
index 00000000..b6cc1ba4
--- /dev/null
+++ b/modelscope/preprocessors/nlp/document_grounded_dialog_retrieval_preprocessor.py
@@ -0,0 +1,102 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import Any, Dict
+
+import torch
+from transformers import XLMRobertaTokenizer
+
+from modelscope.metainfo import Preprocessors
+from modelscope.preprocessors import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import Fields, ModeKeys, ModelFile
+from modelscope.utils.type_assert import type_assert
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.document_grounded_dialog_retrieval)
+class DocumentGroundedDialogRetrievalPreprocessor(Preprocessor):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """The preprocessor for DGDS retrieval task, based on transformers' tokenizer.
+
+        Args:
+            model_dir: The model dir containing the essential files to build the tokenizer.
+        """
+        super().__init__(*args, **kwargs)
+
+        self.model_dir: str = model_dir
+        self.config = Config.from_file(
+            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
+        self.device = 'cuda' \
+            if ('device' not in kwargs or kwargs['device'] == 'gpu') and torch.cuda.is_available() \
+            else 'cpu'
+        self.query_sequence_length = self.config['query_sequence_length']
+        self.context_sequence_length = self.config['context_sequence_length']
+        self.tokenizer = XLMRobertaTokenizer.from_pretrained(
+            os.path.join(self.model_dir))
+
+    @type_assert(object, Dict)
+    def __call__(self,
+                 data: Dict[str, Any],
+                 invoke_mode=ModeKeys.INFERENCE,
+                 input_type='query',
+                 **preprocessor_param) -> Dict[str, Any]:
+        if invoke_mode in (ModeKeys.TRAIN, ModeKeys.EVAL
+                           ) and invoke_mode != ModeKeys.INFERENCE:
+            query, positive, negative = data['query'], data['positive'], data[
+                'negative']
+
+            query_tokenizer_outputs = self.tokenizer.batch_encode_plus(
+                query,
+                padding=True,
+                return_tensors='pt',
+                max_length=self.query_sequence_length,
+                truncation=True)
+
+            context_tokenizer_outputs = self.tokenizer.batch_encode_plus(
+                positive + negative,
+                padding=True,
+                return_tensors='pt',
+                max_length=self.context_sequence_length,
+                truncation=True)
+
+            result = {
+                'query_input_ids': query_tokenizer_outputs.input_ids,
+                'query_attention_mask': query_tokenizer_outputs.attention_mask,
+                'context_input_ids': context_tokenizer_outputs.input_ids,
+                'context_attention_mask':
+                context_tokenizer_outputs.attention_mask,
+                'labels':
+                torch.tensor(list(range(len(query))), dtype=torch.long)
+            }
+        elif input_type == 'query':
+            query = data['query']
+            query_tokenizer_outputs = self.tokenizer.batch_encode_plus(
+                query,
+                padding=True,
+                return_tensors='pt',
+                max_length=self.query_sequence_length,
+                truncation=True)
+            result = {
+                'query_input_ids': query_tokenizer_outputs.input_ids,
+                'query_attention_mask': query_tokenizer_outputs.attention_mask,
+            }
+        else:
+            context = data['context']
+            context_tokenizer_outputs = self.tokenizer.batch_encode_plus(
+                context,
+                padding=True,
+                return_tensors='pt',
+                max_length=self.context_sequence_length,
+                truncation=True)
+            result = {
+                'context_input_ids': context_tokenizer_outputs.input_ids,
+                'context_attention_mask':
+                context_tokenizer_outputs.attention_mask,
+            }
+
+        for k, v in result.items():
+            result[k] = v.to(self.device)
+
+        return result
diff --git a/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py b/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py
index eb8c501f..157d204d 100644
--- a/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py
+++ b/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py
@@ -57,6 +57,8 @@ class FaqQuestionAnsweringTransformersPreprocessor(Preprocessor):
         self.support_set = support_set
         self.label_in_support_set = label_in_support_set
         self.text_in_support_set = text_in_support_set
+        # support non-prototype network
+        self.pad_support = kwargs.get('pad_support', False)
 
     def pad(self, samples, max_len):
         result = []
@@ -79,6 +81,23 @@ class FaqQuestionAnsweringTransformersPreprocessor(Preprocessor):
         ] + self.tokenizer.convert_tokens_to_ids(
             self.tokenizer.tokenize(text)) + [self.tokenizer.sep_token_id]
 
+    def preprocess(self, support_set):
+        label_to_samples = {}
+        for item in support_set:
+            label = item[self.label_in_support_set]
+            if label not in label_to_samples:
+                label_to_samples[label] = []
+            label_to_samples[label].append(item)
+        max_cnt = 0
+        for label, samples in label_to_samples.items():
+            if len(samples) > max_cnt:
+                max_cnt = len(samples)
+        new_support_set = []
+        for label, samples in label_to_samples.items():
+            new_support_set.extend(
+                samples + [samples[0] for _ in range(max_cnt - len(samples))])
+        return new_support_set
+
     @type_assert(object, Dict)
     def __call__(self, data: Dict[str, Any],
                  **preprocessor_param) -> Dict[str, Any]:
@@ -93,6 +112,8 @@ class FaqQuestionAnsweringTransformersPreprocessor(Preprocessor):
         if not isinstance(queryset, list):
             queryset = [queryset]
         supportset = data[self.support_set]
+        if self.pad_support:
+            supportset = self.preprocess(supportset)
         supportset = sorted(
             supportset, key=lambda d: d[self.label_in_support_set])
 
diff --git a/modelscope/preprocessors/nlp/fill_mask_preprocessor.py b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py
index 0b9597d4..d269144e 100644
--- a/modelscope/preprocessors/nlp/fill_mask_preprocessor.py
+++ b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py
@@ -43,8 +43,7 @@ class FillMaskPreprocessorBase(Preprocessor):
         Args:
             data (tuple): [sentence1, sentence2]
                 sentence1 (str): a sentence
-                    Example:
-                        'you are so handsome.'
+
         Returns:
             Dict[str, Any]: the preprocessed data
         """
diff --git a/modelscope/preprocessors/nlp/mgeo_ranking_preprocessor.py b/modelscope/preprocessors/nlp/mgeo_ranking_preprocessor.py
index c7f3677f..7f870cd5 100644
--- a/modelscope/preprocessors/nlp/mgeo_ranking_preprocessor.py
+++ b/modelscope/preprocessors/nlp/mgeo_ranking_preprocessor.py
@@ -10,6 +10,7 @@ from modelscope.preprocessors import Preprocessor
 from modelscope.preprocessors.builder import PREPROCESSORS
 from modelscope.utils.constant import Fields, ModeKeys
 from modelscope.utils.type_assert import type_assert
+from .text_ranking_preprocessor import TextRankingPreprocessorBase
 
 
 class GisUtt:
@@ -113,7 +114,7 @@ class GisUtt:
 
 @PREPROCESSORS.register_module(
     Fields.nlp, module_name=Preprocessors.mgeo_ranking)
-class MGeoRankingTransformersPreprocessor(Preprocessor):
+class MGeoRankingTransformersPreprocessor(TextRankingPreprocessorBase):
 
     def __init__(self,
                  model_dir: str,
@@ -125,39 +126,39 @@ class MGeoRankingTransformersPreprocessor(Preprocessor):
                  label='labels',
                  qid='qid',
                  max_length=None,
+                 padding='longest',
+                 truncation=True,
+                 use_fast=True,
                  **kwargs):
         """The tokenizer preprocessor class for the text ranking preprocessor.
 
         Args:
             model_dir(str, `optional`): The model dir used to parse the label mapping, can be None.
-            first_sequence(str, `optional`): The key of the first sequence.
-            second_sequence(str, `optional`): The key of the second sequence.
-            label(str, `optional`): The keys of the label columns, default `labels`.
-            qid(str, `optional`): The qid info.
-            mode: The mode for the preprocessor.
             max_length: The max sequence length which the model supported,
                 will be passed into tokenizer as the 'max_length' param.
+            padding: The padding method
+            truncation: The truncation method
         """
-        super().__init__(mode)
+        super().__init__(
+            mode=mode,
+            first_sequence=first_sequence,
+            second_sequence=second_sequence,
+            label=label,
+            qid=qid)
         self.model_dir = model_dir
-        self.first_sequence = first_sequence
-        self.second_sequence = second_sequence
         self.first_sequence_gis = first_sequence_gis
         self.second_sequence_gis = second_sequence_gis
-
-        self.label = label
-        self.qid = qid
         self.sequence_length = max_length if max_length is not None else kwargs.get(
             'sequence_length', 128)
         kwargs.pop('sequence_length', None)
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
+        self.tokenize_kwargs = kwargs
+        self.tokenize_kwargs['padding'] = padding
+        self.tokenize_kwargs['truncation'] = truncation
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_dir, use_fast=use_fast)
 
     @type_assert(object, dict)
-    def __call__(self,
-                 data: Dict,
-                 padding='longest',
-                 truncation=True,
-                 **kwargs) -> Dict[str, Any]:
+    def __call__(self, data: Dict, **kwargs) -> Dict[str, Any]:
         sentence1 = data.get(self.first_sequence)
         sentence2 = data.get(self.second_sequence)
         labels = data.get(self.label)
@@ -176,12 +177,9 @@ class MGeoRankingTransformersPreprocessor(Preprocessor):
             'max_length', kwargs.pop('sequence_length', self.sequence_length))
         if 'return_tensors' not in kwargs:
             kwargs['return_tensors'] = 'pt'
-        feature = self.tokenizer(
-            sentence1,
-            sentence2,
-            padding=padding,
-            truncation=truncation,
-            **kwargs)
+
+        self.tokenize_kwargs.update(kwargs)
+        feature = self.tokenizer(sentence1, sentence2, **self.tokenize_kwargs)
         if labels is not None:
             feature['labels'] = labels
         if qid is not None:
diff --git a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py
index 5930e007..b03268c6 100644
--- a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py
+++ b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py
@@ -43,6 +43,7 @@ class SentenceEmbeddingTransformersPreprocessor(Preprocessor):
                 'sequence_length', 128)
         kwargs.pop('sequence_length', None)
         model_type = None
+        self.max_length = max_length
         if model_dir is not None:
             model_type = get_model_type(model_dir)
         self.nlp_tokenizer = NLPTokenizer(
@@ -72,16 +73,21 @@ class SentenceEmbeddingTransformersPreprocessor(Preprocessor):
         """
         source_sentences = data[self.first_sequence]
         if self.second_sequence in data:
+            if isinstance(source_sentences[0], list):
+                source_sentences = [source_sentences[0]]
             compare_sentences = data[self.second_sequence]
-            sentences = [source_sentences[0]]
-            for sent in compare_sentences:
-                sentences.append(sent)
         else:
-            sentences = source_sentences
+            compare_sentences = None
         if 'return_tensors' not in kwargs:
             kwargs[
                 'return_tensors'] = 'pt' if self.mode == ModeKeys.INFERENCE else None
-
-        tokenized_inputs = self.nlp_tokenizer(
-            sentences, padding=padding, truncation=truncation, **kwargs)
+        query_inputs = self.nlp_tokenizer(
+            source_sentences, padding=padding, truncation=truncation, **kwargs)
+        tokenized_inputs = {'query': query_inputs, 'docs': None}
+        if compare_sentences is not None and len(compare_sentences) > 0:
+            tokenized_inputs['docs'] = self.nlp_tokenizer(
+                compare_sentences,
+                padding=padding,
+                truncation=truncation,
+                **kwargs)
         return tokenized_inputs
diff --git a/modelscope/preprocessors/nlp/siamese_uie_preprocessor.py b/modelscope/preprocessors/nlp/siamese_uie_preprocessor.py
new file mode 100644
index 00000000..a224cd67
--- /dev/null
+++ b/modelscope/preprocessors/nlp/siamese_uie_preprocessor.py
@@ -0,0 +1,49 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict, Union
+
+from transformers import AutoTokenizer
+
+from modelscope.metainfo import Preprocessors
+from modelscope.preprocessors import Preprocessor
+from modelscope.preprocessors.builder import PREPROCESSORS
+from modelscope.utils.constant import Fields, ModeKeys
+from modelscope.utils.hub import get_model_type
+from .transformers_tokenizer import NLPTokenizer
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.siamese_uie_preprocessor)
+class SiameseUiePreprocessor(Preprocessor):
+    """The tokenizer preprocessor used in zero shot classification.
+    """
+
+    def __init__(
+        self,
+        model_dir: str,
+        mode: str = ModeKeys.INFERENCE,
+        **kwargs,
+    ):
+        """preprocess the data
+`
+        Args:
+            model_dir (str): model path
+        """
+        super().__init__(mode)
+        self.model_dir: str = model_dir
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_dir, use_fast=True)
+
+    def __call__(self, data: list, **kwargs) -> Dict[str, Any]:
+        """process the raw input data
+
+        Args:
+            data (str or dict): a sentence
+                Example:
+                    'you are so handsome.'
+
+        Returns:
+            Dict[str, Any]: the preprocessed data
+        """
+        features = self.tokenizer(data, **kwargs)
+        return features
diff --git a/modelscope/preprocessors/nlp/space/batch.py b/modelscope/preprocessors/nlp/space/batch.py
index d27776f5..7172b1ab 100644
--- a/modelscope/preprocessors/nlp/space/batch.py
+++ b/modelscope/preprocessors/nlp/space/batch.py
@@ -19,23 +19,21 @@ def batch(reader, batch_size, drop_last=False):
         generator
 
     Examples:
-        .. code-block:: python
+        >>> import paddle.fluid as fluid
+        >>> def reader():
+        >>>     for i in range(10):
+        >>>         yield i
+        >>> batch_reader = fluid.io.batch(reader, batch_size=2)
 
-            import paddle.fluid as fluid
-            def reader():
-                for i in range(10):
-                    yield i
-            batch_reader = fluid.io.batch(reader, batch_size=2)
+        >>> for data in batch_reader():
+        >>>     print(data)
 
-            for data in batch_reader():
-                print(data)
-
-            # Output is
-            # [0, 1]
-            # [2, 3]
-            # [4, 5]
-            # [6, 7]
-            # [8, 9]
+        >>> # Output is
+        >>> # [0, 1]
+        >>> # [2, 3]
+        >>> # [4, 5]
+        >>> # [6, 7]
+        >>> # [8, 9]
     """
 
     def batch_reader():
diff --git a/modelscope/preprocessors/nlp/space/tokenizer.py b/modelscope/preprocessors/nlp/space/tokenizer.py
index 798ce3b7..6a289393 100644
--- a/modelscope/preprocessors/nlp/space/tokenizer.py
+++ b/modelscope/preprocessors/nlp/space/tokenizer.py
@@ -365,8 +365,8 @@ class WordpieceTokenizer(object):
         using the given vocabulary.
 
         For example:
-          input = "unaffable"
-          output = ["un", "##aff", "##able"]
+          >>> input = "unaffable"
+          >>> output = ["un", "##aff", "##able"]
 
         Args:
           text: A single token or whitespace separated tokens. This should have
diff --git a/modelscope/preprocessors/nlp/text_classification_preprocessor.py b/modelscope/preprocessors/nlp/text_classification_preprocessor.py
index e62221ef..ba6d53d1 100644
--- a/modelscope/preprocessors/nlp/text_classification_preprocessor.py
+++ b/modelscope/preprocessors/nlp/text_classification_preprocessor.py
@@ -1,6 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from abc import abstractmethod
 from typing import Any, Dict, List, Tuple, Union
 
 import numpy as np
@@ -27,6 +26,7 @@ class TextClassificationPreprocessorBase(Preprocessor):
         label: str = 'label',
         label2id: Dict = None,
         mode: str = ModeKeys.INFERENCE,
+        keep_original_columns: List[str] = None,
     ):
         """The base class for the text classification preprocessor.
 
@@ -36,7 +36,9 @@ class TextClassificationPreprocessorBase(Preprocessor):
             second_sequence(str, `optional`): The key of the second sequence.
             label(str, `optional`): The keys of the label columns, default is `label`
             label2id: (dict, `optional`): The optional label2id mapping
-            mode: The mode for the preprocessor
+            mode(str, `optional`): The mode for the preprocessor
+            keep_original_columns(List[str], `optional`): The original columns to keep,
+                only available when the input is a `dict`, default None
         """
         super().__init__(mode)
         self.model_dir = model_dir
@@ -44,6 +46,7 @@ class TextClassificationPreprocessorBase(Preprocessor):
         self.second_sequence = second_sequence
         self.label = label
         self.label2id = label2id
+        self.keep_original_columns = keep_original_columns
         if self.label2id is None and self.model_dir is not None:
             self.label2id = parse_label_mapping(self.model_dir)
 
@@ -71,11 +74,8 @@ class TextClassificationPreprocessorBase(Preprocessor):
         Args:
             data (tuple): [sentence1, sentence2]
                 sentence1 (str): a sentence
-                    Example:
-                        'you are so handsome.'
                 sentence2 (str): a sentence
-                    Example:
-                        'you are so beautiful.'
+
         Returns:
             Dict[str, Any]: the preprocessed data
         """
@@ -90,6 +90,9 @@ class TextClassificationPreprocessorBase(Preprocessor):
             for k, v in output.items()
         }
         labels_to_id(labels, output, self.label2id)
+        if self.keep_original_columns and isinstance(data, dict):
+            for column in self.keep_original_columns:
+                output[column] = data[column]
         return output
 
     def _tokenize_text(self, sequence1, sequence2=None, **kwargs):
@@ -131,6 +134,7 @@ class TextClassificationTransformersPreprocessor(
                  mode: str = ModeKeys.INFERENCE,
                  max_length: int = None,
                  use_fast: bool = None,
+                 keep_original_columns=None,
                  **kwargs):
         """The tokenizer preprocessor used in sequence classification.
 
@@ -152,4 +156,4 @@ class TextClassificationTransformersPreprocessor(
         self.nlp_tokenizer = NLPTokenizer(
             model_dir, model_type, use_fast=use_fast, tokenize_kwargs=kwargs)
         super().__init__(model_dir, first_sequence, second_sequence, label,
-                         label2id, mode)
+                         label2id, mode, keep_original_columns)
diff --git a/modelscope/preprocessors/nlp/text_generation_preprocessor.py b/modelscope/preprocessors/nlp/text_generation_preprocessor.py
index 5f30b70a..76e1a4f9 100644
--- a/modelscope/preprocessors/nlp/text_generation_preprocessor.py
+++ b/modelscope/preprocessors/nlp/text_generation_preprocessor.py
@@ -24,17 +24,21 @@ class TextGenerationPreprocessorBase(Preprocessor):
     def __init__(self,
                  mode: str = ModeKeys.INFERENCE,
                  src_txt='src_txt',
-                 tgt_txt='tgt_txt'):
+                 tgt_txt='tgt_txt',
+                 keep_original_columns=None):
         """The base class for all the text generation task's preprocessors.
 
         Args:
             mode: The preprocessor mode.
             src_txt: The key for the src text.
             tgt_txt: The key for the tgt text.
+            keep_original_columns: Keep original columns and change them to attributes,
+                only available when the input is a `dict`, default True
         """
         super().__init__(mode)
         self.src_txt = src_txt
         self.tgt_txt = tgt_txt
+        self.keep_original_columns = keep_original_columns
 
     def _tokenize_text(self, sequence1, sequence2=None, **kwargs):
         """Tokenize the text.
@@ -57,6 +61,9 @@ class TextGenerationPreprocessorBase(Preprocessor):
             k: np.array(v) if isinstance(v, list) else v
             for k, v in output.items()
         }
+        if self.keep_original_columns and isinstance(data, dict):
+            for column in self.keep_original_columns:
+                output[column] = data[column]
         return output
 
     def decode(self, tokens, **kwargs):
@@ -102,6 +109,7 @@ class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase):
                  tgt_txt='tgt_txt',
                  max_length: int = None,
                  use_fast: bool = None,
+                 keep_original_columns=None,
                  **kwargs):
         """The tokenizer preprocessor used in text generation.
 
@@ -117,7 +125,7 @@ class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase):
         """
         if 'first_sequence' in kwargs:
             src_txt = kwargs.pop('first_sequence')
-        super().__init__(mode, src_txt, tgt_txt)
+        super().__init__(mode, src_txt, tgt_txt, keep_original_columns)
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get('padding', 'max_length')
         kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
@@ -196,9 +204,10 @@ class TextGenerationJiebaPreprocessor(TextGenerationPreprocessorBase):
                  src_txt='src_txt',
                  tgt_txt='tgt_txt',
                  sequence_length: int = 128,
-                 use_fast=None):
+                 use_fast=None,
+                 **kwargs):
         from modelscope.models.nlp.gpt3 import JiebaBPETokenizer
-        super().__init__(mode, src_txt, tgt_txt)
+        super().__init__(mode, src_txt, tgt_txt, **kwargs)
         self.tokenizer = JiebaBPETokenizer(
             osp.join(model_dir, 'tokenizer.json'))
         self.max_length = sequence_length
@@ -280,7 +289,7 @@ class TextGenerationSentencePiecePreprocessor(TextGenerationPreprocessorBase):
             src_txt = kwargs.pop('first_sequence')
 
         import sentencepiece as spm
-        super().__init__(mode, src_txt, tgt_txt)
+        super().__init__(mode, src_txt, tgt_txt, **kwargs)
         self.tokenizer = None
         for file_name in os.listdir(model_dir):
             if file_name.endswith('.model'):
diff --git a/modelscope/preprocessors/nlp/text_ranking_preprocessor.py b/modelscope/preprocessors/nlp/text_ranking_preprocessor.py
index 86d42a3e..4ded00f1 100644
--- a/modelscope/preprocessors/nlp/text_ranking_preprocessor.py
+++ b/modelscope/preprocessors/nlp/text_ranking_preprocessor.py
@@ -11,9 +11,33 @@ from modelscope.utils.constant import Fields, ModeKeys
 from modelscope.utils.type_assert import type_assert
 
 
+class TextRankingPreprocessorBase(Preprocessor):
+
+    def __init__(self,
+                 mode: str = ModeKeys.INFERENCE,
+                 first_sequence='source_sentence',
+                 second_sequence='sentences_to_compare',
+                 label='labels',
+                 qid='qid'):
+        """The tokenizer preprocessor class for the text ranking preprocessor.
+
+        Args:
+            first_sequence(str, `optional`): The key of the first sequence.
+            second_sequence(str, `optional`): The key of the second sequence.
+            label(str, `optional`): The keys of the label columns, default `labels`.
+            qid(str, `optional`): The qid info.
+            mode: The mode for the preprocessor.
+        """
+        super().__init__(mode)
+        self.first_sequence = first_sequence
+        self.second_sequence = second_sequence
+        self.label = label
+        self.qid = qid
+
+
 @PREPROCESSORS.register_module(
     Fields.nlp, module_name=Preprocessors.text_ranking)
-class TextRankingTransformersPreprocessor(Preprocessor):
+class TextRankingTransformersPreprocessor(TextRankingPreprocessorBase):
 
     def __init__(self,
                  model_dir: str,
@@ -23,36 +47,35 @@ class TextRankingTransformersPreprocessor(Preprocessor):
                  label='labels',
                  qid='qid',
                  max_length=None,
+                 padding='max_length',
+                 truncation=True,
+                 use_fast=True,
                  **kwargs):
         """The tokenizer preprocessor class for the text ranking preprocessor.
 
         Args:
             model_dir(str, `optional`): The model dir used to parse the label mapping, can be None.
-            first_sequence(str, `optional`): The key of the first sequence.
-            second_sequence(str, `optional`): The key of the second sequence.
-            label(str, `optional`): The keys of the label columns, default `labels`.
-            qid(str, `optional`): The qid info.
-            mode: The mode for the preprocessor.
             max_length: The max sequence length which the model supported,
                 will be passed into tokenizer as the 'max_length' param.
         """
-        super().__init__(mode)
+        super().__init__(
+            mode=mode,
+            first_sequence=first_sequence,
+            second_sequence=second_sequence,
+            label=label,
+            qid=qid)
         self.model_dir = model_dir
-        self.first_sequence = first_sequence
-        self.second_sequence = second_sequence
-        self.label = label
-        self.qid = qid
         self.sequence_length = max_length if max_length is not None else kwargs.get(
             'sequence_length', 128)
         kwargs.pop('sequence_length', None)
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
+        self.tokenize_kwargs = kwargs
+        self.tokenize_kwargs['padding'] = padding
+        self.tokenize_kwargs['truncation'] = truncation
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_dir, use_fast=use_fast)
 
     @type_assert(object, dict)
-    def __call__(self,
-                 data: Dict,
-                 padding='max_length',
-                 truncation=True,
-                 **kwargs) -> Dict[str, Any]:
+    def __call__(self, data: Dict, **kwargs) -> Dict[str, Any]:
         sentence1 = data.get(self.first_sequence)
         sentence2 = data.get(self.second_sequence)
         labels = data.get(self.label)
@@ -67,12 +90,9 @@ class TextRankingTransformersPreprocessor(Preprocessor):
             'max_length', kwargs.pop('sequence_length', self.sequence_length))
         if 'return_tensors' not in kwargs:
             kwargs['return_tensors'] = 'pt'
-        feature = self.tokenizer(
-            sentence1,
-            sentence2,
-            padding=padding,
-            truncation=truncation,
-            **kwargs)
+
+        self.tokenize_kwargs.update(kwargs)
+        feature = self.tokenizer(sentence1, sentence2, **self.tokenize_kwargs)
         if labels is not None:
             feature['labels'] = labels
         if qid is not None:
diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py
index 52181274..c07012e0 100644
--- a/modelscope/preprocessors/nlp/token_classification_preprocessor.py
+++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py
@@ -65,6 +65,7 @@ class TokenClassificationPreprocessorBase(Preprocessor):
         label2id: Dict = None,
         label_all_tokens: bool = False,
         mode: str = ModeKeys.INFERENCE,
+        keep_original_columns: List[str] = None,
     ):
         """The base class for all the token-classification tasks.
 
@@ -79,6 +80,8 @@ class TokenClassificationPreprocessorBase(Preprocessor):
                 If label_all_tokens is true, all non-initial sub-tokens will get labels like `I-xxx`,
                 or else the labels will be filled with -100, default False.
             mode: The preprocessor mode.
+            keep_original_columns(List[str], `optional`): The original columns to keep,
+                only available when the input is a `dict`, default None
         """
         super().__init__(mode)
         self.model_dir = model_dir
@@ -86,6 +89,7 @@ class TokenClassificationPreprocessorBase(Preprocessor):
         self.label = label
         self.label2id = label2id
         self.label_all_tokens = label_all_tokens
+        self.keep_original_columns = keep_original_columns
         if self.label2id is None and self.model_dir is not None:
             self.label2id = parse_label_mapping(self.model_dir)
 
@@ -157,6 +161,9 @@ class TokenClassificationPreprocessorBase(Preprocessor):
             k: np.array(v) if isinstance(v, list) else v
             for k, v in outputs.items()
         }
+        if self.keep_original_columns and isinstance(data, dict):
+            for column in self.keep_original_columns:
+                outputs[column] = data[column]
         if self.mode == ModeKeys.INFERENCE:
             outputs['text'] = text
         return outputs
@@ -200,6 +207,7 @@ class TokenClassificationTransformersPreprocessor(
                  mode: str = ModeKeys.INFERENCE,
                  max_length=None,
                  use_fast=None,
+                 keep_original_columns=None,
                  **kwargs):
         """
 
@@ -210,7 +218,7 @@ class TokenClassificationTransformersPreprocessor(
             **kwargs: Extra args input into the tokenizer's __call__ method.
         """
         super().__init__(model_dir, first_sequence, label, label2id,
-                         label_all_tokens, mode)
+                         label_all_tokens, mode, keep_original_columns)
         self.is_lstm_model = 'lstm' in model_dir
         model_type = None
         if self.is_lstm_model:
diff --git a/modelscope/preprocessors/nlp/transformers_tokenizer.py b/modelscope/preprocessors/nlp/transformers_tokenizer.py
index 9a14ef9a..7a4705b9 100644
--- a/modelscope/preprocessors/nlp/transformers_tokenizer.py
+++ b/modelscope/preprocessors/nlp/transformers_tokenizer.py
@@ -82,7 +82,8 @@ class NLPTokenizer:
                 model_dir) if model_dir is not None else tokenizer()
 
         if model_type in (Models.structbert, Models.gpt3, Models.palm,
-                          Models.plug, Models.megatron_bert):
+                          Models.plug, Models.megatron_bert,
+                          Models.plug_mental, Models.fid_plug):
             from transformers import BertTokenizer, BertTokenizerFast
             tokenizer = BertTokenizerFast if self.use_fast else BertTokenizer
             return tokenizer.from_pretrained(
diff --git a/modelscope/trainers/__init__.py b/modelscope/trainers/__init__.py
index b7d6c1c7..cb635a91 100644
--- a/modelscope/trainers/__init__.py
+++ b/modelscope/trainers/__init__.py
@@ -13,7 +13,7 @@ if TYPE_CHECKING:
                      ReferringVideoObjectSegmentationTrainer)
     from .multi_modal import CLIPTrainer
     from .nlp import SequenceClassificationTrainer, TextRankingTrainer
-    from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer, NlpTrainerArguments
+    from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer
     from .trainer import EpochBasedTrainer
 
 else:
@@ -28,8 +28,7 @@ else:
         ],
         'multi_modal': ['CLIPTrainer'],
         'nlp': ['SequenceClassificationTrainer', 'TextRankingTrainer'],
-        'nlp_trainer':
-        ['NlpEpochBasedTrainer', 'VecoTrainer', 'NlpTrainerArguments'],
+        'nlp_trainer': ['NlpEpochBasedTrainer', 'VecoTrainer'],
         'trainer': ['EpochBasedTrainer']
     }
 
diff --git a/modelscope/trainers/audio/asr_trainer.py b/modelscope/trainers/audio/asr_trainer.py
index 4ea25863..3ec01f2e 100644
--- a/modelscope/trainers/audio/asr_trainer.py
+++ b/modelscope/trainers/audio/asr_trainer.py
@@ -50,6 +50,7 @@ class ASRTrainer(BaseTrainer):
             lr (float): learning rate
             mate_params (dict): for saving other training args
         Examples:
+
         >>> import os
         >>> from modelscope.metainfo import Trainers
         >>> from modelscope.msdatasets import MsDataset
diff --git a/modelscope/trainers/audio/kws_nearfield_trainer.py b/modelscope/trainers/audio/kws_nearfield_trainer.py
index ba3f5f5f..bf00c435 100644
--- a/modelscope/trainers/audio/kws_nearfield_trainer.py
+++ b/modelscope/trainers/audio/kws_nearfield_trainer.py
@@ -102,7 +102,7 @@ class KWSNearfieldTrainer(BaseTrainer):
                 os.makedirs(self.work_dir)
             logger.info(f'Current working dir is {work_dir}')
 
-        # 2. prepare dataset and dataloader
+        # 2. prepare preset files
         token_file = os.path.join(self.model_dir, 'train/tokens.txt')
         assert os.path.exists(token_file), f'{token_file} is missing'
         self.token_table = read_token(token_file)
@@ -111,6 +111,24 @@ class KWSNearfieldTrainer(BaseTrainer):
         assert os.path.exists(lexicon_file), f'{lexicon_file} is missing'
         self.lexicon_table = read_lexicon(lexicon_file)
 
+        feature_transform_file = os.path.join(
+            self.model_dir, 'train/feature_transform.txt.80dim-l2r2')
+        assert os.path.exists(feature_transform_file), \
+            f'{feature_transform_file} is missing'
+        configs.model['cmvn_file'] = feature_transform_file
+
+        # 3. write config.yaml for inference
+        self.configs = configs
+        if self.rank == 0:
+            if not os.path.exists(self.work_dir):
+                os.makedirs(self.work_dir)
+            saved_config_path = os.path.join(self.work_dir, 'config.yaml')
+            with open(saved_config_path, 'w') as fout:
+                data = yaml.dump(configs.to_dict())
+                fout.write(data)
+
+    def train(self, *args, **kwargs):
+        # 1. prepare dataset and dataloader
         assert kwargs['train_data'], 'please config train data in dict kwargs'
         assert kwargs['cv_data'], 'please config cv data in dict kwargs'
         assert kwargs[
@@ -119,7 +137,7 @@ class KWSNearfieldTrainer(BaseTrainer):
         self.cv_data = kwargs['cv_data']
         self.trans_data = kwargs['trans_data']
 
-        train_conf = configs['preprocessor']
+        train_conf = self.configs['preprocessor']
         cv_conf = copy.deepcopy(train_conf)
         cv_conf['speed_perturb'] = False
         cv_conf['spec_aug'] = False
@@ -137,31 +155,25 @@ class KWSNearfieldTrainer(BaseTrainer):
             batch_size=None,
             pin_memory=kwargs.get('pin_memory', False),
             persistent_workers=True,
-            num_workers=configs.train.dataloader.workers_per_gpu,
-            prefetch_factor=configs.train.dataloader.get('prefetch', 2))
+            num_workers=self.configs.train.dataloader.workers_per_gpu,
+            prefetch_factor=self.configs.train.dataloader.get('prefetch', 2))
         self.cv_dataloader = DataLoader(
             self.cv_dataset,
             batch_size=None,
             pin_memory=kwargs.get('pin_memory', False),
             persistent_workers=True,
-            num_workers=configs.evaluation.dataloader.workers_per_gpu,
-            prefetch_factor=configs.evaluation.dataloader.get('prefetch', 2))
+            num_workers=self.configs.evaluation.dataloader.workers_per_gpu,
+            prefetch_factor=self.configs.evaluation.dataloader.get(
+                'prefetch', 2))
 
-        # 3. build model, and load checkpoint
-        feature_transform_file = os.path.join(
-            self.model_dir, 'train/feature_transform.txt.80dim-l2r2')
-        assert os.path.exists(feature_transform_file), \
-            f'{feature_transform_file} is missing'
-        configs.model['cmvn_file'] = feature_transform_file
-
-        # 3.1 Init kws model from configs
-        self.model = self.build_model(configs)
+        # 2. Init kws model from configs
+        self.model = self.build_model(self.configs)
         num_params = count_parameters(self.model)
         if self.rank == 0:
             # print(model)
             logger.warning('the number of model params: {}'.format(num_params))
 
-        # 3.2 if specify checkpoint, load infos and params
+        # 3. if specify checkpoint, load infos and params
         if self.checkpoint is not None and os.path.exists(self.checkpoint):
             load_checkpoint(self.checkpoint, self.model)
             info_path = re.sub('.pt$', '.yaml', self.checkpoint)
@@ -173,12 +185,13 @@ class KWSNearfieldTrainer(BaseTrainer):
             logger.warning('Training with random initialized params')
             infos = {}
         self.start_epoch = infos.get('epoch', -1) + 1
-        configs['train']['start_epoch'] = self.start_epoch
+        self.configs['train']['start_epoch'] = self.start_epoch
 
-        lr_last_epoch = infos.get('lr', configs['train']['optimizer']['lr'])
-        configs['train']['optimizer']['lr'] = lr_last_epoch
+        lr_last_epoch = infos.get('lr',
+                                  self.configs['train']['optimizer']['lr'])
+        self.configs['train']['optimizer']['lr'] = lr_last_epoch
 
-        # 3.3 model placement
+        # 4. model placement
         self.device_name = kwargs.get('device', 'gpu')
         if self.world_size > 1:
             self.device_name = f'cuda:{self.local_rank}'
@@ -192,17 +205,15 @@ class KWSNearfieldTrainer(BaseTrainer):
         else:
             self.model = self.model.to(self.device)
 
-        # 4. write config.yaml for inference and export
-        self.configs = configs
+        # 5. update training config file
         if self.rank == 0:
             if not os.path.exists(self.work_dir):
                 os.makedirs(self.work_dir)
             saved_config_path = os.path.join(self.work_dir, 'config.yaml')
             with open(saved_config_path, 'w') as fout:
-                data = yaml.dump(configs.to_dict())
+                data = yaml.dump(self.configs.to_dict())
                 fout.write(data)
 
-    def train(self, *args, **kwargs):
         logger.info('Start training...')
 
         writer = None
@@ -301,7 +312,7 @@ class KWSNearfieldTrainer(BaseTrainer):
                            os.environ['CUDA_VISIBLE_DEVICES'] will be setted
         '''
         # 1. get checkpoint
-        if checkpoint_path is not None and checkpoint_path != '':
+        if checkpoint_path is not None and os.path.exists(checkpoint_path):
             logger.warning(
                 f'evaluating with specific model: {checkpoint_path}')
             eval_checkpoint = checkpoint_path
@@ -326,7 +337,8 @@ class KWSNearfieldTrainer(BaseTrainer):
                     self.avg_checkpoint,
                     self.work_dir,
                 )
-                logger.warning(f'average convert to kaldi: {kaldi_cvt}')
+                logger.warning(
+                    f'average model convert to kaldi network: {kaldi_cvt}')
 
             eval_checkpoint = self.avg_checkpoint
             logger.warning(
diff --git a/modelscope/trainers/audio/kws_utils/det_utils.py b/modelscope/trainers/audio/kws_utils/det_utils.py
index 97b0c2de..ee6710f7 100644
--- a/modelscope/trainers/audio/kws_utils/det_utils.py
+++ b/modelscope/trainers/audio/kws_utils/det_utils.py
@@ -15,12 +15,14 @@
 
 import glob
 import os
+import threading
 
 import json
+import kaldiio
 import matplotlib.font_manager as fm
 import matplotlib.pyplot as plt
 import numpy as np
-import torchaudio
+import torch
 
 from modelscope.utils.logger import get_logger
 from .file_utils import make_pair, read_lists
@@ -30,6 +32,51 @@ logger = get_logger()
 font = fm.FontProperties(size=15)
 
 
+class thread_wrapper(threading.Thread):
+
+    def __init__(self, func, args=()):
+        super(thread_wrapper, self).__init__()
+        self.func = func
+        self.args = args
+        self.result = []
+
+    def run(self):
+        self.result = self.func(*self.args)
+
+    def get_result(self):
+        try:
+            return self.result
+        except Exception:
+            return None
+
+
+def count_duration(tid, data_lists):
+    results = []
+
+    for obj in data_lists:
+        assert 'key' in obj
+        assert 'wav' in obj
+        assert 'txt' in obj
+        # key = obj['key']
+        wav_file = obj['wav']
+        # txt = obj['txt']
+
+        try:
+            rate, waveform = kaldiio.load_mat(wav_file)
+            waveform = torch.tensor(waveform, dtype=torch.float32)
+            waveform = waveform.unsqueeze(0)
+            frames = len(waveform[0])
+            duration = frames / float(rate)
+        except Exception:
+            logging.info(f'load file failed: {wav_file}')
+            duration = 0.0
+
+        obj['duration'] = duration
+        results.append(obj)
+
+    return results
+
+
 def load_data_and_score(keywords_list, data_file, trans_file, score_file):
     # score_table: {uttid: [keywordlist]}
     score_table = {}
@@ -54,6 +101,26 @@ def load_data_and_score(keywords_list, data_file, trans_file, score_file):
     trans_lists = read_lists(trans_file)
     data_lists = make_pair(wav_lists, trans_lists)
 
+    # count duration for each wave use multi-thread
+    num_workers = 8
+    start = 0
+    step = int(len(data_lists) / num_workers)
+    tasks = []
+    for idx in range(8):
+        if idx != num_workers - 1:
+            task = thread_wrapper(count_duration,
+                                  (idx, data_lists[start:start + step]))
+        else:
+            task = thread_wrapper(count_duration, (idx, data_lists[start:]))
+        task.start()
+        tasks.append(task)
+        start += step
+
+    duration_lists = []
+    for task in tasks:
+        task.join()
+        duration_lists += task.get_result()
+
     # build empty structure for keyword-filler infos
     keyword_filler_table = {}
     for keyword in keywords_list:
@@ -63,35 +130,36 @@ def load_data_and_score(keywords_list, data_file, trans_file, score_file):
         keyword_filler_table[keyword]['filler_table'] = {}
         keyword_filler_table[keyword]['filler_duration'] = 0.0
 
-    for obj in data_lists:
+    for obj in duration_lists:
         assert 'key' in obj
         assert 'wav' in obj
         assert 'txt' in obj
-        key = obj['key']
-        wav_file = obj['wav']
-        txt = obj['txt']
-        assert key in score_table
+        assert 'duration' in obj
 
-        waveform, rate = torchaudio.load(wav_file)
-        frames = len(waveform[0])
-        duration = frames / float(rate)
+        key = obj['key']
+        # wav_file = obj['wav']
+        txt = obj['txt']
+        duration = obj['duration']
+        assert key in score_table
 
         for keyword in keywords_list:
             if txt.find(keyword) != -1:
                 if keyword == score_table[key]['kw']:
                     keyword_filler_table[keyword]['keyword_table'].update(
                         {key: score_table[key]['confi']})
-                    keyword_filler_table[keyword][
-                        'keyword_duration'] += duration
                 else:
                     # uttrance detected but not match this keyword
                     keyword_filler_table[keyword]['keyword_table'].update(
                         {key: -1.0})
-                    keyword_filler_table[keyword][
-                        'keyword_duration'] += duration
+                keyword_filler_table[keyword]['keyword_duration'] += duration
             else:
-                keyword_filler_table[keyword]['filler_table'].update(
-                    {key: score_table[key]['confi']})
+                if keyword == score_table[key]['kw']:
+                    keyword_filler_table[keyword]['filler_table'].update(
+                        {key: score_table[key]['confi']})
+                else:
+                    # uttrance if detected, which is not FA for this keyword
+                    keyword_filler_table[keyword]['filler_table'].update(
+                        {key: -1.0})
                 keyword_filler_table[keyword]['filler_duration'] += duration
 
     return keyword_filler_table
diff --git a/modelscope/trainers/base.py b/modelscope/trainers/base.py
index 665d9180..c524d32b 100644
--- a/modelscope/trainers/base.py
+++ b/modelscope/trainers/base.py
@@ -55,7 +55,7 @@ class BaseTrainer(ABC):
         """ Train (and evaluate) process
 
         Train process should be implemented for specific task or
-        model, releated paramters have been intialized in
+        model, related parameters have been initialized in
         ``BaseTrainer.__init__`` and should be used in this function
         """
         pass
@@ -66,7 +66,7 @@ class BaseTrainer(ABC):
         """ Evaluation process
 
         Evaluation process should be implemented for specific task or
-        model, releated paramters have been intialized in
+        model, related parameters have been initialized in
         ``BaseTrainer.__init__`` and should be used in this function
         """
         pass
@@ -87,7 +87,7 @@ class DummyTrainer(BaseTrainer):
         """ Train (and evaluate) process
 
         Train process should be implemented for specific task or
-        model, releated paramters have been intialized in
+        model, related parameters have been initialized in
         ``BaseTrainer.__init__`` and should be used in this function
         """
         cfg = self.cfg.train
@@ -100,7 +100,7 @@ class DummyTrainer(BaseTrainer):
         """ Evaluation process
 
         Evaluation process should be implemented for specific task or
-        model, releated paramters have been intialized in
+        model, related parameters have been initialized in
         ``BaseTrainer.__init__`` and should be used in this function
         """
         cfg = self.cfg.evaluation
diff --git a/modelscope/trainers/cv/image_defrcn_fewshot_detection_trainer.py b/modelscope/trainers/cv/image_defrcn_fewshot_detection_trainer.py
index 04b2967a..f77deedd 100644
--- a/modelscope/trainers/cv/image_defrcn_fewshot_detection_trainer.py
+++ b/modelscope/trainers/cv/image_defrcn_fewshot_detection_trainer.py
@@ -4,16 +4,26 @@
 # https://github.com/er-muyue/DeFRCN/blob/main/tools/model_surgery.py
 
 import os
+from collections import OrderedDict
 from typing import Callable, Optional, Union
 
-import torch
+from detectron2.checkpoint.detection_checkpoint import DetectionCheckpointer
+from detectron2.data import MetadataCatalog
+from detectron2.data.build import (build_detection_test_loader,
+                                   build_detection_train_loader)
 from detectron2.engine import SimpleTrainer, hooks
-from detectron2.evaluation import DatasetEvaluators, verify_results
+from detectron2.evaluation import (DatasetEvaluator, DatasetEvaluators,
+                                   verify_results)
+from detectron2.evaluation.testing import print_csv_format
+from detectron2.solver.build import build_lr_scheduler, build_optimizer
 from detectron2.utils import comm
 from torch import nn
+from torch.nn.parallel import DistributedDataParallel
 
 from modelscope.metainfo import Trainers
 from modelscope.models.base import Model, TorchModel
+from modelscope.models.cv.image_defrcn_fewshot.evaluation.evaluator import \
+    inference_on_dataset
 from modelscope.trainers.base import BaseTrainer
 from modelscope.trainers.builder import TRAINERS
 from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
@@ -21,16 +31,17 @@ from modelscope.utils.logger import get_logger
 
 
 class DefaultTrainer(SimpleTrainer):
+    """
+    Trainer inherit from detectron2 SimpleTrainer, use detectron2 framework to train.
+    """
 
     def __init__(self, model, cfg):
+        """ initialize model with cfg
 
-        from collections import OrderedDict
-        from fvcore.nn.precise_bn import get_bn_modules
-        from torch.nn.parallel import DistributedDataParallel
-
-        from detectron2.data.build import build_detection_train_loader, build_detection_test_loader
-        from detectron2.solver.build import build_optimizer, build_lr_scheduler
-        from detectron2.checkpoint.detection_checkpoint import DetectionCheckpointer
+        Args:
+            model: torch.nn.Module
+            cfg: model config with detectron2 format
+        """
         from detectron2.utils.logger import setup_logger
 
         setup_logger()
@@ -130,19 +141,18 @@ class DefaultTrainer(SimpleTrainer):
 
     @classmethod
     def build_evaluator(cls, cfg, dataset_name, output_folder=None):
-        from detectron2.data import MetadataCatalog
 
         if output_folder is None:
             output_folder = os.path.join(cfg.OUTPUT_DIR, 'inference')
         evaluator_list = []
         evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
         if evaluator_type == 'coco':
-            from detectron2.evaluation import COCOEvaluator
+            from modelscope.models.cv.image_defrcn_fewshot.evaluation.coco_evaluation import COCOEvaluator
             evaluator_list.append(
                 COCOEvaluator(dataset_name, True, output_folder))
         if evaluator_type == 'pascal_voc':
-            from detectron2.evaluation import PascalVOCDetectionEvaluator
-            return PascalVOCDetectionEvaluator(dataset_name)
+            from modelscope.models.cv.image_defrcn_fewshot.evaluation.pascal_voc_evaluation import PascalVOCEvaluator
+            return PascalVOCEvaluator(dataset_name)
         if len(evaluator_list) == 0:
             raise NotImplementedError(
                 'no Evaluator for the dataset {} with the type {}'.format(
@@ -153,14 +163,54 @@ class DefaultTrainer(SimpleTrainer):
 
     @classmethod
     def test(cls, cfg, model, evaluators=None):
-        from detectron2.engine.defaults import DefaultTrainer as _DefaultTrainer
-        _DefaultTrainer.build_evaluator = cls.build_evaluator
+        logger = get_logger()
 
-        return _DefaultTrainer.test(cfg, model, evaluators)
+        if isinstance(evaluators, DatasetEvaluator):
+            evaluators = [evaluators]
+        if evaluators is not None:
+            assert len(
+                cfg.DATASETS.TEST) == len(evaluators), '{} != {}'.format(
+                    len(cfg.DATASETS.TEST), len(evaluators))
+
+        results = OrderedDict()
+        for idx, dataset_name in enumerate(cfg.DATASETS.TEST):
+            data_loader = build_detection_test_loader(cfg, dataset_name)
+            # When evaluators are passed in as arguments,
+            # implicitly assume that evaluators can be created before data_loader.
+            if evaluators is not None:
+                evaluator = evaluators[idx]
+            else:
+                try:
+                    evaluator = cls.build_evaluator(cfg, dataset_name)
+                except NotImplementedError:
+                    logger.warn(
+                        'No evaluator found. Use `DefaultTrainer.test(evaluators=)`, '
+                        'or implement its `build_evaluator` method.')
+                    results[dataset_name] = {}
+                    continue
+            results_i = inference_on_dataset(model, data_loader, evaluator,
+                                             cfg)
+            results[dataset_name] = results_i
+            if comm.is_main_process():
+                assert isinstance(
+                    results_i, dict
+                ), 'Evaluator must return a dict on the main process. Got {} instead.'.format(
+                    results_i)
+                logger.info('Evaluation results for {} in csv format:'.format(
+                    dataset_name))
+                print_csv_format(results_i)
+
+        if len(results) == 1:
+            results = list(results.values())[0]
+        return results
 
 
 @TRAINERS.register_module(module_name=Trainers.image_fewshot_detection)
 class ImageDefrcnFewshotTrainer(BaseTrainer):
+    """
+    Defrcn model trainer, used to train base model and fsod/gfsod model.
+    And model_surgery function used to modify model outputs arch, to train fsod & gfsod.
+    """
 
     def __init__(self,
                  model: Optional[Union[TorchModel, nn.Module, str]] = None,
@@ -170,6 +220,16 @@ class ImageDefrcnFewshotTrainer(BaseTrainer):
                  seed: int = 0,
                  cfg_modify_fn: Optional[Callable] = None,
                  **kwargs):
+        """ init model
+
+        Args:
+            model:  used to init model
+            cfg_file: model config file path, if none, will init from model_dir by ModelFile.CONFIGURATION
+            arg_parse_fn: Same as ``parse_fn`` in :obj:`Config.to_args`.
+            model_revision: model version. Use latest if model_revision is none.
+            seed: random seed
+            cfg_modify_fn: modify model config, should be callable
+        """
 
         if isinstance(model, str):
             self.model_dir = self.get_or_download_model_dir(
@@ -188,6 +248,8 @@ class ImageDefrcnFewshotTrainer(BaseTrainer):
 
         self.logger = get_logger(log_level=self.cfg.get('log_level', 'INFO'))
 
+        kwargs['_cfg_dict'] = self.cfg
+
         if isinstance(model, (TorchModel, nn.Module)):
             self.model = model
         else:
@@ -195,25 +257,9 @@ class ImageDefrcnFewshotTrainer(BaseTrainer):
 
         self.model_cfg = self.model.get_model_cfg()
 
-        if 'datasets_train' in kwargs:
-            self.model_cfg.merge_from_list(
-                ['DATASETS.TRAIN', kwargs['datasets_train']])
-        if 'datasets_test' in kwargs:
-            self.model_cfg.merge_from_list(
-                ['DATASETS.TEST', kwargs['datasets_test']])
-        if 'work_dir' in kwargs:
-            self.model_cfg.merge_from_list(['OUTPUT_DIR', kwargs['work_dir']])
-
         if not os.path.exists(self.model_cfg.OUTPUT_DIR):
             os.makedirs(self.model_cfg.OUTPUT_DIR)
 
-        self.model_cfg.freeze()
-
-        self.data_dir = kwargs.get('data_dir', None)
-        self.data_type = kwargs.get('data_type', 'pascal_voc')
-
-        self.register_data(self.data_type, self.data_dir)
-
         self.trainer = DefaultTrainer(self.model, self.model_cfg)
 
     def train(self, *args, **kwargs):
@@ -230,87 +276,23 @@ class ImageDefrcnFewshotTrainer(BaseTrainer):
         return metric_values
 
     def build_model(self, *args, **kwargs) -> Union[nn.Module, TorchModel]:
-        model = Model.from_pretrained(self.model_dir, **kwargs)
+        model = Model.from_pretrained(
+            model_name_or_path=self.model_dir, cfg_dict=self.cfg, **kwargs)
         if not isinstance(model, nn.Module) and hasattr(model, 'model'):
             return model.model
         elif isinstance(model, nn.Module):
             return model
 
-    @classmethod
-    def register_data(cls, data_type='pascal_voc', data_dir=None):
-
-        if data_type == 'pascal_voc':
-            from modelscope.models.cv.image_defrcn_fewshot.utils.voc_register import register_all_voc
-            if data_dir:
-                register_all_voc(data_dir)
-            else:
-                register_all_voc()
-        else:
-            raise NotImplementedError(
-                'no {} dataset was registered'.format(data_type))
-
     @classmethod
     def model_surgery(cls,
                       src_path,
                       save_dir,
                       data_type='pascal_voc',
-                      method='remove'):
+                      method='remove',
+                      params_name=[
+                          'model.roi_heads.box_predictor.cls_score',
+                          'model.roi_heads.box_predictor.bbox_pred'
+                      ]):
 
-        assert method in ['remove',
-                          'randinit'], '{} not implemented'.format(method)
-
-        def _surgery(param_name, is_weight, tar_size, ckpt):
-            weight_name = param_name + ('.weight' if is_weight else '.bias')
-            pretrained_weight = ckpt['model'][weight_name]
-            prev_cls = pretrained_weight.size(0)
-            if 'cls_score' in param_name:
-                prev_cls -= 1
-            if is_weight:
-                feat_size = pretrained_weight.size(1)
-                new_weight = torch.rand((tar_size, feat_size))
-                torch.nn.init.normal_(new_weight, 0, 0.01)
-            else:
-                new_weight = torch.zeros(tar_size)
-
-            new_weight[:prev_cls] = pretrained_weight[:prev_cls]
-            if 'cls_score' in param_name:
-                new_weight[-1] = pretrained_weight[-1]  # bg class
-            ckpt['model'][weight_name] = new_weight
-
-        if data_type == 'pascal_voc':
-            TAR_SIZE = 20
-            params_name = [
-                'model.roi_heads.box_predictor.cls_score',
-                'model.roi_heads.box_predictor.bbox_pred'
-            ]
-
-            save_name = 'model_reset_' + ('remove' if method == 'remove' else
-                                          'surgery') + '.pth'
-            save_path = os.path.join(save_dir, save_name)
-            os.makedirs(save_dir, exist_ok=True)
-
-            ckpt = torch.load(src_path)
-
-            if 'scheduler' in ckpt:
-                del ckpt['scheduler']
-            if 'optimizer' in ckpt:
-                del ckpt['optimizer']
-            if 'iteration' in ckpt:
-                ckpt['iteration'] = 0
-
-            if method == 'remove':
-                for param_name in params_name:
-                    del ckpt['model'][param_name + '.weight']
-                    if param_name + '.bias' in ckpt['model']:
-                        del ckpt['model'][param_name + '.bias']
-            else:
-                tar_sizes = [TAR_SIZE + 1, TAR_SIZE * 4]
-                for idx, (param_name,
-                          tar_size) in enumerate(zip(params_name, tar_sizes)):
-                    _surgery(param_name, True, tar_size, ckpt)
-                    _surgery(param_name, False, tar_size, ckpt)
-
-            torch.save(ckpt, save_path)
-        else:
-            NotImplementedError(
-                '{} dataset does not supported'.format(data_type))
+        from modelscope.models.cv.image_defrcn_fewshot.utils.model_surgery_op import model_surgery as _model_surgery
+        _model_surgery(src_path, save_dir, data_type, method, params_name)
diff --git a/modelscope/trainers/cv/image_detection_damoyolo_trainer.py b/modelscope/trainers/cv/image_detection_damoyolo_trainer.py
index e9c4cc20..fe827b74 100644
--- a/modelscope/trainers/cv/image_detection_damoyolo_trainer.py
+++ b/modelscope/trainers/cv/image_detection_damoyolo_trainer.py
@@ -30,7 +30,7 @@ from modelscope.msdatasets.task_datasets.damoyolo import (build_dataloader,
 from modelscope.trainers.base import BaseTrainer
 from modelscope.trainers.builder import TRAINERS
 from modelscope.utils.checkpoint import save_checkpoint
-from modelscope.utils.constant import ModelFile
+from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
 from modelscope.utils.logger import get_logger
 from modelscope.utils.metric import MeterBuffer
 from modelscope.utils.torch_utils import get_rank, synchronize
@@ -44,6 +44,7 @@ class ImageDetectionDamoyoloTrainer(BaseTrainer):
                  cfg_file: str = None,
                  load_pretrain: bool = True,
                  cache_path: str = None,
+                 model_revision: str = DEFAULT_MODEL_REVISION,
                  *args,
                  **kwargs):
         """ High-level finetune api for Damoyolo.
@@ -56,7 +57,8 @@ class ImageDetectionDamoyoloTrainer(BaseTrainer):
             cache_path: cache path of model files.
         """
         if model is not None:
-            self.cache_path = self.get_or_download_model_dir(model)
+            self.cache_path = self.get_or_download_model_dir(
+                model, model_revision)
             if cfg_file is None:
                 self.cfg_file = os.path.join(self.cache_path,
                                              ModelFile.CONFIGURATION)
diff --git a/modelscope/trainers/default_config.py b/modelscope/trainers/default_config.py
index 7b2e339a..7619633f 100644
--- a/modelscope/trainers/default_config.py
+++ b/modelscope/trainers/default_config.py
@@ -1,26 +1,51 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+from typing import Dict, List, Optional
+
 from modelscope.utils.config import Config
 
-DEFAULT_CONFIG = {
+DEFAULT_CONFIG = Config({
+    'framework': 'pytorch',
     'train': {
-        'hooks': [
-            {
-                'type': 'CheckpointHook',
-                'interval': 1
-            },
-            {
-                'type': 'TextLoggerHook',
-                'interval': 10
-            },
-            {
-                'type': 'IterTimerHook'
-            },
-            {
-                'type': 'TensorboardHook',
-                'interval': 10
-            },
-        ]
+        'work_dir': '/tmp',
+        'max_epochs': 10,
+        'dataloader': {
+            'batch_size_per_gpu': 16,
+            'workers_per_gpu': 0
+        },
+        'optimizer': {
+            'type': 'SGD',
+            'lr': 1e-3
+        },
+        'lr_scheduler': {
+            'type': 'StepLR',
+            'step_size': 2
+        },
+        'hooks': [{
+            'type': 'CheckpointHook',
+            'interval': 1
+        }]
+    },
+    'evaluation': {
+        'dataloader': {
+            'batch_size_per_gpu': 16,
+            'workers_per_gpu': 0,
+            'shuffle': False
+        },
+    }
+})
+
+DEFAULT_HOOKS_CONFIG = {
+    'train': {
+        'hooks': [{
+            'type': 'CheckpointHook',
+            'interval': 1
+        }, {
+            'type': 'TextLoggerHook',
+            'interval': 10
+        }, {
+            'type': 'IterTimerHook'
+        }]
     }
 }
 
@@ -33,10 +58,40 @@ def merge_cfg(cfg: Config):
     Aegs:
         cfg: The input cfg to be merged into.
     """
-    cfg.merge_from_dict(DEFAULT_CONFIG, force=False)
-    # pop duplicate hook
+    cfg.merge_from_dict(DEFAULT_HOOKS_CONFIG, force=False)
 
-    if any(['BestCkptSaverHook' == hook['type'] for hook in cfg.train.hooks]):
-        cfg.train.hooks = list(
-            filter(lambda hook: hook['type'] != 'CheckpointHook',
-                   cfg.train.hooks))
+
+def merge_hooks(cfg: Config) -> List[Dict]:
+    key_chain_hook_map = {
+        'train.logging': 'TextLoggerHook',
+        'train.checkpoint.period': 'CheckpointHook',
+        'train.checkpoint.best': 'BestCkptSaverHook',
+        'evaluation.period': 'EvaluationHook'
+    }
+    hooks = cfg.train.hooks.copy()
+    for key_chain, hook_type in key_chain_hook_map.items():
+        hook = _key_chain_to_hook(cfg, key_chain, hook_type)
+        if hook is not None:
+            hooks.append(hook)
+    return hooks
+
+
+def _key_chain_to_hook(cfg: Config, key_chain: str,
+                       hook_type: str) -> Optional[Dict]:
+    if not _check_basic_hook(cfg, key_chain, hook_type):
+        return None
+    hook_params: Dict = cfg.safe_get(key_chain)
+    hook = {'type': hook_type}
+    hook.update(hook_params)
+    return hook
+
+
+def _check_basic_hook(cfg: Config, key_chain: str, hook_type: str) -> bool:
+    if cfg.safe_get(key_chain) is None:
+        return False
+    hooks = list(
+        filter(lambda hook: hook['type'] == hook_type, cfg.train.hooks))
+    assert len(hooks) == 0, f'The key_chain {key_chain} and the traditional hook ' \
+                            f'cannot exist at the same time, ' \
+                            f'please delete {hook_type} in the configuration file.'
+    return True
diff --git a/modelscope/trainers/hooks/__init__.py b/modelscope/trainers/hooks/__init__.py
index 11a73f24..51677f25 100644
--- a/modelscope/trainers/hooks/__init__.py
+++ b/modelscope/trainers/hooks/__init__.py
@@ -5,7 +5,7 @@ from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
     from .builder import HOOKS, build_hook
-    from .checkpoint_hook import BestCkptSaverHook, CheckpointHook
+    from .checkpoint_hook import BestCkptSaverHook, CheckpointHook, LoadCheckpointHook
     from .early_stop_hook import EarlyStopHook
     from .compression import SparsityHook
     from .evaluation_hook import EvaluationHook
@@ -20,7 +20,8 @@ if TYPE_CHECKING:
 else:
     _import_structure = {
         'builder': ['HOOKS', 'build_hook'],
-        'checkpoint_hook': ['BestCkptSaverHook', 'CheckpointHook'],
+        'checkpoint_hook':
+        ['BestCkptSaverHook', 'CheckpointHook', 'LoadCheckpointHook'],
         'compression': ['SparsityHook'],
         'evaluation_hook': ['EvaluationHook'],
         'hook': ['Hook'],
diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py
index e76f46e4..27f32556 100644
--- a/modelscope/trainers/hooks/checkpoint_hook.py
+++ b/modelscope/trainers/hooks/checkpoint_hook.py
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import importlib
 import os
 import random
 
@@ -27,36 +28,31 @@ class CheckpointHook(Hook):
         by_epoch (bool): Saving checkpoints by epoch or by iteration.
         save_optimizer (bool): Whether to save optimizer state dict.  Default: True.
         save_dir (str): The directory to save checkpoints. If is None, use `trainer.work_dir`
+        output_sub_dir (str): The sub folder under the `save_dir` to save the output checkpoint for inference.
+            Default 'output'.
         save_last (bool): Whether to save the last checkpoint. Default: True.
-        checkpoint_file (str): The checkpoint file to be loaded.
-        load_all_state (bool): Load all states(optimizer, epoch, lr_scheduler, random_state, etc.) when loading old
-            training state file or not. The model's state dict will only be loaded if False.
         max_checkpoint_num (int): The max number of checkpoint files, default None which means never delete anything.
-        If the number exceeding the limit, earlier checkpoints will be deleted first.
+            If the number exceeding the limit, earlier checkpoints will be deleted first.
     """
 
     PRIORITY = Priority.LOW
 
-    def __init__(
-        self,
-        interval=0,
-        by_epoch=True,
-        save_optimizer=True,
-        save_dir=None,
-        save_last=True,
-        checkpoint_file=None,
-        load_all_state=True,
-        max_checkpoint_num=None,
-    ):
+    def __init__(self,
+                 interval=0,
+                 by_epoch=True,
+                 save_optimizer=True,
+                 save_dir=None,
+                 output_sub_dir=ModelFile.TRAIN_OUTPUT_DIR,
+                 save_last=True,
+                 max_checkpoint_num=None,
+                 **kwargs):
         self.interval = interval
         self.by_epoch = by_epoch
         self.save_optimizer = save_optimizer
         self.save_dir = save_dir
-        self.checkpoint_file = checkpoint_file
+        self.output_sub_dir = output_sub_dir
         self.save_last = save_last
         self.rng_state = None
-        self.need_load_rng_state = False
-        self.load_all_state = load_all_state
         self.max_checkpoint_num = None
         if max_checkpoint_num is not None:
             self.max_checkpoint_num = max(int(max_checkpoint_num), 1)
@@ -77,28 +73,6 @@ class CheckpointHook(Hook):
         if is_master():
             self.logger.info(f'Checkpoints will be saved to {self.save_dir}')
 
-        if self.checkpoint_file is not None and os.path.isfile(
-                self.checkpoint_file):
-            meta = self.load_checkpoint(self.checkpoint_file, trainer,
-                                        self.load_all_state)
-            self.rng_state = meta.get('rng_state')
-            self.need_load_rng_state = self.load_all_state
-
-    def before_train_iter(self, trainer):
-        if self.need_load_rng_state:
-            if self.rng_state is not None:
-                random.setstate(self.rng_state['random'])
-                np.random.set_state(self.rng_state['numpy'])
-                torch.random.set_rng_state(self.rng_state['cpu'])
-                if torch.cuda.is_available():
-                    torch.cuda.random.set_rng_state_all(self.rng_state['cuda'])
-                self.need_load_rng_state = False
-            else:
-                self.logger.warning(
-                    'Random state cannot be found in checkpoint file, '
-                    'this may cause a random data order or model initialization.'
-                )
-
     def after_train_epoch(self, trainer):
         if not self.by_epoch:
             return
@@ -110,42 +84,6 @@ class CheckpointHook(Hook):
                     f'Saving checkpoint at {trainer.epoch + 1} epoch')
                 self._save_checkpoint(trainer)
 
-    @classmethod
-    def load_checkpoint(cls, filename, trainer, load_all_state=True):
-        from modelscope.trainers.parallel.utils import is_parallel
-        if is_parallel(trainer.model):
-            model = trainer.model.module
-        else:
-            model = trainer.model
-        meta = load_checkpoint(
-            filename, model,
-            getattr(trainer, 'optimizer', None) if load_all_state else None,
-            getattr(trainer, 'lr_scheduler', None) if load_all_state else None)
-        if load_all_state:
-            trainer._epoch = meta.get('epoch', trainer._epoch)
-            trainer._iter = meta.get('iter', trainer._iter)
-            trainer._inner_iter = meta.get('inner_iter', trainer._inner_iter)
-
-            for i, hook in enumerate(trainer.hooks):
-                # hook: Hook
-                key = f'{hook.__class__}-{i}'
-                if key in meta and hasattr(hook, 'load_state_dict'):
-                    hook.load_state_dict(meta.get(key, {}))
-                else:
-                    trainer.logger.warning(
-                        f'The state_dict of hook {hook.__class__} at index {i} is not found in the checkpoint file.'
-                    )
-
-        version = meta.get('modelscope')
-        if version != __version__:
-            trainer.logger.warning(
-                f'The modelscope version of loaded checkpoint does not match the runtime version. '
-                f'The saved version: {version}, runtime version: {__version__}'
-            )
-        trainer.logger.info(
-            f'Checkpoint {filename} saving time: {meta.get("time")}')
-        return meta
-
     def _save_checkpoint(self, trainer):
         if self.by_epoch:
             cur_save_name = os.path.join(
@@ -153,6 +91,7 @@ class CheckpointHook(Hook):
         else:
             cur_save_name = os.path.join(
                 self.save_dir, f'{LogKeys.ITER}_{trainer.iter + 1}.pth')
+        cur_save_name = extend_save_name_for_parallel(cur_save_name)
 
         self.rng_state = {
             'random': random.getstate(),
@@ -166,9 +105,13 @@ class CheckpointHook(Hook):
             'inner_iter': trainer.inner_iter + 1,
             'rng_state': self.rng_state,
         }
-        for i, hook in enumerate(trainer.hooks):
-            if hasattr(hook, 'state_dict'):
+
+        i = 0
+        for hook in trainer.hooks:
+            if hasattr(hook, 'state_dict') and getattr(hook, '_should_save',
+                                                       True):
                 meta[f'{hook.__class__}-{i}'] = hook.state_dict()
+                i += 1
 
         save_checkpoint(
             trainer.model,
@@ -197,7 +140,7 @@ class CheckpointHook(Hook):
                     self.history_checkpoints.append(ckpt_file)
 
     def _save_pretrained(self, trainer):
-        output_dir = os.path.join(self.save_dir, ModelFile.TRAIN_OUTPUT_DIR)
+        output_dir = os.path.join(self.save_dir, self.output_sub_dir)
         from modelscope.trainers.parallel.utils import is_parallel
 
         if is_parallel(trainer.model):
@@ -288,15 +231,19 @@ class CheckpointHook(Hook):
 
 @HOOKS.register_module(module_name=Hooks.BestCkptSaverHook)
 class BestCkptSaverHook(CheckpointHook):
-    """Save best checkpoints hook.
+    """
+    Save best checkpoints hook.
+
     Args:
         metric_key (str): Metric key to compare rule for best score.
-        rule (str): Comparison rule for best score.
-            Support "max" and "min". If rule is "max", the checkpoint at the maximum `metric_key`
-            will be saved, If rule is "min", the checkpoint at the minimum `metric_key` will be saved.
+        rule (str): Comparison rule for best score. Support "max" and "min". If rule is "max", the checkpoint
+            at the maximum `metric_key` will be saved, If rule is "min", the checkpoint at the minimum `metric_key`
+            will be saved.
         by_epoch (bool): Save best checkpoints by epoch or by iteration.
         save_optimizer (bool): Whether to save optimizer state dict.  Default: True.
         save_dir (str): Output directory to save best checkpoint.
+        output_sub_dir (str): The sub folder under the `save_dir` to save the output checkpoint for inference.
+            Default 'output_best'.
         restore_best (bool): Whether to restore the best checkpoint after training.
         max_checkpoint_num (int): The max number of checkpoint files, default None which means never delete anything.
             If the number exceeding the limit, checkpoints with worse metric will be deleted, which is judged by the
@@ -312,6 +259,7 @@ class BestCkptSaverHook(CheckpointHook):
                  by_epoch=True,
                  save_optimizer=True,
                  save_dir=None,
+                 output_sub_dir=ModelFile.TRAIN_BEST_OUTPUT_DIR,
                  save_file_name=None,
                  restore_best=False,
                  max_checkpoint_num=1,
@@ -323,6 +271,7 @@ class BestCkptSaverHook(CheckpointHook):
             by_epoch=by_epoch,
             save_optimizer=save_optimizer,
             save_dir=save_dir,
+            output_sub_dir=output_sub_dir,
             max_checkpoint_num=max_checkpoint_num,
             **kwargs,
         )
@@ -372,6 +321,7 @@ class BestCkptSaverHook(CheckpointHook):
             if '.' not in cur_save_name:
                 cur_save_name = f'{cur_save_name}.pth'
             cur_save_name = os.path.join(self.save_dir, cur_save_name)
+        cur_save_name = extend_save_name_for_parallel(cur_save_name)
 
         meta = {
             'epoch': trainer.epoch,
@@ -379,8 +329,13 @@ class BestCkptSaverHook(CheckpointHook):
             'inner_iter': trainer.inner_iter + 1,
             'rng_state': self.rng_state,
         }
-        for i, hook in enumerate(trainer.hooks):
-            meta[f'{hook.__class__}-{i}'] = hook.state_dict()
+
+        i = 0
+        for hook in trainer.hooks:
+            if hasattr(hook, 'state_dict') and getattr(hook, '_should_save',
+                                                       True):
+                meta[f'{hook.__class__}-{i}'] = hook.state_dict()
+                i += 1
 
         if os.path.isfile(cur_save_name):
             os.remove(cur_save_name)
@@ -428,4 +383,127 @@ class BestCkptSaverHook(CheckpointHook):
     def after_run(self, trainer):
         if self.restore_best:
             if is_master():
-                self.load_checkpoint(self._best_ckpt_file, trainer)
+                LoadCheckpointHook.load_checkpoint(self._best_ckpt_file,
+                                                   trainer)
+
+
+@HOOKS.register_module(module_name=Hooks.LoadCheckpointHook)
+class LoadCheckpointHook(Hook):
+    """Load a checkpoint file at the beginning of training or evaluating.
+
+    This hook does not need to be configured or saved in the config file.
+    User should use it by:
+    >>> trainer.train('some-checkpoint', load_all_state=True)
+    or
+    >>> trainer.evaluate('some-checkpoint')
+    instead.
+
+    Args:
+        checkpoint_file (str): The checkpoint file to be loaded.
+        load_all_state (bool): Load all states(optimizer, epoch, lr_scheduler, random_state, etc.) when loading old
+            training state file or not. The model's state dict will only be loaded if False.
+    """
+
+    PRIORITY = Priority.HIGH
+
+    _should_save = False
+
+    def __init__(
+        self,
+        checkpoint_file=None,
+        load_all_state=True,
+    ):
+        self.checkpoint_file = checkpoint_file
+        self.rng_state = None
+        self.need_load_rng_state = False
+        self.load_all_state = load_all_state
+
+    def before_run(self, trainer):
+        if not hasattr(trainer, 'logger'):
+            self.logger = get_logger()
+        else:
+            self.logger = trainer.logger
+
+        if self.checkpoint_file is not None and os.path.isfile(
+                self.checkpoint_file):
+            meta = self.load_checkpoint(self.checkpoint_file, trainer,
+                                        self.load_all_state)
+            self.rng_state = meta.get('rng_state')
+            self.need_load_rng_state = self.load_all_state
+
+    def before_train_iter(self, trainer):
+        if self.need_load_rng_state:
+            if self.rng_state is not None:
+                random.setstate(self.rng_state['random'])
+                np.random.set_state(self.rng_state['numpy'])
+                torch.random.set_rng_state(self.rng_state['cpu'])
+                if torch.cuda.is_available():
+                    torch.cuda.random.set_rng_state_all(self.rng_state['cuda'])
+                self.need_load_rng_state = False
+            else:
+                self.logger.warning(
+                    'Random state cannot be found in checkpoint file, '
+                    'this may cause a random data order or model initialization.'
+                )
+
+    @classmethod
+    def load_checkpoint(cls, filename, trainer, load_all_state=True):
+        from modelscope.trainers.parallel.utils import is_parallel
+        if is_parallel(trainer.model):
+            model = trainer.model.module
+        else:
+            model = trainer.model
+        meta = load_checkpoint(
+            filename, model,
+            getattr(trainer, 'optimizer', None) if load_all_state else None,
+            getattr(trainer, 'lr_scheduler', None) if load_all_state else None)
+        if load_all_state:
+            trainer._epoch = meta.get('epoch', trainer._epoch)
+            trainer._iter = meta.get('iter', trainer._iter)
+            trainer._inner_iter = meta.get('inner_iter', trainer._inner_iter)
+
+            i = 0
+            for hook in trainer.hooks:
+                if hasattr(hook, 'load_state_dict') and getattr(
+                        hook, '_should_save', True):
+                    key = f'{hook.__class__}-{i}'
+                    if key in meta:
+                        hook.load_state_dict(meta.get(key, {}))
+                    else:
+                        trainer.logger.warning(
+                            f'The state_dict of hook {hook.__class__} at index {i} is not found in the checkpoint file.'
+                        )
+                    i += 1
+
+        version = meta.get('modelscope')
+        if version != __version__:
+            trainer.logger.warning(
+                f'The modelscope version of loaded checkpoint does not match the runtime version. '
+                f'The saved version: {version}, runtime version: {__version__}'
+            )
+        trainer.logger.info(
+            f'Checkpoint {filename} saving time: {meta.get("time")}')
+        return meta
+
+
+def extend_save_name_for_parallel(cur_save_name: str) -> str:
+    """Saving model parameters during tensor parallel training
+    requires each process to save its own parameters,
+    This function will try to get the local rank of the process
+    and extend save name for multi-slice model.
+
+    Args:
+        cur_save_name (str): Original save name.
+
+    Returns:
+        str: Extended save name.
+    """
+    try:
+        mpu = importlib.import_module('megatron_util.mpu')
+        tp_world_size = mpu.get_tensor_model_parallel_world_size()
+        if tp_world_size == 1:
+            return cur_save_name
+        mp_rank = mpu.get_tensor_model_parallel_rank()
+        return cur_save_name.replace('.', '_mp_rank_{:02d}.'.format(mp_rank))
+    except (ImportError, AssertionError):
+        return cur_save_name
diff --git a/modelscope/trainers/hooks/evaluation_hook.py b/modelscope/trainers/hooks/evaluation_hook.py
index 331b8f04..80c8c31a 100644
--- a/modelscope/trainers/hooks/evaluation_hook.py
+++ b/modelscope/trainers/hooks/evaluation_hook.py
@@ -8,11 +8,13 @@ from .hook import Hook
 
 @HOOKS.register_module(module_name=Hooks.EvaluationHook)
 class EvaluationHook(Hook):
-    """Evaluation hook.
+    """
+    Evaluation hook.
+
     Args:
         interval (int): Evaluation interval.
         by_epoch (bool): Evaluate by epoch or by iteration.
-        start_idx (int | None, optional): The epoch/iterations validation begins.
+        start_idx (int or None, optional): The epoch or iterations validation begins.
             Default: None, validate every interval epochs/iterations from scratch.
     """
 
diff --git a/modelscope/trainers/hooks/logger/tensorboard_hook.py b/modelscope/trainers/hooks/logger/tensorboard_hook.py
index 31bef4f0..d7ce0dac 100644
--- a/modelscope/trainers/hooks/logger/tensorboard_hook.py
+++ b/modelscope/trainers/hooks/logger/tensorboard_hook.py
@@ -13,7 +13,9 @@ from .base import LoggerHook
 
 @HOOKS.register_module(module_name=Hooks.TensorboardHook)
 class TensorboardHook(LoggerHook):
-    """TensorBoard hook for visualization.
+    """
+    TensorBoard hook for visualization.
+
     Args:
         out_dir: output directory to save tensorboard files
         interval (int): Logging interval (every k iterations).
diff --git a/modelscope/trainers/hooks/logger/text_logger_hook.py b/modelscope/trainers/hooks/logger/text_logger_hook.py
index eb22d03c..e6a17691 100644
--- a/modelscope/trainers/hooks/logger/text_logger_hook.py
+++ b/modelscope/trainers/hooks/logger/text_logger_hook.py
@@ -131,6 +131,7 @@ class TextLoggerHook(LoggerHook):
             if self.by_epoch:
                 log_str = f'{epoch_key}({log_dict[mode_key]}) [{log_dict[epoch_key]}][{log_dict[iter_key]}]\t'
             else:
+                # TODO log_dict[iter_key] is not correct because of it's train_loop's inner_iter
                 log_str = f'{iter_key}({log_dict[mode_key]}) [{log_dict[iter_key]}]\t'
             self._logged_keys.extend([mode_key, iter_key, epoch_key])
 
@@ -138,7 +139,8 @@ class TextLoggerHook(LoggerHook):
         for name, val in log_dict.items():
             if name in self._logged_keys:
                 continue
-            if isinstance(val, float):
+            if isinstance(val,
+                          float) and name not in self.ignore_rounding_keys:
                 val = f'{val:.4f}'
             log_items.append(f'{name}: {val}')
         log_str += ', '.join(log_items)
@@ -168,7 +170,9 @@ class TextLoggerHook(LoggerHook):
             return items
 
     def log(self, trainer):
-        cur_iter = self.get_iter(trainer, inner_iter=True)
+        cur_iter = self.get_iter(
+            trainer, inner_iter=True
+        ) if trainer.mode == ModeKeys.TRAIN else trainer.iters_per_epoch
 
         log_dict = OrderedDict(
             mode=trainer.mode, epoch=self.get_epoch(trainer), iter=cur_iter)
diff --git a/modelscope/trainers/hooks/optimizer/apex_optimizer_hook.py b/modelscope/trainers/hooks/optimizer/apex_optimizer_hook.py
index f87ae849..dca17593 100644
--- a/modelscope/trainers/hooks/optimizer/apex_optimizer_hook.py
+++ b/modelscope/trainers/hooks/optimizer/apex_optimizer_hook.py
@@ -8,8 +8,10 @@ from .base import OptimizerHook
 
 @HOOKS.register_module(module_name=Hooks.ApexAMPOptimizerHook)
 class ApexAMPOptimizerHook(OptimizerHook):
-    """Fp16 optimizer, if torch version is less than 1.6.0,
+    """
+    Fp16 optimizer, if torch version is less than 1.6.0,
     you must install apex (https://www.github.com/nvidia/apex) else use torch.cuda.amp by default
+
     Args:
         cumulative_iters (int): interval of gradients accumulation. Default: 1
         grad_clip (dict): Default None. Containing keys:
diff --git a/modelscope/trainers/hooks/optimizer/torch_optimizer_hook.py b/modelscope/trainers/hooks/optimizer/torch_optimizer_hook.py
index 30ea88a2..77367985 100644
--- a/modelscope/trainers/hooks/optimizer/torch_optimizer_hook.py
+++ b/modelscope/trainers/hooks/optimizer/torch_optimizer_hook.py
@@ -8,8 +8,10 @@ from .base import OptimizerHook
 
 @HOOKS.register_module(module_name=Hooks.TorchAMPOptimizerHook)
 class TorchAMPOptimizerHook(OptimizerHook):
-    """Fp16 optimizer, if torch version is less than 1.6.0,
+    """
+    Fp16 optimizer, if torch version is less than 1.6.0,
     you must install apex (https://www.github.com/nvidia/apex) else use torch.cuda.amp by default
+
     Args:
         cumulative_iters (int): interval of gradients accumulation. Default: 1
         grad_clip (dict): Default None. Containing keys:
diff --git a/modelscope/trainers/multi_modal/mgeo_ranking_trainer.py b/modelscope/trainers/multi_modal/mgeo_ranking_trainer.py
index 6079a8a8..772a5620 100644
--- a/modelscope/trainers/multi_modal/mgeo_ranking_trainer.py
+++ b/modelscope/trainers/multi_modal/mgeo_ranking_trainer.py
@@ -1,18 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import time
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
-import numpy as np
 import torch
 from torch import nn
-from torch.utils.data import DataLoader, Dataset
-from tqdm import tqdm
+from torch.utils.data import Dataset
 
 from modelscope.metainfo import Trainers
-from modelscope.models.base import Model, TorchModel
-from modelscope.models.nlp import BertForTextRanking
+from modelscope.models.base import TorchModel
 from modelscope.msdatasets.ms_dataset import MsDataset
 from modelscope.preprocessors.base import Preprocessor
 from modelscope.trainers.builder import TRAINERS
@@ -189,135 +185,3 @@ class MGeoRankingTrainer(NlpEpochBasedTrainer):
             eval_dataset=eval_dataset,
             model_revision=model_revision,
             **kwargs)
-
-    def compute_mrr(self, result, k=10):
-        mrr = 0
-        for res in result.values():
-            sorted_res = sorted(res, key=lambda x: x[0], reverse=True)
-            ar = 0
-            for index, ele in enumerate(sorted_res[:k]):
-                if str(ele[1]) == '1':
-                    ar = 1.0 / (index + 1)
-                    break
-            mrr += ar
-        return mrr / len(result)
-
-    def compute_ndcg(self, result, k=10):
-        ndcg = 0
-        from sklearn import ndcg_score
-        for res in result.values():
-            sorted_res = sorted(res, key=lambda x: [0], reverse=True)
-            labels = np.array([[ele[1] for ele in sorted_res]])
-            scores = np.array([[ele[0] for ele in sorted_res]])
-            ndcg += float(ndcg_score(labels, scores, k=k))
-        ndcg = ndcg / len(result)
-        return ndcg
-
-    def to_device(self, val, device):
-        if isinstance(val, torch.Tensor):
-            return val.to(device)
-        elif isinstance(val, list):
-            return [self.to_device(item, device) for item in val]
-        elif isinstance(val, dict):
-            new_val = {}
-            for key in val:
-                new_val[key] = self.to_device(val[key], device)
-            return new_val
-        print('can not convert to device')
-        raise Exception('can not convert to device')
-
-    def evaluate(self,
-                 checkpoint_path: Optional[str] = None,
-                 *args,
-                 **kwargs) -> Dict[str, float]:
-        """evaluate a dataset
-
-        evaluate a dataset via a specific model from the `checkpoint_path` path,
-        if the `checkpoint_path` does not exist, read from the config file.
-
-        Args:
-            checkpoint_path (Optional[str], optional): the model path. Defaults
-            to None.
-
-        Returns:
-            Dict[str, float]: the results about the evaluation Example:
-            {"accuracy": 0.5091743119266054, "f1": 0.673780487804878}
-        """
-        # get the raw online dataset
-        self.eval_dataloader = self._build_dataloader_with_dataset(
-            self.eval_dataset,
-            **self.cfg.evaluation.get('dataloader', {}),
-            collate_fn=self.eval_data_collator)
-        # generate a standard dataloader
-        # generate a model
-        if checkpoint_path is not None:
-            model = BertForTextRanking.from_pretrained(checkpoint_path)
-        else:
-            model = self.model
-
-        # copy from easynlp (start)
-        model.eval()
-        total_samples = 0
-
-        logits_list = list()
-        label_list = list()
-        qid_list = list()
-
-        total_spent_time = 0.0
-        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
-        model.to(device)
-        for _step, batch in enumerate(tqdm(self.eval_dataloader)):
-            try:
-                batch = self.to_device(batch, device)
-            except RuntimeError:
-                batch = {key: val for key, val in batch.items()}
-
-            infer_start_time = time.time()
-            with torch.no_grad():
-                label_ids = batch.pop('labels').detach().cpu().numpy()
-                qids = batch.pop('qid').detach().cpu().numpy()
-                outputs = model(**batch)
-            infer_end_time = time.time()
-            total_spent_time += infer_end_time - infer_start_time
-            total_samples += self.eval_dataloader.batch_size
-
-            def sigmoid(logits):
-                return np.exp(logits) / (1 + np.exp(logits))
-
-            logits = outputs['logits'].squeeze(-1).detach().cpu().numpy()
-            logits = sigmoid(logits).tolist()
-
-            label_list.extend(label_ids)
-            logits_list.extend(logits)
-            qid_list.extend(qids)
-
-        logger.info('Inference time = {:.2f}s, [{:.4f} ms / sample] '.format(
-            total_spent_time, total_spent_time * 1000 / total_samples))
-
-        rank_result = {}
-        for qid, score, label in zip(qid_list, logits_list, label_list):
-            if qid not in rank_result:
-                rank_result[qid] = []
-            rank_result[qid].append((score, label))
-
-        for qid in rank_result:
-            rank_result[qid] = sorted(rank_result[qid], key=lambda x: x[0])
-
-        eval_outputs = list()
-        for metric in self.metrics:
-            if metric.startswith('mrr'):
-                k = metric.split('@')[-1]
-                k = int(k)
-                mrr = self.compute_mrr(rank_result, k=k)
-                logger.info('{}: {}'.format(metric, mrr))
-                eval_outputs.append((metric, mrr))
-            elif metric.startswith('ndcg'):
-                k = metric.split('@')[-1]
-                k = int(k)
-                ndcg = self.compute_ndcg(rank_result, k=k)
-                logger.info('{}: {}'.format(metric, ndcg))
-                eval_outputs.append(('ndcg', ndcg))
-            else:
-                raise NotImplementedError('Metric %s not implemented' % metric)
-
-        return dict(eval_outputs)
diff --git a/modelscope/trainers/nlp/__init__.py b/modelscope/trainers/nlp/__init__.py
index e3c39cf2..125e82c6 100644
--- a/modelscope/trainers/nlp/__init__.py
+++ b/modelscope/trainers/nlp/__init__.py
@@ -8,12 +8,14 @@ if TYPE_CHECKING:
     from .csanmt_translation_trainer import CsanmtTranslationTrainer
     from .text_ranking_trainer import TextRankingTrainer
     from .text_generation_trainer import TextGenerationTrainer
+    from .sentence_embedding_trainer import SentenceEmbeddingTrainer
 else:
     _import_structure = {
         'sequence_classification_trainer': ['SequenceClassificationTrainer'],
         'csanmt_translation_trainer': ['CsanmtTranslationTrainer'],
         'text_ranking_trainer': ['TextRankingTrainer'],
         'text_generation_trainer': ['TextGenerationTrainer'],
+        'sentence_emebedding_trainer': ['SentenceEmbeddingTrainer']
     }
 
     import sys
diff --git a/modelscope/trainers/nlp/document_grounded_dialog_generate_trainer.py b/modelscope/trainers/nlp/document_grounded_dialog_generate_trainer.py
new file mode 100644
index 00000000..213550bb
--- /dev/null
+++ b/modelscope/trainers/nlp/document_grounded_dialog_generate_trainer.py
@@ -0,0 +1,287 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import re
+import string
+from collections import Counter
+
+import json
+import sacrebleu
+import torch
+import tqdm
+from rouge import Rouge
+from torch.cuda.amp import GradScaler, autocast
+from torch.utils.data import DataLoader
+from transformers import AdamW, get_scheduler
+
+from modelscope.metainfo import Trainers
+from modelscope.models import Model
+from modelscope.preprocessors import DocumentGroundedDialogGeneratePreprocessor
+from modelscope.trainers import EpochBasedTrainer
+from modelscope.trainers.builder import TRAINERS
+from modelscope.utils.constant import ModeKeys
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def collate(batch):
+    query = [item['query'] for item in batch]
+    context = [json.loads(item['rerank']) for item in batch]
+    label = [item['response'] for item in batch]
+    return query, context, label
+
+
+def prepare_optimizer(model, lr, weight_decay, eps):
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [{
+        'params': [
+            p for n, p in model.named_parameters()
+            if not any(nd in n for nd in no_decay)
+        ],
+        'weight_decay':
+        weight_decay,
+    }, {
+        'params': [
+            p for n, p in model.named_parameters()
+            if any(nd in n for nd in no_decay)
+        ],
+        'weight_decay':
+        0.0,
+    }]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=eps)
+    return optimizer
+
+
+def prepare_scheduler(optimizer, epochs, steps_per_epoch, warmup_rate):
+    total_steps = epochs * steps_per_epoch
+    warmup_steps = int(total_steps * warmup_rate)
+    scheduler = get_scheduler(
+        name='linear',
+        optimizer=optimizer,
+        num_warmup_steps=warmup_steps,
+        num_training_steps=total_steps)
+    return scheduler
+
+
+def normalize_answer(s):
+    """Lower text and remove punctuation, articles and extra whitespace."""
+
+    def remove_articles(text):
+        return re.sub(r'\b(a|an|the)\b', ' ', text)
+
+    def white_space_fix(text):
+        return ' '.join(text.split())
+
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return ''.join(ch for ch in text if ch not in exclude)
+
+    def lower(text):
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def f1_score(prediction, ground_truth):
+    prediction_tokens = normalize_answer(prediction).split()
+    ground_truth_tokens = normalize_answer(ground_truth).split()
+    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
+    num_same = sum(common.values())
+    if num_same == 0:
+        return 0
+    precision = 1.0 * num_same / len(prediction_tokens)
+    recall = 1.0 * num_same / len(ground_truth_tokens)
+    f1 = (2 * precision * recall) / (precision + recall)
+    return f1
+
+
+def exact_match_score(prediction, ground_truth):
+    return normalize_answer(prediction) == normalize_answer(ground_truth)
+
+
+def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
+    scores_for_ground_truths = []
+    for ground_truth in ground_truths:
+        score = metric_fn(prediction, ground_truth)
+        scores_for_ground_truths.append(score)
+    return max(scores_for_ground_truths)
+
+
+def matching_evaluate(references, predictions):
+    f1 = em = total = 0
+    for ref_text, prediction in zip(references, predictions):
+        total += 1
+        ground_truths = [ref_text]
+        f1 += metric_max_over_ground_truths(f1_score, prediction,
+                                            ground_truths)
+        em += metric_max_over_ground_truths(exact_match_score, prediction,
+                                            ground_truths)
+    f1 = 100.0 * f1 / total
+    em = 100.0 * em / total
+
+    return f1, em
+
+
+def measure_result(result_dict):
+    meters = dict()
+
+    hypothesis_list = [
+        x.split('<response>')[-1].strip() for x in result_dict['outputs']
+    ]
+    hypothesis_list = [x if x else '@' for x in hypothesis_list]
+    reference_list = [
+        x.split('<response>')[-1].strip() for x in result_dict['targets']
+    ]
+    instance_num = len(reference_list)
+
+    # F1
+    f1, em = matching_evaluate(reference_list, hypothesis_list)
+    meters['f1'] = f1
+
+    # SacreBleu
+    bleu_score = [
+        sacrebleu.sentence_bleu(hypothesis, [reference]).score
+        for hypothesis, reference in zip(hypothesis_list, reference_list)
+    ]
+    bleu_score = sum(bleu_score) / instance_num
+    meters['bleu'] = bleu_score
+
+    # Rouge-L
+    rouge_func = Rouge()
+    rouge_score = [
+        x['rouge-l']['f']
+        for x in rouge_func.get_scores(hypothesis_list, reference_list)
+    ]
+    rouge_score = (sum(rouge_score) / instance_num) * 100
+    meters['rouge'] = rouge_score
+
+    return meters
+
+
+@TRAINERS.register_module(
+    module_name=Trainers.document_grounded_dialog_generate_trainer)
+class DocumentGroundedDialogGenerateTrainer(EpochBasedTrainer):
+
+    def __init__(self, model: str, revision='v1.0.0', *args, **kwargs):
+        self.model = Model.from_pretrained(model, revision=revision)
+        self.preprocessor = DocumentGroundedDialogGeneratePreprocessor(
+            model_dir=self.model.model_dir)
+        self.device = self.preprocessor.device
+        self.model.model.to(self.device)
+        self.train_dataset = kwargs['train_dataset']
+        self.eval_dataset = kwargs['eval_dataset']
+
+    def train(self,
+              total_epoches=10,
+              batch_size=16,
+              accumulation_steps=1,
+              learning_rate=1e-4,
+              warmup_ratio=0.1,
+              weight_decay=0.1,
+              eps=1e-06,
+              loss_log_freq=40):
+        """
+        Fine-tuning trainsets
+        """
+        # obtain train loader
+        train_loader = DataLoader(
+            dataset=self.train_dataset,
+            batch_size=batch_size,
+            shuffle=True,
+            collate_fn=collate)
+
+        optimizer = prepare_optimizer(self.model.model, learning_rate,
+                                      weight_decay, eps)
+        steps_per_epoch = len(train_loader) // accumulation_steps
+        scheduler = prepare_scheduler(optimizer, total_epoches,
+                                      steps_per_epoch, warmup_ratio)
+        scaler = GradScaler()
+        best_score = 0.0
+        for epoch in range(total_epoches):
+            self.model.model.train()
+            losses = []
+            for index, payload in enumerate(tqdm.tqdm(train_loader)):
+                query, context, label = payload
+                processed = self.preprocessor(
+                    {
+                        'query': query,
+                        'context': context,
+                        'label': label
+                    },
+                    invoke_mode=ModeKeys.TRAIN)
+                with autocast():
+                    outputs = self.model.forward(processed)
+                    loss = outputs.loss.mean()
+
+                if accumulation_steps > 1:
+                    loss = loss / accumulation_steps
+
+                scaler.scale(loss).backward()
+
+                if (index + 1) % accumulation_steps == 0:
+                    scaler.step(optimizer)
+                    scaler.update()
+                    scheduler.step()
+                    optimizer.zero_grad()
+                losses.append(loss.item())
+                if (index + 1) % loss_log_freq == 0:
+                    logger.info(
+                        f'epoch: {epoch} \t batch: {batch_size * index} \t loss: {sum(losses) / len(losses)}'
+                    )
+                    losses = []
+            if losses:
+                logger.info(
+                    f'epoch: {epoch} \t batch: last \t loss: {sum(losses) / len(losses)}'
+                )
+
+            meters = self.evaluate(batch_size=batch_size)
+            total_score = sum([x for x in meters.values()])
+            if total_score >= best_score:
+                best_score = total_score
+                model_path = os.path.join(self.model.model_dir,
+                                          'finetuned_model.bin')
+                state_dict = self.model.model.state_dict()
+                torch.save(state_dict, model_path)
+                logger.info(
+                    'epoch %d obtain max score: %.4f, saving model to %s' %
+                    (epoch, total_score, model_path))
+
+    def evaluate(self, batch_size=16, checkpoint_path=None):
+        """
+        Evaluate testsets
+        """
+        if checkpoint_path is not None:
+            state_dict = torch.load(checkpoint_path)
+            self.model.model.load_state_dict(state_dict)
+
+        valid_loader = DataLoader(
+            dataset=self.eval_dataset,
+            batch_size=batch_size,
+            collate_fn=collate)
+        self.model.model.eval()
+        with torch.no_grad():
+            results = {'outputs': [], 'targets': []}
+            for index, payload in enumerate(tqdm.tqdm(valid_loader)):
+                query, context, label = payload
+                processed = self.preprocessor(
+                    {
+                        'query': query,
+                        'context': context,
+                    },
+                    invoke_mode=ModeKeys.INFERENCE)
+                outputs = self.model.generate(processed)
+                predictions = self.preprocessor.generation_tokenizer.batch_decode(
+                    outputs,
+                    skip_special_tokens=True,
+                    clean_up_tokenization_spaces=False)
+                label = self.preprocessor.generation_tokenizer.batch_decode(
+                    self.preprocessor.generation_tokenizer.batch_encode_plus(
+                        label, add_special_tokens=False).input_ids,
+                    skip_special_tokens=True,
+                    clean_up_tokenization_spaces=False)
+
+                results['outputs'] += predictions
+                results['targets'] += label
+            meters = measure_result(results)
+        logger.info(meters)
+        return meters
diff --git a/modelscope/trainers/nlp/document_grounded_dialog_rerank_trainer.py b/modelscope/trainers/nlp/document_grounded_dialog_rerank_trainer.py
new file mode 100644
index 00000000..e5d07ee8
--- /dev/null
+++ b/modelscope/trainers/nlp/document_grounded_dialog_rerank_trainer.py
@@ -0,0 +1,603 @@
+import os
+import random
+import time
+from typing import Iterable
+
+import numpy as np
+import torch
+import torch.cuda
+import torch.nn.functional as F
+from transformers import AdamW, get_linear_schedule_with_warmup
+
+from modelscope.metainfo import Trainers
+from modelscope.models import Model
+from modelscope.preprocessors import DocumentGroundedDialogRerankPreprocessor
+from modelscope.trainers import EpochBasedTrainer
+from modelscope.trainers.builder import TRAINERS
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@TRAINERS.register_module(
+    module_name=Trainers.document_grounded_dialog_rerank_trainer)
+class DocumentGroundedDialogRerankTrainer(EpochBasedTrainer):
+
+    def __init__(self, model, dataset, **args):
+        args = args['args']
+        set_seed(args['seed'])
+        self.positive_pids = ''
+        self.instances_size = 1
+        # load id to positive pid map
+        self.inst_id2pos_pids = dict()
+        self.inst_id2pos_passages = dict()
+        self.dataset = dataset
+        self.model = Model.from_pretrained(model, revision='v1.0.0')
+        self.preprocessor = DocumentGroundedDialogRerankPreprocessor(
+            self.model.model_dir, **args)
+        self.tokenizer = self.preprocessor.tokenizer
+        if args['model_resize']:
+            self.model.resize_token_embeddings(len(self.tokenizer))
+        self.device = self.preprocessor.device
+        self.model.to(self.device)
+        for jobj in self.dataset:
+            self.inst_id2pos_pids[jobj['id']] = eval(jobj['positive_pids'])
+            assert isinstance(eval(jobj['positive_pids']), list)
+        logger.info(
+            f'gathered positive pids for {len(self.inst_id2pos_pids)} instances'
+        )
+
+        # remove out-of-recall
+        instance_count = 0
+        for jobj in self.dataset:
+            inst_id = jobj['id']
+            if inst_id not in self.inst_id2pos_pids:
+                continue
+            passages = eval(jobj['passages'])
+            positive_pids = self.inst_id2pos_pids[inst_id]
+            target_mask = [p['pid'] in positive_pids for p in passages]
+            if not any(target_mask) or all(target_mask):
+                del self.inst_id2pos_pids[inst_id]
+            else:
+                instance_count += 1
+        if instance_count != len(self.inst_id2pos_pids):
+            logger.error(
+                f'!!! Mismatch between --positive_pids and --initial_retrieval! '
+                f'{len(self.inst_id2pos_pids)} vs {instance_count}')
+
+        # transformer_optimize
+        if args['train_instances'] <= 0:
+            args['train_instances'] = instance_count
+        # MARK
+        instances_to_train_over = args['train_instances'] * args[
+            'num_train_epochs'] // args['instances_size']
+        self.optimizer = TransformerOptimize(args, instances_to_train_over,
+                                             self.model)
+        logger.info('  Num Epochs = %d', args['num_train_epochs'])
+        self.optimizer.model.zero_grad()
+        # MARK
+        train_batch_size = \
+            args['full_train_batch_size'] // args['gradient_accumulation_steps']
+        self.loss_history = \
+            LossHistory(
+                args['train_instances'] // train_batch_size // args['instances_size']
+            )
+        self.args = args
+        self.max_length_count = 0
+
+    def one_instance(self, query, passages):
+        model = self.optimizer.model
+        input_dict = {'query': query, 'passages': passages}
+        inputs = self.preprocessor(input_dict)
+        logits = F.log_softmax(
+            model(inputs).logits,
+            dim=-1)[:, 1]  # log_softmax over the binary classification
+        logprobs = F.log_softmax(
+            logits, dim=0)  # log_softmax over the passages
+        # we want the logits rather than the logprobs as the teacher labels
+        return logprobs
+
+    def limit_gpu_sequences_binary(self, passages, target_mask, rand):
+        if len(passages) > self.args['max_num_seq_pairs_per_device']:
+            num_pos = min(
+                sum(target_mask),
+                self.args['max_num_seq_pairs_per_device'] // 2)
+            num_neg = self.args['max_num_seq_pairs_per_device'] - num_pos
+            passage_and_pos = list(zip(passages, target_mask))
+            rand.shuffle(passage_and_pos)
+            pos_count = 0
+            neg_count = 0
+            passages = []
+            target_mask = []
+            for passage, mask in passage_and_pos:
+                if mask and pos_count < num_pos:
+                    passages.append(passage)
+                    target_mask.append(mask)
+                    pos_count += 1
+                elif not mask and neg_count < num_neg:
+                    passages.append(passage)
+                    target_mask.append(mask)
+                    neg_count += 1
+        return passages, target_mask
+
+    def limit_gpu_sequences(self, passages, correctness, rand):
+        if len(passages) > self.args['max_num_seq_pairs_per_device']:
+            num_pos = min(
+                sum([c > 0 for c in correctness]),
+                self.args['max_num_seq_pairs_per_device'] // 2)
+            num_neg = self.args['max_num_seq_pairs_per_device'] - num_pos
+            passage_and_pos = list(zip(passages, correctness))
+            rand.shuffle(passage_and_pos)
+            pos_count = 0
+            neg_count = 0
+            passages = []
+            correctness = []
+            for passage, pos in passage_and_pos:
+                if pos > 0 and pos_count < num_pos:
+                    passages.append(passage)
+                    correctness.append(pos)
+                    pos_count += 1
+                elif pos == 0 and neg_count < num_neg:
+                    passages.append(passage)
+                    correctness.append(pos)
+                    neg_count += 1
+        return passages, correctness
+
+    def passage_correctness(self, pid, positive_pids, positive_dids):
+        if pid in positive_pids:
+            return 1.0
+        elif positive_dids and pid[:pid.index('::')] in positive_dids:
+            return self.args['doc_match_weight']
+        else:
+            return 0
+
+    def train(self):
+        rand = random.Random()
+        while self.optimizer.should_continue():
+            self.optimizer.model.train()
+            dataset = block_shuffle(self.dataset, block_size=100000, rand=rand)
+            for line_ndx, jobj in enumerate(dataset):
+                inst_id = jobj['id']
+                if inst_id not in self.inst_id2pos_pids:
+                    continue
+                if line_ndx % self.args['world_size'] != \
+                        self.args['global_rank']:
+                    continue
+                query = jobj['input'] if 'input' in jobj else jobj['query']
+                passages = eval(jobj['passages'])
+                positive_pids = self.inst_id2pos_pids[inst_id]
+                if self.args['doc_match_weight'] > 0:
+                    positive_dids = [
+                        pid[:pid.index('::')] for pid in positive_pids
+                    ]
+                else:
+                    positive_dids = None
+                correctness = [
+                    self.passage_correctness(p['pid'], positive_pids,
+                                             positive_dids) for p in passages
+                ]
+                passages, correctness = self.limit_gpu_sequences(
+                    passages, correctness, rand)
+                logits = self.one_instance(query, passages)
+                # nll = -(logits[target_mask].sum())  # TODO: instead take the weighted sum
+                nll = -(
+                    logits.dot(torch.tensor(correctness).to(logits.device)))
+                loss_val = self.optimizer.step_loss(nll)
+                self.loss_history.note_loss(loss_val)
+                if not self.optimizer.should_continue():
+                    break
+        get_length = self.args['max_seq_length']
+        logger.info(f'loss_history = {self.loss_history.loss_history}')
+        logger.info(
+            f'truncated to max length ({get_length}) {self.max_length_count} times'
+        )
+        save_transformer(self.args, self.optimizer.model, self.tokenizer)
+
+
+class Reporting:
+
+    def __init__(self,
+                 *,
+                 recency_weight=0.001,
+                 report_interval_secs=300,
+                 check_every=1,
+                 gather_samples: Iterable = (),
+                 num_samples=10000):
+        """The Reporting to print parameter status
+
+        Args:
+            recency_weight: when computing the moving average, how much weight to give to the current sample.
+            report_interval_secs: how many seconds between returning true for is_time.
+            check_every: how often to check the time, when calling is_time.
+            gather_samples: keep the last num_samples of the listed names (gathered from moving_averages).
+            num_samples: how many samples to keep.
+        """
+        self.check_count = 0
+        self.check_every = check_every
+        self.start_time = time.time()
+        self.last_time = self.start_time
+        self.report_interval_secs = report_interval_secs
+        # For tracking moving averages of various values
+        self.names = None
+        self.averages = None
+        self.counts = None
+        self.recency_weight = recency_weight
+        self.per_value_recency_weight = dict()
+        self.report_count = 0
+        self._prev_check_count = 0
+        self.sample_names = list(gather_samples)
+        if len(self.sample_names) > 0:
+            self.sample_values = np.zeros(
+                (len(self.sample_names), num_samples), dtype=np.float32)
+            self.sample_ndxs = np.zeros(len(self.sample_names), dtype=np.int32)
+        else:
+            self.sample_values = None
+            self.sample_ndxs = None
+
+    def reset(self):
+        self.check_count = 0
+        self.start_time = time.time()
+        self.last_time = self.start_time
+        self.report_count = 0
+        self._prev_check_count = 0
+        if len(self.sample_names) > 0:
+            self.sample_values[:, :] = 0
+            self.sample_ndxs[:] = 0
+        if self.counts is not None:
+            self.counts[:] = 0
+            self.averages[:] = 0
+
+    def is_time(self):
+        self.check_count += 1
+        if self.check_count % self.check_every == 0:
+            elapsed = time.time() - self.last_time
+            if elapsed >= self.report_interval_secs:
+                # check the time more or less often
+                if self.check_every > 1 and self.check_count - self._prev_check_count < 5 * self.check_every:
+                    self.check_every //= 2
+                elif self.check_count - self._prev_check_count > 50 * self.check_every:
+                    self.check_every *= 2
+                self.last_time = time.time()
+                self.report_count += 1
+                self._prev_check_count = self.check_count
+                return True
+        return False
+
+    def moving_averages(self, **values):
+        # create entries in avgs and counts when needed
+        # update the avgs and counts
+        if self.names is None:
+            self.names = list(values.keys())
+            self.averages = np.zeros(len(self.names))
+            self.counts = np.zeros(len(self.names))
+        for name in values.keys():
+            if name not in self.names:
+                self.names.append(name)
+        if self.averages.shape[0] < len(self.names):
+            old_len = self.averages.shape[0]
+            self.averages = np.resize(self.averages, len(self.names))
+            self.averages[old_len:] = 0
+            self.counts = np.resize(self.counts, len(self.names))
+            self.counts[old_len:] = 0
+        for ndx, name in enumerate(self.names):
+            if name in values:
+                self.counts[ndx] += 1
+                # support per-name recency_weight
+                if name in self.per_value_recency_weight:
+                    rweight = max(self.per_value_recency_weight[name],
+                                  1.0 / self.counts[ndx])
+                else:
+                    rweight = max(self.recency_weight, 1.0 / self.counts[ndx])
+                self.averages[ndx] = \
+                    rweight * values[name] + (1.0 - rweight) * self.averages[ndx]
+        for ndx, name in enumerate(self.sample_names):
+            if name in values:
+                self.sample_values[self.sample_ndxs[ndx]] = values[name]
+                self.sample_ndxs[ndx] = (self.sample_ndxs[ndx]
+                                         + 1) % self.sample_values.shape[1]
+
+    def get_samples(self, name):
+        for ndx, n in enumerate(self.sample_names):
+            if n == name:
+                count = self.get_count(name)
+                if count is None:
+                    count = 0
+                return self.sample_values[ndx, 0:count]  # NOTE: not in order
+        return None
+
+    def get_moving_average(self, name):
+        if self.names is None:
+            return None
+        for ndx, n in enumerate(self.names):
+            if n == name:
+                return self.averages[ndx]
+        return None
+
+    def get_count(self, name):
+        if self.names is None:
+            return None
+        for ndx, n in enumerate(self.names):
+            if n == name:
+                return self.counts[ndx]
+        return None
+
+    def elapsed_seconds(self) -> float:
+        return time.time() - self.start_time
+
+    def elapsed_time_str(self) -> str:
+        return time_str(self.elapsed_seconds())
+
+    def progress_str(self, instance_name='instance'):
+        return f'On {instance_name} {self.check_count}, ' \
+               f'{self.check_count / self.elapsed_seconds()} {instance_name}s per second.'
+
+    def display(self, *, prefix=''):
+        # display the moving averages
+        logger.info('==========================================')
+        if self.names is not None:
+            for n, v in zip(self.names, self.averages):
+                logger.info(f'{prefix}{n} = {v}')
+
+    def display_warn(self, *, prefix=''):
+        # display the moving averages
+        logger.info('==========================================')
+        if self.names is not None:
+            for n, v in zip(self.names, self.averages):
+                logger.warning(f'{prefix}{n} = {v}')
+
+
+class LossHistory:
+
+    def __init__(self,
+                 one_epoch_batch_count,
+                 *,
+                 loss_points_per_epoch=10,
+                 recency_weight=0.001):
+        self.avg_loss = 0
+        self.batch_count = 0
+        self.recency_weight = recency_weight
+        self.loss_history = []
+        self.record_loss_every = max(
+            1, one_epoch_batch_count // loss_points_per_epoch)
+
+    def note_loss(self, loss_val):
+        self.batch_count += 1
+        rweight = max(self.recency_weight, 1.0 / self.batch_count)
+        self.avg_loss = (1.0 - rweight) * self.avg_loss + rweight * loss_val
+        if self.batch_count % self.record_loss_every == 0:
+            self.loss_history.append(self.avg_loss)
+            logger.info(
+                f'loss point {self.batch_count // self.record_loss_every} = {self.avg_loss}'
+            )
+            if self.avg_loss == min(
+                    self.loss_history) and len(self.loss_history) > 10:
+                return 2
+            return True
+        return False
+
+
+class TransformerOptimize:
+    """
+    Collects standard steps to train transformer
+    call step_loss after computing each loss
+    """
+
+    def __init__(self, hypers, num_instances_to_train_over: int, model):
+        self.step = 0
+        self.global_step = 0
+        self.hypers = hypers
+        self.model = model
+        instances_per_step = hypers['full_train_batch_size'] // hypers[
+            'gradient_accumulation_steps']
+        self.reporting = Reporting(recency_weight=0.0001 * instances_per_step)
+        args = self.hypers
+
+        self.t_total = num_instances_to_train_over // args[
+            'full_train_batch_size']
+
+        # Prepare optimizer and schedule (linear warmup and decay)
+        no_decay = ['bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {
+                'params': [
+                    p for n, p in self.model.named_parameters()
+                    if not any(nd in n for nd in no_decay)
+                ],
+                'weight_decay':
+                args['weight_decay'],
+            },
+            {
+                'params': [
+                    p for n, p in self.model.named_parameters()
+                    if any(nd in n for nd in no_decay)
+                ],
+                'weight_decay':
+                0.0
+            },
+        ]
+
+        warmup_instances = args['warmup_instances']
+        if hasattr(
+                args, 'warmup_fraction'
+        ) and args['warmup_fraction'] > 0 >= args['warmup_instances']:
+            warmup_instances = \
+                args['warmup_fraction'] * num_instances_to_train_over
+        if warmup_instances < 0:
+            warmup_instances = 0
+
+        self.optimizer = AdamW(
+            optimizer_grouped_parameters,
+            lr=args['learning_rate'],
+            eps=args['adam_epsilon'])
+        self.scheduler = get_linear_schedule_with_warmup(
+            self.optimizer,
+            num_warmup_steps=warmup_instances // args['full_train_batch_size'],
+            num_training_steps=self.t_total)
+
+        # Check if saved optimizer or scheduler states exist
+        if args['resume_from'] and os.path.isfile(os.path.join(args['resume_from'], 'optimizer.pt')) and \
+                os.path.isfile(os.path.join(args['resume_from'], 'scheduler.pt')):
+            resume_from = args['resume_from']
+        # elif os.path.isfile(os.path.join(args['model_name_or_path'], "optimizer.pt")) and \
+        #         os.path.isfile(os.path.join(args['model_name_or_path'], "scheduler.pt")):
+        #     resume_from = args['model_name_or_path']
+        else:
+            resume_from = None
+        if resume_from is not None:
+            # Load in optimizer and scheduler states
+            self.optimizer.load_state_dict(
+                torch.load(
+                    os.path.join(resume_from, 'optimizer.pt'),
+                    map_location='cpu'))
+            self.scheduler.load_state_dict(
+                torch.load(
+                    os.path.join(resume_from, 'scheduler.pt'),
+                    map_location='cpu'))
+            logger.info(f'loaded optimizer and scheduler from {resume_from}')
+
+        if args['fp16']:
+            self.model, optimizer = amp.initialize(
+                self.model, self.optimizer, opt_level=args['fp16_opt_level'])
+
+        # multi-gpu training (should be after apex fp16 initialization)
+        if args['n_gpu'] > 1:
+            # NOTE: won't work at O2, only O1
+            self.model = torch.nn.DataParallel(
+                self.model, device_ids=list(range(args['n_gpu'])))
+
+        # Distributed training (should be after apex fp16 initialization)
+        # if args.local_rank != -1:
+        #     self.model = torch.nn.parallel.DistributedDataParallel(
+        #         self.model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
+        #     )
+        # set_seed(args)
+        # assert args.per_gpu_train_batch_size * (args.n_gpu if args.n_gpu > 0 else 1) * \
+        #        args.world_size * args.gradient_accumulation_steps == args.full_train_batch_size
+        logger.info('***** Running training *****')
+        logger.info('  Instantaneous batch size per GPU = %d',
+                    args['per_gpu_train_batch_size'])
+        logger.info(
+            '  Total train batch size (w. parallel, distributed & accumulation) = %d',
+            args['full_train_batch_size'])
+        logger.info('  Gradient Accumulation steps = %d',
+                    args['gradient_accumulation_steps'])
+        logger.info('  Total optimization steps = %d', self.t_total)
+
+    def should_continue(self):
+        return self.global_step < self.t_total
+
+    def backward_on_loss(self, loss, **moving_averages):
+        if self.hypers['n_gpu'] > 1:
+            loss = loss.mean(
+            )  # mean() to average on multi-gpu parallel training
+        loss_val = loss.item()
+        if self.hypers['gradient_accumulation_steps'] > 1:
+            loss = loss / self.hypers['gradient_accumulation_steps']
+        if self.hypers['fp16']:
+            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
+                scaled_loss.backward()
+        else:
+            loss.backward()
+        self.reporting.moving_averages(loss=loss_val, **moving_averages)
+        return loss_val
+
+    def optimizer_step(self):
+        if self.global_step >= self.t_total:
+            logger.warning(
+                f'Warning, exceeded total steps! {self.global_step} step of {self.t_total}'
+            )
+            return False
+        if (self.step + 1) % self.hypers['gradient_accumulation_steps'] == 0:
+            if self.hypers['max_grad_norm'] > 0:
+                if self.hypers['fp16']:
+                    torch.nn.utils.clip_grad_norm_(
+                        amp.master_params(self.optimizer),
+                        self.hypers['max_grad_norm'])
+                else:
+                    torch.nn.utils.clip_grad_norm_(
+                        self.model.parameters(), self.hypers['max_grad_norm'])
+
+            self.optimizer.step()
+            self.scheduler.step()  # Update learning rate schedule
+            self.model.zero_grad()
+            self.global_step += 1
+        self.step += 1
+
+        if self.reporting.is_time():
+            self.reporting.display()
+            inst_count = \
+                self.hypers['world_size'] * self.hypers['n_gpu'] * self.hypers[
+                    'per_gpu_train_batch_size'] * self.reporting.check_count
+            learning_rate_scalar = self.scheduler.get_lr()[0]
+            logger.info(
+                f'{inst_count / self.reporting.elapsed_seconds()} instances per second; '
+                f'{inst_count} total ({learning_rate_scalar} learn rate)')
+        return True
+
+    def step_loss(self, loss, **moving_averages):
+        loss_val = self.backward_on_loss(loss, **moving_averages)
+        if self.optimizer_step():
+            return loss_val
+        else:
+            return None
+
+
+def block_shuffle(iter, *, block_size=20000, rand=random):
+    """
+    shuffle the possibly endless iterator by blocks
+    Good shuffling over multiple files:
+    block_shuffle(read_lines(files, shuffled_files=rand), rand=rand, block_size=100000)
+    :param iter: the iterator we will yield shuffled items from
+    :param block_size: size of memory to use for block shuffling
+    :param rand: rand.shuffle will be used on the list block
+    :return:
+    """
+    assert block_size >= 4
+    block = []
+    for item in iter:
+        block.append(item)
+        if len(block) >= block_size:
+            rand.shuffle(block)
+            for _ in range(block_size // 2):
+                yield block.pop(-1)
+    rand.shuffle(block)
+    for bi in block:
+        yield bi
+
+
+def save_transformer(hypers, model, tokenizer, *, save_dir=None):
+    if hypers['global_rank'] == 0:
+        if save_dir is None:
+            save_dir = hypers['output_dir']
+        # Create output directory if needed
+        os.makedirs(save_dir, exist_ok=True)
+        logger.info('Saving model checkpoint to %s', save_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = (model.module if hasattr(model, 'module') else model
+                         )  # Take care of distributed/parallel training
+        torch.save(hypers, os.path.join(save_dir, 'training_args.bin'))
+        model_to_save.save_pretrained(save_dir)
+        if tokenizer is not None:
+            tokenizer.save_pretrained(save_dir)
+
+
+def kofn(kofn: str):
+    """
+    ''     -> 0, 1
+    '1of2' -> 0, 2
+    '2of2' -> 1, 2
+    :param kofn:
+    :return:
+    """
+    if not kofn:
+        return 0, 1
+    k, n = [int(i) for i in kofn.lower().split('of')]
+    assert 1 <= k <= n
+    return k - 1, n
+
+
+def set_seed(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
diff --git a/modelscope/trainers/nlp/document_grounded_dialog_retrieval_trainer.py b/modelscope/trainers/nlp/document_grounded_dialog_retrieval_trainer.py
new file mode 100644
index 00000000..ba670625
--- /dev/null
+++ b/modelscope/trainers/nlp/document_grounded_dialog_retrieval_trainer.py
@@ -0,0 +1,216 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+
+import faiss
+import json
+import numpy as np
+import torch
+import tqdm
+from torch.utils.data import DataLoader
+from transformers import AdamW, get_scheduler
+
+from modelscope.metainfo import Trainers
+from modelscope.models import Model
+from modelscope.preprocessors import \
+    DocumentGroundedDialogRetrievalPreprocessor
+from modelscope.trainers import EpochBasedTrainer
+from modelscope.trainers.builder import TRAINERS
+from modelscope.utils.constant import ModeKeys
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def collate(batch):
+    query = [item['query'] for item in batch]
+    positive = [item['positive'] for item in batch]
+    negative = [item['negative'] for item in batch]
+    return query, positive, negative
+
+
+def prepare_optimizer(model, lr, weight_decay, eps):
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [{
+        'params': [
+            p for n, p in model.named_parameters()
+            if not any(nd in n for nd in no_decay)
+        ],
+        'weight_decay':
+        weight_decay,
+    }, {
+        'params': [
+            p for n, p in model.named_parameters()
+            if any(nd in n for nd in no_decay)
+        ],
+        'weight_decay':
+        0.0,
+    }]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=eps)
+    return optimizer
+
+
+def prepare_scheduler(optimizer, epochs, steps_per_epoch, warmup_rate):
+    total_steps = epochs * steps_per_epoch
+    warmup_steps = int(total_steps * warmup_rate)
+    scheduler = get_scheduler(
+        name='linear',
+        optimizer=optimizer,
+        num_warmup_steps=warmup_steps,
+        num_training_steps=total_steps)
+    return scheduler
+
+
+def measure_result(result_dict):
+    recall_k = [1, 5, 10, 20]
+    meters = {f'R@{k}': [] for k in recall_k}
+
+    for output, target in zip(result_dict['outputs'], result_dict['targets']):
+        for k in recall_k:
+            if target in output[:k]:
+                meters[f'R@{k}'].append(1)
+            else:
+                meters[f'R@{k}'].append(0)
+    for k, v in meters.items():
+        meters[k] = sum(v) / len(v)
+    return meters
+
+
+@TRAINERS.register_module(
+    module_name=Trainers.document_grounded_dialog_retrieval_trainer)
+class DocumentGroundedDialogRetrievalTrainer(EpochBasedTrainer):
+
+    def __init__(self, model: str, revision='v1.0.0', *args, **kwargs):
+        self.model = Model.from_pretrained(model, revision=revision)
+        self.preprocessor = DocumentGroundedDialogRetrievalPreprocessor(
+            model_dir=self.model.model_dir)
+        self.device = self.preprocessor.device
+        self.model.model.to(self.device)
+        self.train_dataset = kwargs['train_dataset']
+        self.eval_dataset = kwargs['eval_dataset']
+        self.all_passages = kwargs['all_passages']
+
+    def train(self,
+              total_epoches=20,
+              batch_size=128,
+              per_gpu_batch_size=32,
+              accumulation_steps=1,
+              learning_rate=2e-5,
+              warmup_ratio=0.1,
+              weight_decay=0.1,
+              eps=1e-06,
+              loss_log_freq=40):
+        """
+        Fine-tuning trainsets
+        """
+        # obtain train loader
+        train_loader = DataLoader(
+            dataset=self.train_dataset,
+            batch_size=batch_size,
+            shuffle=True,
+            collate_fn=collate)
+
+        optimizer = prepare_optimizer(self.model.model, learning_rate,
+                                      weight_decay, eps)
+        steps_per_epoch = len(train_loader) // accumulation_steps
+        scheduler = prepare_scheduler(optimizer, total_epoches,
+                                      steps_per_epoch, warmup_ratio)
+
+        best_score = 0.0
+        for epoch in range(total_epoches):
+            self.model.model.train()
+            losses = []
+            for index, payload in enumerate(tqdm.tqdm(train_loader)):
+                query, positive, negative = payload
+                processed = self.preprocessor(
+                    {
+                        'query': query,
+                        'positive': positive,
+                        'negative': negative
+                    },
+                    invoke_mode=ModeKeys.TRAIN)
+                loss, logits = self.model.forward(processed)
+
+                if accumulation_steps > 1:
+                    loss = loss / accumulation_steps
+
+                loss.backward()
+
+                if (index + 1) % accumulation_steps == 0:
+                    optimizer.step()
+                    scheduler.step()
+                    optimizer.zero_grad()
+                losses.append(loss.item())
+                if (index + 1) % loss_log_freq == 0:
+                    logger.info(
+                        f'epoch: {epoch} \t batch: {batch_size * index} \t loss: {sum(losses) / len(losses)}'
+                    )
+                    losses = []
+            if losses:
+                logger.info(
+                    f'epoch: {epoch} \t batch: last \t loss: {sum(losses) / len(losses)}'
+                )
+
+            meters = self.evaluate(per_gpu_batch_size=per_gpu_batch_size)
+            total_score = sum([x for x in meters.values()])
+            if total_score >= best_score:
+                best_score = total_score
+                model_path = os.path.join(self.model.model_dir,
+                                          'finetuned_model.bin')
+                state_dict = self.model.model.state_dict()
+                torch.save(state_dict, model_path)
+                logger.info(
+                    'epoch %d obtain max score: %.4f, saving model to %s' %
+                    (epoch, total_score, model_path))
+
+    def evaluate(self, per_gpu_batch_size=32, checkpoint_path=None):
+        """
+        Evaluate testsets
+        """
+        if checkpoint_path is not None:
+            state_dict = torch.load(checkpoint_path)
+            self.model.model.load_state_dict(state_dict)
+
+        valid_loader = DataLoader(
+            dataset=self.eval_dataset,
+            batch_size=per_gpu_batch_size,
+            collate_fn=collate)
+        self.model.model.eval()
+        with torch.no_grad():
+            all_ctx_vector = []
+            for mini_batch in tqdm.tqdm(
+                    range(0, len(self.all_passages), per_gpu_batch_size)):
+                context = self.all_passages[mini_batch:mini_batch
+                                            + per_gpu_batch_size]
+                processed = \
+                    self.preprocessor({'context': context},
+                                      invoke_mode=ModeKeys.INFERENCE,
+                                      input_type='context')
+                sub_ctx_vector = self.model.encode_context(
+                    processed).detach().cpu().numpy()
+                all_ctx_vector.append(sub_ctx_vector)
+
+            all_ctx_vector = np.concatenate(all_ctx_vector, axis=0)
+            all_ctx_vector = np.array(all_ctx_vector).astype('float32')
+            faiss_index = faiss.IndexFlatIP(all_ctx_vector.shape[-1])
+            faiss_index.add(all_ctx_vector)
+
+            results = {'outputs': [], 'targets': []}
+            for index, payload in enumerate(tqdm.tqdm(valid_loader)):
+                query, positive, negative = payload
+                processed = self.preprocessor({'query': query},
+                                              invoke_mode=ModeKeys.INFERENCE)
+                query_vector = self.model.encode_query(
+                    processed).detach().cpu().numpy().astype('float32')
+                D, Index = faiss_index.search(query_vector, 20)
+                results['outputs'] += [[
+                    self.all_passages[x] for x in retrieved_ids
+                ] for retrieved_ids in Index.tolist()]
+                results['targets'] += positive
+            meters = measure_result(results)
+            result_path = os.path.join(self.model.model_dir,
+                                       'evaluate_result.json')
+            with open(result_path, 'w') as f:
+                json.dump(results, f, ensure_ascii=False, indent=4)
+
+        logger.info(meters)
+        return meters
diff --git a/modelscope/trainers/nlp/faq_question_answering_trainer.py b/modelscope/trainers/nlp/faq_question_answering_trainer.py
index a4a78cf7..dc6f0426 100644
--- a/modelscope/trainers/nlp/faq_question_answering_trainer.py
+++ b/modelscope/trainers/nlp/faq_question_answering_trainer.py
@@ -64,6 +64,9 @@ class EpisodeSampler(torch.utils.data.BatchSampler):
         self.episode = n_iter
         domain_label_sampleid = {}
         bad_sample_ids = self.get_bad_sampleids(dataset)
+        if dataset.mode == 'train':
+            logger.info(
+                f'num. of bad sample ids:{len(bad_sample_ids)}/{len(dataset)}')
         for sample_index, sample in enumerate(dataset):
             if sample_index in bad_sample_ids:
                 continue
@@ -95,7 +98,9 @@ class EpisodeSampler(torch.utils.data.BatchSampler):
                 data_size += len(tokens)
         if dataset.mode == 'train':
             logger.info(
-                f'{dataset.mode}: label size:{total}, data size:{data_size}')
+                f'{dataset.mode}: label size:{total}, data size:{data_size}, \
+                domain_size:{len(self.domain_label_tokens)}')
+        self.mode = dataset.mode
 
     def __iter__(self):
         for i in range(self.episode):
@@ -109,18 +114,21 @@ class EpisodeSampler(torch.utils.data.BatchSampler):
                     list(self.domain_label_tokens[domain].keys()))
                 N = min(self.n_way, len(all_labels))
                 labels = np.random.choice(
-                    all_labels, size=min(N, len(all_labels)), replace=False)
+                    all_labels, size=min(N, len(all_labels)),
+                    replace=False).tolist()
                 batch = []
                 for label in labels[:N]:
                     candidates = self.domain_label_tokens[domain][label]
-                    K = min(len(candidates), int((self.k_shot + self.r_query)))
-                    tmp = np.random.choice(candidates, size=K, replace=False)
+                    num_samples = self.k_shot + self.r_query
+                    K = min(len(candidates), int(num_samples))
+                    tmp = np.random.choice(
+                        candidates, size=K, replace=False).tolist()
                     batch.extend(tmp)
                 batch = [int(n) for n in batch]
                 yield batch
 
     def _get_field(self, obj, key, default=None):
-        value = getattr(obj, key, default) or obj.get(key, default)
+        value = obj.get(key, default)
         if value is not None:
             return str(value)
         return None
diff --git a/modelscope/trainers/nlp/sentence_embedding_trainer.py b/modelscope/trainers/nlp/sentence_embedding_trainer.py
new file mode 100644
index 00000000..b2116443
--- /dev/null
+++ b/modelscope/trainers/nlp/sentence_embedding_trainer.py
@@ -0,0 +1,105 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import time
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from torch import nn
+from torch.utils.data import DataLoader, Dataset
+from tqdm import tqdm
+from transformers import DataCollatorWithPadding
+
+from modelscope.metainfo import Trainers
+from modelscope.models.base import Model, TorchModel
+from modelscope.models.nlp import BertForTextRanking
+from modelscope.msdatasets.ms_dataset import MsDataset
+from modelscope.preprocessors.base import Preprocessor
+from modelscope.trainers.builder import TRAINERS
+from modelscope.trainers.nlp_trainer import NlpEpochBasedTrainer
+from modelscope.utils.constant import DEFAULT_MODEL_REVISION
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@dataclass
+class SentenceEmbeddingCollator(DataCollatorWithPadding):
+    """
+    Wrapper that does conversion from List[Tuple[encode_qry, encode_psg]] to List[qry], List[psg]
+    and pass batch separately to the actual collator.
+    Abstract out data detail for the model.
+    """
+    max_length = 128
+    tokenizer = None
+
+    def __call__(self, features):
+        qq = [f['query'] for f in features]
+        dd = [f['docs'] for f in features]
+        keys = qq[0].keys()
+        qq = {k: [ele[k] for ele in qq] for k in keys}
+        q_collated = self.tokenizer._tokenizer.pad(
+            qq,
+            padding='max_length',
+            max_length=self.max_length,
+            return_tensors='pt')
+        keys = dd[0].keys()
+        dd = {k: sum([ele[k] for ele in dd], []) for k in keys}
+        d_collated = self.tokenizer._tokenizer.pad(
+            dd,
+            padding='max_length',
+            max_length=self.max_length,
+            return_tensors='pt')
+        return {'query': q_collated, 'docs': d_collated}
+
+
+@TRAINERS.register_module(module_name=Trainers.nlp_sentence_embedding_trainer)
+class SentenceEmbeddingTrainer(NlpEpochBasedTrainer):
+
+    def __init__(
+            self,
+            model: Optional[Union[TorchModel, nn.Module, str]] = None,
+            cfg_file: Optional[str] = None,
+            cfg_modify_fn: Optional[Callable] = None,
+            arg_parse_fn: Optional[Callable] = None,
+            data_collator: Optional[Callable] = None,
+            train_dataset: Optional[Union[MsDataset, Dataset]] = None,
+            eval_dataset: Optional[Union[MsDataset, Dataset]] = None,
+            preprocessor: Optional[Preprocessor] = None,
+            optimizers: Tuple[torch.optim.Optimizer,
+                              torch.optim.lr_scheduler._LRScheduler] = (None,
+                                                                        None),
+            model_revision: Optional[str] = DEFAULT_MODEL_REVISION,
+            **kwargs):
+
+        super().__init__(
+            model=model,
+            cfg_file=cfg_file,
+            cfg_modify_fn=cfg_modify_fn,
+            arg_parse_fn=arg_parse_fn,
+            data_collator=data_collator,
+            preprocessor=preprocessor,
+            optimizers=optimizers,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            model_revision=model_revision,
+            **kwargs)
+
+    def get_data_collator(self, data_collator):
+        """Get the data collator for both training and evaluating.
+
+        Args:
+            data_collator: The input data_collator param.
+
+        Returns:
+            The train_data_collator and eval_data_collator, can be None.
+        """
+        if data_collator is None:
+            data_collator = SentenceEmbeddingCollator(
+                tokenizer=self.train_preprocessor.nlp_tokenizer,
+                max_length=self.train_preprocessor.max_length)
+        return super().get_data_collator(data_collator)
+
+    def evauate(self):
+        return {}
diff --git a/modelscope/trainers/nlp/sequence_classification_trainer.py b/modelscope/trainers/nlp/sequence_classification_trainer.py
index ec46e037..2ea4b341 100644
--- a/modelscope/trainers/nlp/sequence_classification_trainer.py
+++ b/modelscope/trainers/nlp/sequence_classification_trainer.py
@@ -37,10 +37,11 @@ class SequenceClassificationTrainer(BaseTrainer):
         """get attribute from config, if the attribute does exist, return false
 
         Example:
+
         >>> self.__attr_is_exist("model path")
-        out: (model-path, "/workspace/bert-base-sst2")
+        >>> out: (model-path, "/workspace/bert-base-sst2")
         >>> self.__attr_is_exist("model weights")
-        out: (model-weights, False)
+        >>> out: (model-weights, False)
 
         Args:
             attr (str): attribute str, "model path" -> config["model"][path]
diff --git a/modelscope/trainers/nlp/table_question_answering_trainer.py b/modelscope/trainers/nlp/table_question_answering_trainer.py
index 49d88874..35b399a6 100644
--- a/modelscope/trainers/nlp/table_question_answering_trainer.py
+++ b/modelscope/trainers/nlp/table_question_answering_trainer.py
@@ -38,7 +38,7 @@ class TableQuestionAnsweringTrainer(BaseTrainer):
                                         num_training_steps,
                                         last_epoch=-1):
         """
-        set scheduler
+        set scheduler.
         """
 
         def lr_lambda(current_step: int):
diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py
index 87d175a2..1bada221 100644
--- a/modelscope/trainers/nlp_trainer.py
+++ b/modelscope/trainers/nlp_trainer.py
@@ -1,429 +1,21 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import os
-from copy import deepcopy
-from dataclasses import dataclass, field
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Tuple, Union
 
 import numpy as np
-import torch
 from torch import nn
-from torch.utils.data import Dataset
 
 from modelscope.metainfo import Trainers
 from modelscope.metrics.builder import build_metric
 from modelscope.models.base import Model, TorchModel
-from modelscope.msdatasets import MsDataset
 from modelscope.preprocessors import Preprocessor
-from modelscope.utils.config import Config, ConfigDict
-from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ModeKeys,
-                                       ModelFile)
-from modelscope.utils.hub import parse_label_mapping
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModeKeys
 from .base import TRAINERS
 from .trainer import EpochBasedTrainer
 
 
-@dataclass
-class NlpTrainerArguments:
-    """The arguments for the nlp trainer.
-
-    All the arguments listed here have None default values, which means follow the default value in the input
-    cfg dict.
-    """
-
-    work_dir: Optional[str] = field(
-        default=None, metadata={'help': 'The work dir(key: train.work_dir)'})
-
-    task: Optional[str] = field(
-        default=None, metadata={'help': 'The task type(key: task)'})
-
-    preprocessor_type: Optional[str] = field(
-        default=None,
-        metadata={'help': 'The preprocessor type(key: preprocessor.type)'})
-
-    train_first_sequence: str = field(
-        default=None,
-        metadata={
-            'help':
-            'The key of first sentence for the training dataset(key:preprocessor.train.'
-            'first_sequence/dataset.train.first_sequence)'
-        })
-
-    train_second_sequence: Optional[str] = field(
-        default=None,
-        metadata={
-            'help':
-            'The key of second sentence for the training dataset(key:preprocessor.train.'
-            'second_sequence/dataset.train.second_sequence)'
-        })
-
-    train_label: str = field(
-        default=None,
-        metadata={
-            'help':
-            'The key of label for the training dataset(key:preprocessor.train.'
-            'second_sequence/dataset.train.second_sequence)'
-        })
-
-    eval_first_sequence: Optional[str] = field(
-        default=None,
-        metadata={
-            'help':
-            'The key of first sentence for the eval dataset(key:preprocessor.val.'
-            'first_sequence/dataset.val.first_sequence), '
-            'if not provided, the trainer will use the train_first_sequence for evaluation'
-        })
-
-    eval_second_sequence: Optional[str] = field(
-        default=None,
-        metadata={
-            'help':
-            'The key of second sentence for the eval dataset(key:preprocessor.val.'
-            'second_sequence/dataset.val.second_sequence),'
-            'if not provided, the trainer will use the train_second_sequence for evaluation'
-        })
-
-    eval_label: Optional[str] = field(
-        default=None,
-        metadata={
-            'help':
-            'The key of label for the eval dataset(key:preprocessor.val.'
-            'second_sequence/dataset.val.second_sequence),'
-            'if not provided, the trainer will use the train_label for evaluation'
-        })
-
-    labels: Optional[List] = field(
-        default=None,
-        metadata={
-            'help':
-            'The labels list of the dataset(key:dataset.train.labels),'
-            'This parameter has the same effect with "label2id"'
-        })
-
-    max_epochs: Optional[int] = field(
-        default=None,
-        metadata={
-            'help':
-            'The max_epochs of the training loop(key: train.max_epochs)'
-        })
-
-    train_batch_size_per_gpu: Optional[int] = field(
-        default=None,
-        metadata={
-            'help':
-            'The train batch size per gpu(key: train.dataloader.batch_size_per_gpu)'
-        })
-
-    train_workers_per_gpu: Optional[int] = field(
-        default=None,
-        metadata={
-            'help':
-            'The number of workers per gpu(key: train.dataloader.workers_per_gpu)'
-        })
-
-    train_shuffle: Optional[bool] = field(
-        default=None,
-        metadata={
-            'help':
-            'Shuffle the train dataset or not(key: train.dataloader.shuffle)'
-        })
-
-    eval_batch_size_per_gpu: Optional[int] = field(
-        default=None,
-        metadata={
-            'help':
-            'The eval batch size per gpu(key: evaluation.dataloader.batch_size_per_gpu)'
-        })
-
-    eval_workers_per_gpu: Optional[int] = field(
-        default=None,
-        metadata={
-            'help':
-            'The number of workers per gpu(key: evaluation.dataloader.workers_per_gpu)'
-        })
-
-    eval_shuffle: Optional[bool] = field(
-        default=None,
-        metadata={
-            'help':
-            'Shuffle the eval dataset or not(key: evaluation.dataloader.shuffle)'
-        })
-
-    optimizer_args: Optional[Dict] = field(
-        default=None,
-        metadata={'help': 'The optimizer config dict(key: train.optimizer)'})
-
-    lr_scheduler_args: Optional[Dict] = field(
-        default=None,
-        metadata={
-            'help': 'The lr_scheduler config dict(key: train.lr_scheduler)'
-        })
-
-    checkpoint_saving_type: Optional[str] = field(
-        default=None,
-        metadata={
-            'help':
-            'The checkpoint saving type(key: The ckpt hook dict in train.hooks), '
-            'valid options: "BestCkptSaverHook", "CheckpointHook"'
-        })
-
-    checkpoint_by_epoch: Optional[bool] = field(
-        default=None,
-        metadata={
-            'help':
-            'Saving checkpoint by epoch or not(key: The by_epoch key in '
-            'ckpt hook dict in train.hooks)'
-        })
-
-    checkpoint_interval: Optional[int] = field(
-        default=None,
-        metadata={
-            'help':
-            'The checkpoint saving interval(key: The interval key in '
-            'ckpt hook dict in train.hooks)'
-        })
-
-    metric_key: Optional[str] = field(
-        default=None,
-        metadata={
-            'help':
-            'The metric key for the BestCkptSaverHook(key: The metric_key key in '
-            'ckpt hook dict in train.hooks), if the checkpoint_saving_type is "CheckpointHook" or '
-            '"None", the metric_key key has no effects'
-        })
-
-    evaluation_type: Optional[str] = field(
-        default=None,
-        metadata={
-            'help':
-            'The evaluation type(key: The evaluation hook dict in train.hooks), '
-            'valid options: "EvaluationHook", "None"'
-        })
-
-    evaluation_by_epoch: Optional[bool] = field(
-        default=None,
-        metadata={
-            'help':
-            'Evaluating by epoch or not(key: The by_epoch key in '
-            'evaluation hook dict in train.hooks)'
-        })
-
-    evaluation_interval: Optional[int] = field(
-        default=None,
-        metadata={
-            'help':
-            'The evaluating interval(key: The interval key in '
-            'evaluation hook dict in train.hooks)'
-        })
-
-    metrics: Optional[List[str]] = field(
-        default=None,
-        metadata={'help': 'The metrics class keys(key: evaluation.metrics)'})
-
-    default_train_config = ConfigDict({
-        'work_dir':
-        '/tmp',
-        'max_epochs':
-        5,
-        'dataloader': {
-            'batch_size_per_gpu': 32,
-            'workers_per_gpu': 0
-        },
-        'optimizer': {
-            'type': 'AdamW',
-            'lr': 2e-5,
-            'options': {}
-        },
-        'lr_scheduler': {
-            'type': 'LinearLR',
-            'start_factor': 1.0,
-            'end_factor': 0.0,
-            'total_iters': 10000,
-            'options': {
-                'by_epoch': False
-            }
-        },
-        'hooks': [{
-            'type': 'CheckpointHook',
-            'by_epoch': False,
-            'interval': 100
-        }, {
-            'type': 'TextLoggerHook',
-            'interval': 1
-        }, {
-            'type': 'IterTimerHook'
-        }, {
-            'type': 'EvaluationHook',
-            'by_epoch': False,
-            'interval': 100
-        }]
-    })
-
-    def __call__(self, cfg):
-        """
-
-        Args:
-            cfg(`Config`): The cfg to be modified.
-
-        Returns:
-            The cfg after modification.
-        """
-
-        if self.task is not None:
-            cfg.task = self.task
-
-        if self.preprocessor_type is not None:
-            if not hasattr(cfg, 'preprocessor'):
-                cfg.preprocessor = ConfigDict()
-            cfg.preprocessor.type = self.preprocessor_type
-
-        if self.train_first_sequence is not None or self.train_second_sequence \
-                is not None or self.train_label is not None or self.labels is not None:
-            if not hasattr(cfg, 'dataset'):
-                cfg.dataset = ConfigDict()
-            if not hasattr(cfg.dataset, 'train'):
-                cfg.dataset.train = ConfigDict()
-            if self.train_first_sequence is not None:
-                cfg.dataset.train.first_sequence = self.train_first_sequence
-            if self.train_second_sequence is not None:
-                cfg.dataset.train.second_sequence = self.train_second_sequence
-            if self.train_label is not None:
-                cfg.dataset.train.label = self.train_label
-            if self.labels is not None:
-                cfg.dataset.train.labels = self.labels
-
-        if self.eval_first_sequence is not None or self.eval_second_sequence \
-                is not None or self.eval_label is not None:
-            if not hasattr(cfg, 'dataset'):
-                cfg.dataset = ConfigDict()
-            if not hasattr(cfg.dataset, 'val'):
-                cfg.dataset.val = ConfigDict()
-            if self.eval_first_sequence is not None:
-                cfg.dataset.val.first_sequence = self.eval_first_sequence
-            if self.eval_second_sequence is not None:
-                cfg.dataset.val.second_sequence = self.eval_second_sequence
-            if self.eval_label is not None:
-                cfg.dataset.val.label = self.eval_label
-
-        if self.max_epochs is not None or self.train_batch_size_per_gpu is not None \
-                or self.train_shuffle is not None or self.optimizer_args is not None \
-                or self.work_dir is not None or self.lr_scheduler_args is not None\
-                or self.train_workers_per_gpu is not None:
-            if not hasattr(cfg, 'train'):
-                cfg.train = deepcopy(self.default_train_config)
-            if not hasattr(cfg.train, 'dataloader'):
-                cfg.train.dataloader = deepcopy(
-                    self.default_train_config.dataloader)
-            if not hasattr(cfg.train, 'optimizer'):
-                cfg.train.optimizer = deepcopy(
-                    self.default_train_config.optimizer)
-            if not hasattr(cfg.train, 'lr_scheduler'):
-                cfg.train.lr_scheduler = deepcopy(
-                    self.default_train_config.lr_scheduler)
-            if self.work_dir is not None:
-                cfg.train.work_dir = self.work_dir
-            if self.max_epochs is not None:
-                cfg.train.max_epochs = self.max_epochs
-            if self.train_batch_size_per_gpu is not None:
-                cfg.train.dataloader.batch_size_per_gpu = self.train_batch_size_per_gpu
-            if self.train_workers_per_gpu is not None:
-                cfg.train.dataloader.workers_per_gpu = self.train_workers_per_gpu
-            if self.train_shuffle is not None:
-                cfg.train.dataloader.shuffle = self.train_shuffle
-            if self.optimizer_args is not None:
-                if cfg.train.optimizer.type != self.optimizer_args.get(
-                        'type', cfg.train.optimizer.type):
-                    cfg.train.optimizer = ConfigDict(
-                        deepcopy(self.optimizer_args))
-                else:
-                    cfg.train.optimizer = Config._merge_a_into_b(
-                        self.optimizer_args, cfg.train.optimizer, force=True)
-            if self.lr_scheduler_args is not None:
-                if cfg.train.lr_scheduler.type != self.lr_scheduler_args.get(
-                        'type', cfg.train.lr_scheduler.type):
-                    cfg.train.lr_scheduler = ConfigDict(
-                        deepcopy(self.lr_scheduler_args))
-                else:
-                    cfg.train.lr_scheduler = Config._merge_a_into_b(
-                        self.lr_scheduler_args,
-                        cfg.train.lr_scheduler,
-                        force=True)
-
-        if self.checkpoint_saving_type is not None or self.checkpoint_by_epoch is not None \
-                or self.checkpoint_interval is not None or self.metric_key is not None:
-            if not any([
-                    self.checkpoint_saving_type == hook['type']
-                    for hook in cfg.train.hooks
-            ]):
-                cfg.train.hooks = list(
-                    filter(
-                        lambda hook: hook['type'] not in
-                        ['CheckpointHook', 'BestCkptSaverHook'],
-                        cfg.train.hooks))
-                cfg.train.hooks.append(
-                    deepcopy(self.default_train_config.hooks[0]))
-                cfg.train.hooks[-1].type = self.checkpoint_saving_type
-            checkpoint_hook = list(
-                filter(
-                    lambda hook: hook[
-                        'type'] in ['CheckpointHook', 'BestCkptSaverHook'],
-                    cfg.train.hooks))[0]
-            if self.checkpoint_by_epoch is not None:
-                checkpoint_hook['by_epoch'] = self.checkpoint_by_epoch
-            if self.checkpoint_interval is not None:
-                checkpoint_hook['interval'] = self.checkpoint_interval
-            if checkpoint_hook['type'] == 'BestCkptSaverHook':
-                assert self.metric_key is not None, 'The metric_key must be provided ' \
-                                                    'if the ckpt saving hook is "BestCkptSaverHook"'
-                checkpoint_hook['metric_key'] = self.metric_key
-
-        if self.evaluation_type is not None or self.evaluation_by_epoch is not None \
-                or self.evaluation_interval is not None or self.eval_batch_size_per_gpu is not None or \
-                self.eval_shuffle is not None or self.metrics is not None:
-            if self.evaluation_type is not None and not any([
-                    self.evaluation_type == hook['type']
-                    for hook in cfg.train.hooks
-            ]):
-                cfg.train.hooks = list(
-                    filter(lambda hook: hook['type'] not in ['EvaluationHook'],
-                           cfg.train.hooks))
-                if self.evaluation_type != 'None':
-                    cfg.train.hooks.append(
-                        deepcopy(self.default_train_config.hooks[3]))
-                    cfg.train.hooks[-1].type = self.evaluation_type
-
-            evaluation_hook = list(
-                filter(lambda hook: hook['type'] in ['EvaluationHook'],
-                       cfg.train.hooks))
-            evaluation_hook = evaluation_hook[0] if len(
-                evaluation_hook) > 0 else None
-
-            if evaluation_hook is not None and self.evaluation_by_epoch is not None:
-                evaluation_hook['by_epoch'] = self.evaluation_by_epoch
-            if evaluation_hook is not None and self.evaluation_interval is not None:
-                evaluation_hook['interval'] = self.evaluation_interval
-
-            if not hasattr(cfg, 'evaluation'):
-                cfg.evaluation = ConfigDict({
-                    'dataloader': {
-                        'batch_size_per_gpu': 32,
-                        'workers_per_gpu': 0,
-                        'shuffle': False
-                    }
-                })
-
-            if self.metrics is not None:
-                cfg.evaluation.metrics = self.metrics
-            if self.eval_batch_size_per_gpu is not None:
-                cfg.evaluation.dataloader.batch_size_per_gpu = self.eval_batch_size_per_gpu
-            if self.eval_workers_per_gpu is not None:
-                cfg.evaluation.dataloader.workers_per_gpu = self.eval_workers_per_gpu
-            if self.eval_shuffle is not None:
-                cfg.evaluation.dataloader.shuffle = self.eval_shuffle
-
-        return cfg
-
-
 @TRAINERS.register_module(module_name=Trainers.nlp_base_trainer)
 class NlpEpochBasedTrainer(EpochBasedTrainer):
     """Add code to adapt with nlp models.
diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py
index bbe9d006..715d9946 100644
--- a/modelscope/trainers/trainer.py
+++ b/modelscope/trainers/trainer.py
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import inspect
 import os
 import time
 from collections.abc import Mapping
@@ -16,6 +17,8 @@ from torch.utils.data.distributed import DistributedSampler
 
 from modelscope.metainfo import Trainers
 from modelscope.metrics import build_metric, task_default_metrics
+from modelscope.metrics.prediction_saving_wrapper import \
+    PredictionSavingWrapper
 from modelscope.models.base import Model, TorchModel
 from modelscope.msdatasets.ms_dataset import MsDataset
 from modelscope.msdatasets.task_datasets.builder import build_task_dataset
@@ -27,7 +30,7 @@ from modelscope.trainers.hooks.builder import HOOKS
 from modelscope.trainers.hooks.priority import Priority, get_priority
 from modelscope.trainers.lrscheduler.builder import build_lr_scheduler
 from modelscope.trainers.optimizer.builder import build_optimizer
-from modelscope.utils.config import Config, ConfigDict
+from modelscope.utils.config import Config, ConfigDict, JSONIteratorEncoder
 from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields,
                                        ConfigKeys, ModeKeys, ModelFile,
                                        TrainerStages)
@@ -41,7 +44,7 @@ from modelscope.utils.torch_utils import (broadcast, get_dist_info,
                                           is_master, set_random_seed)
 from .base import BaseTrainer
 from .builder import TRAINERS
-from .default_config import merge_cfg
+from .default_config import merge_cfg, merge_hooks
 from .hooks.hook import Hook
 from .parallel.builder import build_parallel
 from .parallel.utils import is_parallel
@@ -76,12 +79,15 @@ class EpochBasedTrainer(BaseTrainer):
         seed (int): The optional random seed for torch, cuda, numpy and random.
         max_epochs: (int, optional): Total training epochs.
         cfg_modify_fn: An input fn which is used to modify the cfg read out of the file.
+        remove_unused_data: Automatically remove unused data keys in mini-batches.
+            The remove action based on the `inspect` on the model's forward method, the removed columns will be
+            moved to the mini-batch's attributes.
 
         Examples of cfg_modify_fn:
-        >>> def cfg_modify_fn(cfg):
-        >>>     cfg.preprocessor.first_sequence= 'text1'
-        >>>     cfg.preprocessor.second_sequence='text2'
-        >>>     return cfg
+            >>> def cfg_modify_fn(cfg):
+            >>>     cfg.preprocessor.first_sequence= 'text1'
+            >>>     cfg.preprocessor.second_sequence='text2'
+            >>>     return cfg
     """
 
     def __init__(
@@ -129,6 +135,15 @@ class EpochBasedTrainer(BaseTrainer):
         # add default config
         merge_cfg(self.cfg)
         self.cfg = self.rebuild_config(self.cfg)
+        self.logger = get_logger(log_level=self.cfg.get('log_level', 'INFO'))
+        self.logger.info(
+            '==========================Training Config Start=========================='
+        )
+        self.logger.info(
+            json.dumps(self.cfg._cfg_dict, indent=4, cls=JSONIteratorEncoder))
+        self.logger.info(
+            '===========================Training Config End==========================='
+        )
         if 'cfg_options' in kwargs:
             self.cfg.merge_from_dict(kwargs['cfg_options'])
 
@@ -171,7 +186,8 @@ class EpochBasedTrainer(BaseTrainer):
             **kwargs)
 
         self.train_data_collator, self.eval_data_collator = self.get_data_collator(
-            data_collator)
+            data_collator,
+            remove_unused_data=kwargs.get('remove_unused_data', False))
         self.metrics = self.get_metrics()
         self._max_epochs = kwargs.get('max_epochs',
                                       self.cfg.safe_get('train.max_epochs'))
@@ -195,12 +211,12 @@ class EpochBasedTrainer(BaseTrainer):
             if not is_parallel(self.model) and self._dist:
                 self.model = self.to_parallel(self.model)
 
-    def get_data_collator(self, data_collator):
+    def get_data_collator(self, data_collator, remove_unused_data=False):
         """Get the data collator for both training and evaluating.
 
         Args:
             data_collator: The input data_collator param.
-
+            remove_unused_data: Remove the unused data with 'RemoveColumnsCollator'.
         Returns:
             The train_data_collator and eval_data_collator, can be None.
         """
@@ -217,6 +233,19 @@ class EpochBasedTrainer(BaseTrainer):
             collate_fn = default_collate if data_collator is None else data_collator
             train_data_collator = collate_fn
             eval_data_collator = collate_fn
+
+        if remove_unused_data:
+            from modelscope.utils.data_collators import RemoveColumnsCollator
+
+            def _set_signature_columns_if_needed():
+                signature = inspect.signature(self.model.forward)
+                return list(signature.parameters.keys())
+
+            model_inputs = _set_signature_columns_if_needed()
+            train_data_collator = RemoveColumnsCollator(
+                train_data_collator, model_inputs)
+            eval_data_collator = RemoveColumnsCollator(eval_data_collator,
+                                                       model_inputs)
         return train_data_collator, eval_data_collator
 
     def init_dist(self, launcher=None):
@@ -470,39 +499,123 @@ class EpochBasedTrainer(BaseTrainer):
             metrics = [metrics]
         return metrics
 
-    def set_checkpoint_file_to_hook(self, checkpoint_path):
+    def set_checkpoint_file_to_hook(self, checkpoint_path, load_all_state):
         if checkpoint_path is not None:
             if os.path.isfile(checkpoint_path):
-                from modelscope.trainers.hooks import CheckpointHook
-                checkpoint_hooks = list(
-                    filter(lambda hook: isinstance(hook, CheckpointHook),
+                from modelscope.trainers.hooks import LoadCheckpointHook
+                load_ckpt_hooks = list(
+                    filter(lambda hook: isinstance(hook, LoadCheckpointHook),
                            self.hooks))
-                for hook in checkpoint_hooks:
-                    hook.checkpoint_file = checkpoint_path
+                if len(load_ckpt_hooks) == 0:
+                    load_ckpt_hook = LoadCheckpointHook()
+                    self.hooks.append(load_ckpt_hook)
+                    load_ckpt_hooks.append(load_ckpt_hook)
+                load_ckpt_hooks[0].checkpoint_file = checkpoint_path
+                load_ckpt_hooks[0].load_all_state = load_all_state
             else:
                 self.logger.error(
                     f'No {checkpoint_path} found in local file system.')
 
-    def train(self, checkpoint_path=None, *args, **kwargs):
+    def train(self,
+              checkpoint_path=None,
+              load_all_state=True,
+              *args,
+              **kwargs):
+        """Start training.
+
+        Args:
+            checkpoint_path(`str`, `optional`): The previous saving checkpoint to read,
+                usually it's a `some-file-name.pth` file generated by this trainer.
+            load_all_state(`bool`: `optional`): Load all state out of the `checkpoint_path` file, including the
+                state dict of model, optimizer, lr_scheduler, the random state and epoch/iter number. If False, only
+                the model's state dict will be read, and model will be trained again.
+        """
+
         self._mode = ModeKeys.TRAIN
         self.train_dataloader = self.get_train_dataloader()
         self.data_loader = self.train_dataloader
         self.register_optimizers_hook()
-        self.register_hook_from_cfg(self.cfg.train.hooks)
-        self.set_checkpoint_file_to_hook(checkpoint_path)
+        hooks = merge_hooks(self.cfg)
+        self.register_hook_from_cfg(hooks)
+        self.set_checkpoint_file_to_hook(checkpoint_path, load_all_state)
         self.model.train()
 
         self.train_loop(self.train_dataloader)
 
-    def evaluate(self, checkpoint_path=None):
+    def predict(self,
+                predict_datasets: Union[Dataset, List[Dataset]],
+                saving_fn,
+                checkpoint_path=None):
+        """Start prediction.
+
+        Args:
+            predict_datasets(Union[Dataset, List[Dataset]]): The datasets used to predict ground truth.
+
+            saving_fn(`Callable`): The callable used to save the prediction values to files. Like:
+                >>> class SavingFn:
+                >>>     def __init__(self):
+                >>>         self.filename = '/tmp/results.txt'
+                >>>
+                >>>     def __call__(self, inputs, outputs):
+                >>>         import numpy as np
+                >>>         ids = inputs.ids
+                >>>         predictions = np.argmax(outputs['logits'].cpu().numpy(), axis=1)
+                >>>         with open(self.filename, 'a') as f:
+                >>>             for id, pred in zip(ids, predictions):
+                >>>                 f.writelines(f'{id}, {pred}')
+
+                This saving_fn's result will not be collected to one file, Training with multiprocessing please
+                consider combining these files manually.
+
+            checkpoint_path(`str`, `optional`): The previous saving checkpoint to read,
+                usually it's a `some-file-name.pth` file or a pure PyTorch `some-file.bin` file
+                generated by this trainer.
+        """
+
         if checkpoint_path is not None and os.path.isfile(checkpoint_path):
-            from modelscope.trainers.hooks import CheckpointHook
-            CheckpointHook.load_checkpoint(checkpoint_path, self)
+            from modelscope.trainers.hooks import LoadCheckpointHook
+            LoadCheckpointHook.load_checkpoint(checkpoint_path, self)
+        self.model.eval()
+        self._mode = ModeKeys.EVAL
+        predict_dataloader = self.get_predict_data_loader(predict_datasets)
+        metric_classes = [PredictionSavingWrapper(saving_fn=saving_fn)]
+
+        for m in metric_classes:
+            m.trainer = self
+
+        self.evaluation_loop(predict_dataloader, metric_classes)
+
+    def evaluate(self, checkpoint_path=None, saving_fn=None, **kwargs):
+        """Start evaluation.
+
+        Args:
+            checkpoint_path(`str`, `optional`): The previous saving checkpoint to read,
+                usually it's a `some-file-name.pth` file or a pure PyTorch `some-file.bin` file
+                generated by this trainer.
+
+            saving_fn(`Callable`): The callable used to save the prediction values to files. Like:
+                >>> class SavingFn:
+                >>>     def __init__(self):
+                >>>         self.filename = '/tmp/results.txt'
+                >>>
+                >>>     def __call__(self, inputs, outputs):
+                >>>         import numpy as np
+                >>>         ids = inputs.ids
+                >>>         predictions = np.argmax(outputs['logits'].cpu().numpy(), axis=1)
+                >>>         with open(self.filename, 'a') as f:
+                >>>             for id, pred in zip(ids, predictions):
+                >>>                 f.writelines(f'{id}, {pred}')
+        """
+        if checkpoint_path is not None and os.path.isfile(checkpoint_path):
+            from modelscope.trainers.hooks import LoadCheckpointHook
+            LoadCheckpointHook.load_checkpoint(checkpoint_path, self)
         self.model.eval()
         self._mode = ModeKeys.EVAL
         self.eval_dataloader = self.get_eval_data_loader()
         self.data_loader = self.eval_dataloader
         metric_classes = [build_metric(metric) for metric in self.metrics]
+        if saving_fn is not None:
+            metric_classes.append(PredictionSavingWrapper(saving_fn=saving_fn))
         for m in metric_classes:
             m.trainer = self
 
@@ -672,6 +785,28 @@ class EpochBasedTrainer(BaseTrainer):
             **default_config)
         return data_loader
 
+    def get_predict_data_loader(self, predict_datasets: Union[Dataset,
+                                                              List[Dataset]]):
+        """ Builder torch dataloader for prediction with the config of evaluation.
+
+        Args:
+            predict_datasets(Union[Dataset, List[Dataset]]): The datasets used to predict ground truth.
+        """
+        dataset = self.to_task_dataset(
+            predict_datasets,
+            mode=ModeKeys.EVAL,
+            preprocessor=self.eval_preprocessor)
+
+        default_config = {'shuffle': False}
+        default_config.update(self.cfg.evaluation.get('dataloader', {}))
+        data_loader = self._build_dataloader_with_dataset(
+            dataset,
+            dist=self._dist,
+            seed=self._seed,
+            collate_fn=self.eval_data_collator,
+            **default_config)
+        return data_loader
+
     def build_dataset(self, data_cfg, mode, preprocessor=None):
         """ Build torch dataset object using data config
         """
@@ -955,23 +1090,21 @@ class EpochBasedTrainer(BaseTrainer):
                 vis_closure=vis_closure,
                 data_loader_iters=self._eval_iters_per_epoch)
 
-        self._inner_iter = self.iters_per_epoch - 1  # start from index 0
-
         return metric_values
 
     def visualization(self, batch_result, dataset, **kwargs):
         """ visualization function for evaluation results.
 
         Examples:
-            # draw list of images as numpy array
-            images = draw_images(num_of_visualization)
+            >>> # draw list of images as numpy array
+            >>> images = draw_images(num_of_visualization)
 
-            # set displayed name for each image
-            filenames = get_image_display_names()
-            vis_results = {'images': images, 'filenames' : filenames}
+            >>> # set displayed name for each image
+            >>> filenames = get_image_display_names()
+            >>> vis_results = {'images': images, 'filenames' : filenames}
 
-            # visualization results will be displayed in group named eva_vis
-            self.visualization_buffer.output['eval_vis'] = vis_results
+            >>> # visualization results will be displayed in group named eva_vis
+            >>> self.visualization_buffer.output['eval_vis'] = vis_results
 
         Args:
             results (list(dict)):  a list of result dict.
@@ -1006,7 +1139,7 @@ class EpochBasedTrainer(BaseTrainer):
         if not inserted:
             self._hooks.insert(0, hook)
 
-    def register_hook_from_cfg(self, hook_cfg: Dict) -> None:
+    def register_hook_from_cfg(self, hook_cfg: List) -> None:
         """Register a hook from its cfg.
 
         Args:
diff --git a/modelscope/trainers/training_args.py b/modelscope/trainers/training_args.py
index c387e7b8..c5a33426 100644
--- a/modelscope/trainers/training_args.py
+++ b/modelscope/trainers/training_args.py
@@ -1,169 +1,625 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import dataclasses
+import re
 from argparse import Action, ArgumentDefaultsHelpFormatter, ArgumentParser
-from typing import Any, Dict, List, Union
+from dataclasses import dataclass, field, fields
+from functools import partial
+from typing import Any, Dict, List, Tuple, Union
 
-from addict import Dict as Adict
+from modelscope.trainers.default_config import DEFAULT_CONFIG
+from modelscope.utils.config import Config, ConfigDict
+from modelscope.utils.hub import read_config
 
 
-@dataclasses.dataclass
-class ArgAttr():
-    """ Attributes for each arg
+def get_flatten_value(config: Config, metadata: Dict, exclusions=None):
+    cfg_node = metadata['cfg_node']
+    if exclusions is None:
+        exclusions = []
 
-    Args:
-        cfg_node_name (str or list[str]): if set empty, it means a normal arg for argparse, otherwise it means
-            this arg value correspond to those nodes in configuration file, and will replace them for training.
-        default:  default value for current argument.
-        type:  type for current argument.
-        choices (list of str): choices of value for this argument.
-        help (str): help str for this argument.
-
-    Examples:
-    ```python
-    # define argument train_batch_size which corresponds to train.dataloader.batch_size_per_gpu
-    training_args = Adict(
-        train_batch_size=ArgAttr(
-            'train.dataloader.batch_size_per_gpu',
-            default=16,
-            type=int,
-            help='training batch size')
-    )
-
-    # num_classes which will modify three places in configuration
-    training_args = Adict(
-    num_classes = ArgAttr(
-        ['model.mm_model.head.num_classes',
-         'model.mm_model.train_cfg.augments.0.num_classes',
-         'model.mm_model.train_cfg.augments.1.num_classes'],
-        type=int,
-        help='number of classes')
-    )
-    ```
-    # a normal argument which has no relation with configuration
-    training_args = Adict(
-        local_rank = ArgAttr(
-            '',
-            default=1,
-            type=int,
-            help='local rank for current training process')
-        )
-
-    """
-    cfg_node_name: Union[str, List[str]] = ''
-    default: Any = None
-    type: type = None
-    choices: List[str] = None
-    help: str = ''
+    values = config.safe_get(cfg_node)
+    if isinstance(values, dict):
+        param_map = []
+        for key, value in values.items():
+            if key in exclusions or not isinstance(value,
+                                                   (str, int, float, bool)):
+                continue
+            value = add_quotes_for_str(value)
+            param_map.append(f'{key}={value}')
+        return ','.join(param_map)
+    else:
+        return values
 
 
-training_args = Adict(
-    train_batch_size=ArgAttr(
-        'train.dataloader.batch_size_per_gpu',
-        default=16,
-        type=int,
-        help='training batch size'),
-    train_data_worker=ArgAttr(
-        'train.dataloader.workers_per_gpu',
-        default=8,
-        type=int,
-        help='number of data worker used for training'),
-    eval_batch_size=ArgAttr(
-        'evaluation.dataloader.batch_size_per_gpu',
-        default=16,
-        type=int,
-        help='training batch size'),
-    max_epochs=ArgAttr(
-        'train.max_epochs',
-        default=10,
-        type=int,
-        help='max number of training epoch'),
-    work_dir=ArgAttr(
-        'train.work_dir',
-        default='./work_dir',
-        type=str,
-        help='training directory to save models and training logs'),
-    lr=ArgAttr(
-        'train.optimizer.lr',
-        default=0.001,
-        type=float,
-        help='initial learning rate'),
-    optimizer=ArgAttr(
-        'train.optimizer.type',
-        default='SGD',
-        type=str,
-        choices=[
-            'Adadelta', 'Adagrad', 'Adam', 'AdamW', 'Adamax', 'ASGD',
-            'RMSprop', 'Rprop'
-            'SGD'
-        ],
-        help='optimizer type'),
-    local_rank=ArgAttr(
-        '', default=0, type=int, help='local rank for this process'))
+def set_flatten_value(config: Config, values: Union[str, List[str]],
+                      metadata: Dict):
+    cfg_node = metadata['cfg_node']
+    if values is None:
+        return config
+
+    pairs = values.split(',') if isinstance(values, str) else values
+    for kv in pairs:
+        if len(kv.strip()) == 0:
+            continue
+        key, value = kv.split('=')
+        value = parse_value(value)
+        config.merge_from_dict({cfg_node + '.' + key: value})
+    return config
+
+
+def get_base_hook_args(config: Config, metadata: Dict):
+    cfg_node = metadata['cfg_node']
+    hook_type = metadata['hook_type']
+    key = metadata['key']
+    value = config.safe_get(cfg_node)
+    if value is None:
+        return get_hook_param(config, hook_type, key)
+    else:
+        return True if key == 'type' else value
+
+
+def set_base_hook_args(config: Config, value: Any, metadata: Dict):
+    cfg_node = metadata['cfg_node']
+    hook_type = metadata['hook_type']
+    key = metadata['key']
+    if 'hooks' in config.train:
+        config.train.hooks = [
+            hook for hook in config.train.hooks if hook['type'] != hook_type
+        ]
+    if key == 'type':
+        if value and config.safe_get(cfg_node) is None:
+            config.merge_from_dict({cfg_node: {}})
+    else:
+        config.merge_from_dict({cfg_node: value})
+
+
+def get_strategy(config: Config,
+                 metadata: Dict,
+                 value_pair: Tuple[str] = ('by_epoch', 'by_step')):
+    flag = get_base_hook_args(config, metadata)
+    if flag is None:
+        return None
+    return value_pair[0] if flag else value_pair[1]
+
+
+def set_strategy(config: Config,
+                 value: Any,
+                 metadata: Dict,
+                 value_pair: Tuple[str] = ('by_epoch', 'by_step')):
+    set_base_hook_args(config, value == value_pair[0], metadata)
+
+
+def get_hook_param(config, hook_type: str, key='type'):
+    hooks = config.safe_get('train.hooks', [])
+    _hooks = list(filter(lambda hook: hook['type'] == hook_type, hooks))
+    if key == 'type':
+        return len(_hooks) > 0
+    elif len(_hooks) > 0:
+        return getattr(_hooks[0], key, None)
+    return None
+
+
+def add_quotes_for_str(value: Union[str, float, bool, None]) -> str:
+    if isinstance(value, str):
+        return f'"{value}"'
+    else:
+        return str(value)
+
+
+def parse_value(value: str) -> Union[str, float, bool, None]:
+    const_map = {
+        'True': True,
+        'true': True,
+        'False': False,
+        'false': False,
+        'None': None,
+        'none': None,
+        'null': None
+    }
+    if value in const_map:
+        return const_map[value]
+    elif '"' in value or "'" in value:
+        return value.replace('"', '').replace("'", '')
+    elif re.match(r'^\d+$', value):
+        return int(value)
+    elif re.match(r'[+-]?(?=\d*[.eE])(?=\.?\d)\d*\.?\d*(?:[eE][+-]?\d+)?',
+                  value):
+        return float(value)
+    else:
+        return value
+
+
+@dataclass
+class TrainingArgs:
+    model: str = field(
+        default=None, metadata={
+            'help': 'A model id or model dir',
+        })
+
+    seed: int = field(
+        default=42, metadata={
+            'help': 'The random seed',
+        })
+
+    task: str = field(
+        default=None,
+        metadata={
+            'help': 'The task code to be used',
+            'cfg_node': 'task'
+        })
+
+    dataset_name: str = field(
+        default=None, metadata={
+            'help': 'The dataset name',
+        })
+
+    subset_name: str = field(
+        default=None, metadata={
+            'help': 'The subset name of the dataset',
+        })
+
+    train_dataset_name: str = field(
+        default=None, metadata={
+            'help': 'The train dataset name',
+        })
+
+    val_dataset_name: str = field(
+        default=None, metadata={
+            'help': 'The validation dataset name',
+        })
+
+    per_device_train_batch_size: int = field(
+        default=None,
+        metadata={
+            'cfg_node': 'train.dataloader.batch_size_per_gpu',
+            'help': 'The training batch size per GPU',
+        })
+
+    train_data_worker: int = field(
+        default=None,
+        metadata={
+            'cfg_node': 'train.dataloader.workers_per_gpu',
+            'help': 'The number of data workers for train dataloader',
+        })
+
+    train_shuffle: bool = field(
+        default=None,
+        metadata={
+            'cfg_node': 'train.dataloader.shuffle',
+            'help': 'Shuffle the train dataset or not',
+        })
+
+    per_device_eval_batch_size: int = field(
+        default=None,
+        metadata={
+            'cfg_node': 'evaluation.dataloader.batch_size_per_gpu',
+            'help': 'The eval batch size per GPU',
+        })
+
+    eval_data_worker: int = field(
+        default=None,
+        metadata={
+            'cfg_node': 'evaluation.dataloader.workers_per_gpu',
+            'help': 'The number of data workers for eval dataloader',
+        })
+
+    eval_shuffle: bool = field(
+        default=None,
+        metadata={
+            'cfg_node': 'evaluation.dataloader.shuffle',
+            'help': 'Shuffle the eval dataset or not',
+        })
+
+    max_epochs: int = field(
+        default=None,
+        metadata={
+            'cfg_node': 'train.max_epochs',
+            'help': 'The training epochs',
+        })
+
+    work_dir: str = field(
+        default=None,
+        metadata={
+            'cfg_node': 'train.work_dir',
+            'help': 'The training dir to save models and logs',
+        })
+
+    lr: float = field(
+        default=None,
+        metadata={
+            'cfg_node': 'train.optimizer.lr',
+            'help': 'The learning rate of the optimizer',
+        })
+
+    optimizer: str = field(
+        default=None,
+        metadata={
+            'cfg_node': 'train.optimizer.type',
+            'help': 'The optimizer type',
+        })
+
+    optimizer_params: str = field(
+        default=None,
+        metadata={
+            'cfg_node':
+            'train.optimizer',
+            'cfg_getter':
+            partial(get_flatten_value, exclusions=['type', 'lr', 'options']),
+            'cfg_setter':
+            set_flatten_value,
+            'help':
+            'The optimizer init params except `lr`',
+        })
+
+    lr_scheduler_params: str = field(
+        default=None,
+        metadata={
+            'cfg_node':
+            'train.lr_scheduler',
+            'cfg_getter':
+            partial(get_flatten_value, exclusions=['type', 'lr', 'options']),
+            'cfg_setter':
+            set_flatten_value,
+            'help':
+            'The lr_scheduler init params',
+        })
+
+    local_rank: int = field(
+        default=0, metadata={
+            'help': 'The training local rank',
+        })
+
+    save_ckpt: bool = field(
+        default=True,
+        metadata={
+            'help':
+            'Periodically save checkpoint when True, corresponding to CheckpointHook',
+            'cfg_node': 'train.checkpoint.period',
+            'hook_type': 'CheckpointHook',
+            'key': 'type',
+            'cfg_getter': get_base_hook_args,
+            'cfg_setter': set_base_hook_args,
+        })
+
+    save_ckpt_best: bool = field(
+        default=None,
+        metadata={
+            'help':
+            'Save best checkpoint when True, corresponding to BestCkptSaverHook',
+            'cfg_node': 'train.checkpoint.best',
+            'hook_type': 'BestCkptSaverHook',
+            'key': 'type',
+            'cfg_getter': get_base_hook_args,
+            'cfg_setter': set_base_hook_args,
+        })
+
+    evaluate: bool = field(
+        default=True,
+        metadata={
+            'help': 'Evaluate when True, corresponding to EvaluationHook',
+            'cfg_node': 'evaluation.period',
+            'hook_type': 'EvaluationHook',
+            'key': 'type',
+            'cfg_getter': get_base_hook_args,
+            'cfg_setter': set_base_hook_args,
+        })
+
+    save_ckpt_strategy: str = field(
+        default=None,
+        metadata={
+            'help': 'Periodically save checkpoint by epoch or by step'
+            'use with `CheckpointHook`, can be `by_epoch` or `by_step`',
+            'cfg_node': 'train.checkpoint.period.by_epoch',
+            'hook_type': 'CheckpointHook',
+            'key': 'by_epoch',
+            'choices': ['by_epoch', 'by_step'],
+            'cfg_getter': get_strategy,
+            'cfg_setter': set_strategy,
+        })
+
+    save_ckpt_best_strategy: str = field(
+        default=None,
+        metadata={
+            'help': 'Save best checkpoint by epoch or by step'
+            'use with `BestCkptSaverHook`, can be `by_epoch` or `by_step`',
+            'cfg_node': 'train.checkpoint.best.by_epoch',
+            'hook_type': 'BestCkptSaverHook',
+            'key': 'by_epoch',
+            'choices': ['by_epoch', 'by_step'],
+            'cfg_getter': get_strategy,
+            'cfg_setter': set_strategy,
+        })
+
+    ckpt_period_interval: int = field(
+        default=1,
+        metadata={
+            'help':
+            'The interval of epoch or iter of saving checkpoint period',
+            'cfg_node': 'train.checkpoint.period.interval',
+            'hook_type': 'CheckpointHook',
+            'key': 'interval',
+            'cfg_getter': get_base_hook_args,
+            'cfg_setter': set_base_hook_args,
+        })
+
+    ckpt_best_interval: int = field(
+        default=None,
+        metadata={
+            'help': 'The interval of epoch or iter of saving checkpoint best',
+            'cfg_node': 'train.checkpoint.best.interval',
+            'hook_type': 'BestCkptSaverHook',
+            'key': 'interval',
+            'cfg_getter': get_base_hook_args,
+            'cfg_setter': set_base_hook_args,
+        })
+
+    metric_for_best_model: str = field(
+        default=None,
+        metadata={
+            'help':
+            'Which metric key to judge the checkpoint is better or not, use with `BestCkptSaverHook`, '
+            'please make sure this key is returned by the `evaluation_metrics` classes',
+            'cfg_node':
+            'train.checkpoint.best.metric_key',
+            'hook_type':
+            'BestCkptSaverHook',
+            'key':
+            'metric_key',
+            'cfg_getter':
+            get_base_hook_args,
+            'cfg_setter':
+            set_base_hook_args,
+        })
+
+    metric_rule_for_best_model: str = field(
+        default=None,
+        metadata={
+            'help':
+            'Which rule to compare the value of `checkpoint_saving_metric`, '
+            'use with `BestCkptSaverHook`, can be `max` or `min`',
+            'cfg_node':
+            'train.checkpoint.best.rule',
+            'hook_type':
+            'BestCkptSaverHook',
+            'key':
+            'rule',
+            'cfg_getter':
+            get_base_hook_args,
+            'cfg_setter':
+            set_base_hook_args,
+        })
+
+    save_ckpt_peroid_limit: int = field(
+        default=None,
+        metadata={
+            'help':
+            'The max saving number of checkpoint, older checkpoints will be deleted.',
+            'cfg_node': 'train.checkpoint.period.max_checkpoint_num',
+            'hook_type': 'CheckpointHook',
+            'key': 'max_checkpoint_num',
+            'cfg_getter': get_base_hook_args,
+            'cfg_setter': set_base_hook_args,
+        })
+
+    save_ckpt_best_limit: int = field(
+        default=None,
+        metadata={
+            'help':
+            'The max saving number of checkpoint, worse checkpoints will be deleted.',
+            'cfg_node': 'train.checkpoint.best.max_checkpoint_num',
+            'hook_type': 'BestCkptSaverHook',
+            'key': 'max_checkpoint_num',
+            'cfg_getter': get_base_hook_args,
+            'cfg_setter': set_base_hook_args,
+        })
+
+    logging_interval: int = field(
+        default=None,
+        metadata={
+            'help': 'The interval of iter of logging information',
+            'cfg_node': 'train.logging.interval',
+            'hook_type': 'TextLoggerHook',
+            'key': 'interval',
+            'cfg_getter': get_base_hook_args,
+            'cfg_setter': set_base_hook_args,
+        })
+
+    eval_strategy: str = field(
+        default=None,
+        metadata={
+            'help': 'Evaluate model by epoch or by step'
+            'use with `EvaluationHook`, can be `by_epoch` or `by_step`',
+            'cfg_node': 'evaluation.period.by_epoch',
+            'hook_type': 'EvaluationHook',
+            'key': 'by_epoch',
+            'choices': ['by_epoch', 'by_step'],
+            'cfg_getter': get_strategy,
+            'cfg_setter': set_strategy,
+        })
+
+    eval_interval: int = field(
+        default=1,
+        metadata={
+            'help': 'Evaluation interval by epoch or iter',
+            'cfg_node': 'evaluation.period.interval',
+            'hook_type': 'EvaluationHook',
+            'key': 'interval',
+            'cfg_getter': get_base_hook_args,
+            'cfg_setter': set_base_hook_args,
+        })
+
+    eval_metrics: str = field(
+        default=None,
+        metadata={
+            'help': 'The metric module name used in evaluation',
+            'cfg_node': 'evaluation.metrics'
+        })
+
+    @classmethod
+    def from_cli(cls, parser_args=None, **extra_kwargs):
+        """Construct a TrainingArg class by the parameters of CLI.
+
+        Args:
+            **extra_kwargs: Extra args which can be defined in code.
+
+        Returns:
+            The output TrainingArg class with the parameters from CLI.
+        """
+        self = cls(**extra_kwargs)
+        parser = CliArgumentParser(self)
+        args, unknown = parser.parse_known_args(parser_args)
+        unknown = [item for item in unknown if item not in ('\\', '\n')]
+        _unknown = {}
+        for i in range(0, len(unknown), 2):
+            _unknown[unknown[i].replace('-', '')] = parse_value(unknown[i + 1])
+        cfg_dict = vars(args)
+
+        if args.model is not None:
+            try:
+                cfg = read_config(args.model)
+            except Exception as e:
+                print('Read config failed with error:', e)
+            else:
+                cfg.merge_from_dict(_unknown)
+                self = cls.from_config(cfg, **extra_kwargs)
+        for key, value in cfg_dict.items():
+            if key is not None and hasattr(self,
+                                           key) and key in parser.manual_args:
+                setattr(self, key, value)
+        return self
+
+    def to_args(self):
+        """Convert the TrainingArg class to key-value pairs.
+
+        Returns: The key-value pair.
+
+        """
+        _args = {}
+        for f in fields(self):
+            _args[f.name] = getattr(self, f.name)
+        return _args
+
+    @classmethod
+    def from_config(cls, config=DEFAULT_CONFIG, **kwargs):
+        """Construct the TrainingArg class by a `Config` class.
+
+        Args:
+            config: The Config class. By default, `DEFAULT_CONFIG` is used.
+            **kwargs: Extra args which can be defined in code.
+
+        Returns: The output TrainingArg class with the parameters from the config.
+
+        """
+
+        self = cls(**kwargs)
+        for f in fields(self):
+            if 'cfg_node' in f.metadata and getattr(self, f.name) is None:
+                self._to_field(f, config)
+        return self
+
+    def _to_field(self, f, config):
+        assert 'cfg_node' in f.metadata
+        if 'cfg_getter' in f.metadata:
+            cfg_getter = f.metadata['cfg_getter']
+            setattr(self, f.name, cfg_getter(config, f.metadata))
+        else:
+            cfg_node = f.metadata['cfg_node']
+            setattr(self, f.name, config.safe_get(cfg_node))
+
+    def _to_config(self, f, config: Config):
+        assert 'cfg_node' in f.metadata
+        value = getattr(self, f.name)
+        if 'cfg_setter' in f.metadata:
+            cfg_setter = f.metadata['cfg_setter']
+            config = cfg_setter(config, value, f.metadata)
+        else:
+            cfg_node = f.metadata['cfg_node']
+            if isinstance(cfg_node, str):
+                cfg_node = [cfg_node]
+            for _node in cfg_node:
+                config.merge_from_dict({_node: value})
+        return config
+
+    def __call__(self, cfg: Config):
+        for f in fields(self):
+            if 'cfg_node' not in f.metadata:
+                continue
+
+            value = getattr(self, f.name)
+            if value is not None:
+                self._to_config(f, cfg)
+            else:
+                self._to_field(f, cfg)
+        return cfg
 
 
 class CliArgumentParser(ArgumentParser):
     """ Argument Parser to define and parse command-line args for training.
 
     Args:
-        arg_dict (dict of `ArgAttr` or list of them): dict or list of dict which defines different
+        training_args (TrainingArgs): dict or list of dict which defines different
             paramters for training.
     """
 
-    def __init__(self, arg_dict: Union[Dict[str, ArgAttr],
-                                       List[Dict[str, ArgAttr]]], **kwargs):
+    def __init__(self, training_args: TrainingArgs = None, **kwargs):
         if 'formatter_class' not in kwargs:
             kwargs['formatter_class'] = ArgumentDefaultsHelpFormatter
         super().__init__(**kwargs)
-        self.arg_dict = arg_dict if isinstance(
-            arg_dict, Dict) else self._join_args(arg_dict)
+        self.training_args = training_args
         self.define_args()
 
-    def _join_args(self, arg_dict_list: List[Dict[str, ArgAttr]]):
-        total_args = arg_dict_list[0].copy()
-        for args in arg_dict_list[1:]:
-            total_args.update(args)
-        return total_args
+    def get_manual_args(self, args):
+        return [arg[2:] for arg in args if arg.startswith('--')]
+
+    def _parse_known_args(self, args: List = None, namespace=None):
+        self.model_id = namespace.model if namespace is not None else None
+        if '--model' in args:
+            self.model_id = args[args.index('--model') + 1]
+        self.manual_args = self.get_manual_args(args)
+        return super()._parse_known_args(args, namespace)
+
+    def print_help(self, file=None):
+        config = DEFAULT_CONFIG
+        if self.model_id is not None:
+            try:
+                config = read_config(self.model_id)
+            except Exception as e:
+                print('Read config failed with error:', e)
+
+        if config is not None:
+            for action_group in self._optionals._group_actions:
+                if hasattr(self.training_args, action_group.dest):
+                    value = getattr(self.training_args, action_group.dest)
+                    f = {f.name: f
+                         for f in fields(self.training_args)
+                         }.get(action_group.dest)
+                    if value is not None:
+                        action_group.default = value
+                    elif 'cfg_node' in f.metadata:
+                        cfg_node = f.metadata['cfg_node']
+                        if isinstance(cfg_node, str):
+                            cfg_node = [cfg_node]
+
+                        assert isinstance(cfg_node, (list, tuple))
+                        if isinstance(cfg_node[0], str):
+                            action_group.default = config.safe_get(cfg_node[0])
+                        else:
+                            action_group.default = cfg_node[0](config)
+        return super().print_help(file)
 
     def define_args(self):
-        for arg_name, arg_attr in self.arg_dict.items():
-            name = f'--{arg_name}'
-            kwargs = dict(type=arg_attr.type, help=arg_attr.help)
-            if arg_attr.default is not None:
-                kwargs['default'] = arg_attr.default
-            else:
-                kwargs['required'] = True
+        if self.training_args is not None:
+            for f in fields(self.training_args):
+                arg_name = f.name
+                arg_attr = getattr(self.training_args, f.name)
+                name = f'--{arg_name}'
+                kwargs = dict(type=f.type, help=f.metadata['help'])
+                kwargs['default'] = arg_attr
 
-            if arg_attr.choices is not None:
-                kwargs['choices'] = arg_attr.choices
+                if 'choices' in f.metadata:
+                    kwargs['choices'] = f.metadata['choices']
 
-            kwargs['action'] = SingleAction
-            self.add_argument(name, **kwargs)
-
-    def get_cfg_dict(self, args=None):
-        """
-        Args:
-            args (default None):
-                List of strings to parse. The default is taken from sys.argv. (same as argparse.ArgumentParser)
-
-        Returns:
-            cfg_dict (dict of config): each key is a config node name such as 'train.max_epochs', this cfg_dict
-                should be used with function `cfg.merge_from_dict` to update config object.
-        """
-        self.args, remainning = self.parse_known_args(args)
-        args_dict = vars(self.args)
-        cfg_dict = {}
-        for k, v in args_dict.items():
-            if k not in self.arg_dict or self.arg_dict[k].cfg_node_name == '':
-                continue
-            cfg_node = self.arg_dict[k].cfg_node_name
-            if isinstance(cfg_node, list):
-                for node in cfg_node:
-                    cfg_dict[node] = v
-            else:
-                cfg_dict[cfg_node] = v
-
-        return cfg_dict
+                kwargs['action'] = SingleAction
+                self.add_argument(name, **kwargs)
 
 
 class DictAction(Action):
@@ -215,8 +671,8 @@ class DictAction(Action):
             inside these brackets are ignored.
             """
             assert (string.count('(') == string.count(')')) and (
-                string.count('[') == string.count(']')), \
-                f'Imbalanced brackets exist in {string}'
+                string.count('[')
+                == string.count(']')), f'Imbalanced brackets exist in {string}'
             end = len(string)
             for idx, char in enumerate(string):
                 pre = string[:idx]
diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py
index bf2fb854..4b73ed26 100644
--- a/modelscope/utils/ast_utils.py
+++ b/modelscope/utils/ast_utils.py
@@ -3,7 +3,6 @@
 import ast
 import contextlib
 import hashlib
-import importlib
 import os
 import os.path as osp
 import time
@@ -30,6 +29,7 @@ storage = LocalStorage()
 p = Path(__file__)
 
 # get the path of package 'modelscope'
+SKIP_FUNCTION_SCANNING = True
 MODELSCOPE_PATH = p.resolve().parents[1]
 INDEXER_FILE_DIR = get_default_cache_dir()
 REGISTER_MODULE = 'register_module'
@@ -58,7 +58,7 @@ TEMPLATE_PATH = 'TEMPLATE_PATH'
 TEMPLATE_FILE = 'ast_index_file.py'
 
 
-class AstScaning(object):
+class AstScanning(object):
 
     def __init__(self) -> None:
         self.result_import = dict()
@@ -82,6 +82,12 @@ class AstScaning(object):
         else:
             return True
 
+    def _skip_function(self, node: ast.AST) -> bool:
+        if type(node).__name__ == 'FunctionDef' and SKIP_FUNCTION_SCANNING:
+            return True
+        else:
+            return False
+
     def _fields(self, n: ast.AST, show_offsets: bool = True) -> tuple:
         if show_offsets:
             return n._attributes + n._fields
@@ -90,31 +96,16 @@ class AstScaning(object):
 
     def _leaf(self, node: ast.AST, show_offsets: bool = True) -> str:
         output = dict()
-        local_print = list()
         if isinstance(node, ast.AST):
             local_dict = dict()
             for field in self._fields(node, show_offsets=show_offsets):
-                field_output, field_prints = self._leaf(
+                field_output = self._leaf(
                     getattr(node, field), show_offsets=show_offsets)
                 local_dict[field] = field_output
-                local_print.append('{}={}'.format(field, field_prints))
-
-            prints = '{}({})'.format(
-                type(node).__name__,
-                ', '.join(local_print),
-            )
             output[type(node).__name__] = local_dict
-            return output, prints
-        elif isinstance(node, list):
-            if '_fields' not in node:
-                return node, repr(node)
-            for item in node:
-                item_output, item_prints = self._leaf(
-                    getattr(node, item), show_offsets=show_offsets)
-                local_print.append(item_prints)
-            return node, '[{}]'.format(', '.join(local_print), )
+            return output
         else:
-            return node, repr(node)
+            return node
 
     def _refresh(self):
         self.result_import = dict()
@@ -135,14 +126,10 @@ class AstScaning(object):
         parent_node_name: str = '',
     ) -> tuple:
         if node is None:
-            return node, repr(node)
+            return node
         elif self._is_leaf(node):
             return self._leaf(node, show_offsets=show_offsets)
         else:
-            if isinstance(indent, int):
-                indent_s = indent * ' '
-            else:
-                indent_s = indent
 
             class state:
                 indent = _indent
@@ -153,9 +140,6 @@ class AstScaning(object):
                 yield
                 state.indent -= 1
 
-            def indentstr() -> str:
-                return state.indent * indent_s
-
             def _scan_import(el: Union[ast.AST, None, str],
                              _indent: int = 0,
                              parent_node_name: str = '') -> str:
@@ -166,7 +150,6 @@ class AstScaning(object):
                     _indent=_indent,
                     parent_node_name=parent_node_name)
 
-            out = type(node).__name__ + '(\n'
             outputs = dict()
             # add relative path expression
             if type(node).__name__ == 'ImportFrom':
@@ -183,27 +166,23 @@ class AstScaning(object):
                 for field in self._fields(node, show_offsets=show_offsets):
                     attr = getattr(node, field)
                     if attr == []:
-                        representation = '[]'
                         outputs[field] = []
+                    elif (isinstance(attr, list) and len(attr) == 1
+                          and isinstance(attr[0], ast.AST)
+                          and self._skip_function(attr[0])):
+                        continue
                     elif (isinstance(attr, list) and len(attr) == 1
                           and isinstance(attr[0], ast.AST)
                           and self._is_leaf(attr[0])):
-                        local_out, local_print = _scan_import(attr[0])
-                        representation = f'[{local_print}]'
+                        local_out = _scan_import(attr[0])
                         outputs[field] = local_out
-
                     elif isinstance(attr, list):
-                        representation = '[\n'
                         el_dict = dict()
                         with indented():
                             for el in attr:
-                                local_out, local_print = _scan_import(
+                                local_out = _scan_import(
                                     el, state.indent,
                                     type(el).__name__)
-                                representation += '{}{},\n'.format(
-                                    indentstr(),
-                                    local_print,
-                                )
                                 name = type(el).__name__
                                 if (name == 'Import' or name == 'ImportFrom'
                                         or parent_node_name == 'ImportFrom'
@@ -211,14 +190,11 @@ class AstScaning(object):
                                     if name not in el_dict:
                                         el_dict[name] = []
                                     el_dict[name].append(local_out)
-                        representation += indentstr() + ']'
                         outputs[field] = el_dict
                     elif isinstance(attr, ast.AST):
-                        output, representation = _scan_import(
-                            attr, state.indent)
+                        output = _scan_import(attr, state.indent)
                         outputs[field] = output
                     else:
-                        representation = repr(attr)
                         outputs[field] = attr
 
                     if (type(node).__name__ == 'Import'
@@ -261,15 +237,12 @@ class AstScaning(object):
                     ).__name__ == 'Call' and parent_node_name == 'Expr':
                         self.result_express.append(attr)
 
-                    out += f'{indentstr()}{field}={representation},\n'
-
-            out += indentstr() + ')'
             return {
                 IMPORT_KEY: self.result_import,
                 FROM_IMPORT_KEY: self.result_from_import,
                 DECORATOR_KEY: self.result_decorator,
                 EXPRESS_KEY: self.result_express
-            }, out
+            }
 
     def _parse_decorator(self, node: ast.AST) -> tuple:
 
@@ -321,7 +294,7 @@ class AstScaning(object):
         if key_item == 'default_group':
             return default_group
         split_list = key_item.split('.')
-        # in the case, the key_item is raw data, not registred
+        # in the case, the key_item is raw data, not registered
         if len(split_list) == 1:
             return key_item
         else:
@@ -335,7 +308,7 @@ class AstScaning(object):
         """
         functions, args_list, keyword_list = parsed_input
 
-        # ignore decocators other than register_module
+        # ignore decorators other than register_module
         if REGISTER_MODULE != functions[1]:
             return None
         output = [functions[0]]
@@ -411,17 +384,17 @@ class AstScaning(object):
         data = ''.join(data)
 
         node = gast.parse(data)
-        output, _ = self.scan_import(node, indent='  ', show_offsets=False)
+        output = self.scan_import(node, indent='  ', show_offsets=False)
         output[DECORATOR_KEY] = self.parse_decorators(output[DECORATOR_KEY])
         output[EXPRESS_KEY] = self.parse_decorators(output[EXPRESS_KEY])
         output[DECORATOR_KEY].extend(output[EXPRESS_KEY])
         return output
 
 
-class FilesAstScaning(object):
+class FilesAstScanning(object):
 
     def __init__(self) -> None:
-        self.astScaner = AstScaning()
+        self.astScaner = AstScanning()
         self.file_dirs = []
 
     def _parse_import_path(self,
@@ -550,9 +523,9 @@ class FilesAstScaning(object):
 
         Args:
             target_file_list can override the dir and folders combine
-            target_dir (str, optional): the absolute path of the target directory to be scaned. Defaults to None.
+            target_dir (str, optional): the absolute path of the target directory to be scanned. Defaults to None.
             target_folder (list, optional): the list of
-            sub-folders to be scaned in the target folder.
+            sub-folders to be scanned in the target folder.
             Defaults to SCAN_SUB_FOLDERS.
 
         Returns:
@@ -564,7 +537,7 @@ class FilesAstScaning(object):
         else:
             self.traversal_files(target_dir, target_folders)
         logger.info(
-            f'AST-Scaning the path "{target_dir}" with the following sub folders {target_folders}'
+            f'AST-Scanning the path "{target_dir}" with the following sub folders {target_folders}'
         )
 
         result = dict()
@@ -587,7 +560,7 @@ class FilesAstScaning(object):
             REQUIREMENT_KEY: module_import
         }
         logger.info(
-            f'Scaning done! A number of {len(inverted_index_with_results)} '
+            f'Scanning done! A number of {len(inverted_index_with_results)} '
             f'components indexed or updated! Time consumed {time.time()-start}s'
         )
         return index
@@ -612,7 +585,7 @@ class FilesAstScaning(object):
         return md5.hexdigest(), files_mtime_dict
 
 
-file_scanner = FilesAstScaning()
+file_scanner = FilesAstScanning()
 
 
 def _save_index(index, file_path, file_list=None, with_template=False):
@@ -694,19 +667,19 @@ def load_index(
         indexer_file_dir: The dir where the indexer file saved, default as INDEXER_FILE_DIR
         indexer_file: The indexer file name, default as INDEXER_FILE
     Returns:
-        dict: the index information for all registred modules, including key:
-        index, requirments, files last modified time, modelscope home path,
+        dict: the index information for all registered modules, including key:
+        index, requirements, files last modified time, modelscope home path,
         version and md5, the detail is shown below example: {
             'index': {
                 ('MODELS', 'nlp', 'bert'):{
                     'filepath' : 'path/to/the/registered/model', 'imports':
-                    ['os', 'torch', 'typeing'] 'module':
+                    ['os', 'torch', 'typing'] 'module':
                     'modelscope.models.nlp.bert'
                 },
                 ...
-            }, 'requirments': {
-                'modelscope.models.nlp.bert': ['os', 'torch', 'typeing'],
-                'modelscope.models.nlp.structbert': ['os', 'torch', 'typeing'],
+            }, 'requirements': {
+                'modelscope.models.nlp.bert': ['os', 'torch', 'typing'],
+                'modelscope.models.nlp.structbert': ['os', 'torch', 'typing'],
                 ...
             }, 'files_mtime' : {
                 '/User/Path/To/Your/Modelscope/modelscope/preprocessors/nlp/text_generation_preprocessor.py':
@@ -768,15 +741,6 @@ def load_index(
     return index
 
 
-def check_import_module_avaliable(module_dicts: dict) -> list:
-    missed_module = []
-    for module in module_dicts.keys():
-        loader = importlib.find_loader(module)
-        if loader is None:
-            missed_module.append(module)
-    return missed_module
-
-
 def load_from_prebuilt(file_path=None):
     if file_path is None:
         local_path = p.resolve().parents[0]
@@ -801,4 +765,5 @@ def generate_ast_template(file_path=None, force_rebuild=True):
 
 
 if __name__ == '__main__':
-    index = load_index()
+    index = load_index(force_rebuild=True)
+    print(index)
diff --git a/modelscope/utils/audio/audio_utils.py b/modelscope/utils/audio/audio_utils.py
index c5fbe8c5..d95fd279 100644
--- a/modelscope/utils/audio/audio_utils.py
+++ b/modelscope/utils/audio/audio_utils.py
@@ -12,6 +12,7 @@ import numpy as np
 from modelscope.fileio.file import HTTPStorage
 
 SEGMENT_LENGTH_TRAIN = 16000
+SUPPORT_AUDIO_TYPE_SETS = ('.flac', '.mp3', '.ogg', '.opus', '.wav', '.pcm')
 
 
 class TtsTrainType(object):
@@ -176,10 +177,10 @@ def generate_scp_from_url(url: str, key: str = None):
         wav_scp_path = url
         return wav_scp_path, raw_inputs
     # for local wav file inputs
-    if os.path.exists(url) and (url.lower().endswith('.wav')):
+    if os.path.exists(url) and (url.lower().endswith(SUPPORT_AUDIO_TYPE_SETS)):
         wav_scp_path = url
         return wav_scp_path, raw_inputs
-    # for wav url, download and generate wav.scp
+    # for wav url, download bytes data
     result = urlparse(url)
     if result.scheme is not None and len(result.scheme) > 0:
         storage = HTTPStorage()
@@ -228,8 +229,7 @@ def generate_scp_for_sv(url: str, key: str = None):
         wav_scp_path = url
         return wav_scp_path
     # for local wav file inputs
-    if os.path.exists(url) and (url.lower().endswith('.wav')
-                                or url.lower().endswith('.pcm')):
+    if os.path.exists(url) and (url.lower().endswith(SUPPORT_AUDIO_TYPE_SETS)):
         wav_path = url
         work_dir = tempfile.TemporaryDirectory().name
         if not os.path.exists(work_dir):
@@ -243,17 +243,8 @@ def generate_scp_for_sv(url: str, key: str = None):
     result = urlparse(url)
     if result.scheme is not None and len(result.scheme) > 0:
         storage = HTTPStorage()
-        data = storage.read(url)
-        work_dir = tempfile.TemporaryDirectory().name
-        if not os.path.exists(work_dir):
-            os.makedirs(work_dir)
-        wav_path = os.path.join(work_dir, os.path.basename(url))
-        with open(wav_path, 'wb') as fb:
-            fb.write(data)
-        wav_scp_path = os.path.join(work_dir, 'wav.scp')
-        with open(wav_scp_path, 'w') as ft:
-            scp_content = '\t'.join([wav_name, wav_path]) + '\n'
-            ft.writelines(scp_content)
+        wav_scp_path = storage.read(url)
+        return wav_scp_path
 
     return wav_scp_path
 
diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py
index b4f2c9e5..c0455545 100644
--- a/modelscope/utils/checkpoint.py
+++ b/modelscope/utils/checkpoint.py
@@ -2,6 +2,7 @@
 
 import io
 import os
+import re
 import time
 from collections import OrderedDict
 from shutil import copytree, ignore_patterns, rmtree
@@ -9,6 +10,7 @@ from typing import Callable, Dict, Optional, Union
 
 import json
 import torch
+from torch import nn
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
 
@@ -137,7 +139,390 @@ def load_checkpoint(filename,
     return checkpoint.get('meta', {})
 
 
+def load_task_model_checkpoint(model_to_load,
+                               model_local_dir,
+                               default_dtype=None,
+                               load_state_fn=None,
+                               **kwargs):
+    """
+    Load model checkpoint file and feed the parameters into the model.
+    Args:
+        model_to_load: The model to be load
+        model_local_dir: The actual checkpoint dir on local disk.
+        default_dtype: Set the default float type by 'torch.set_default_dtype'
+        load_state_fn: An optional load_state_fn used to load state_dict into the model.
+
+    Returns:
+
+    """
+
+    def _add_head_prefix_to_state_dict(state_dicts, head_prefix,
+                                       expected_keys_without_head_prefix,
+                                       missing_keys):
+        new_state_dict = OrderedDict()
+        for name, module in state_dicts.items():
+            if name in expected_keys_without_head_prefix:
+                name_with_head = '.'.join([head_prefix, name])
+                new_state_dict[name_with_head] = module
+                expected_keys_without_head_prefix.remove(name)
+                missing_keys = list(set(missing_keys) - set([name_with_head]))
+            else:
+                new_state_dict[name] = module
+
+        missing_head_keys = []
+        if len(expected_keys_without_head_prefix) > 0:
+            missing_head_keys = expected_keys_without_head_prefix.copy()
+        return new_state_dict, missing_head_keys, missing_keys
+
+    def _find_mismatched_keys(
+        state_dicts,
+        model_state_dict,
+        loaded_keys,
+        prefix,
+        add_prefix_to_model,
+        remove_prefix_from_model,
+        ignore_mismatched_sizes,
+    ):
+        mismatched_key = []
+        if ignore_mismatched_sizes:
+            for checkpoint_key in loaded_keys:
+                model_key = checkpoint_key
+                if remove_prefix_from_model:
+                    # The model key starts with `prefix` but `checkpoint_key` doesn't, so we add it.
+                    model_key = f'{prefix}.{checkpoint_key}'
+                elif add_prefix_to_model:
+                    # The model key doesn't start with `prefix` but `checkpoint_key` does, so we remove it.
+                    model_key = '.'.join(checkpoint_key.split('.')[1:])
+
+                if model_key in model_state_dict:
+                    model_shape = model_state_dict[model_key].shape
+                    checkpoint_shape = state_dicts[checkpoint_key].shape
+                    if checkpoint_shape != model_shape:
+                        mismatched_key.append(
+                            (checkpoint_key, state_dicts[checkpoint_key].shape,
+                             model_state_dict[model_key].shape))
+                        del state_dicts[checkpoint_key]
+        return mismatched_key
+
+    def _load_state_dict_into_model(
+        model,
+        state_dict,
+        start_prefix,
+        head_prefix_keys,
+        load_state_fn=None,
+    ):
+        # Convert old format to new format if needed from a PyTorch state_dict
+        old_keys = []
+        new_keys = []
+        for key in state_dict.keys():
+            new_key = None
+            if 'gamma' in key:
+                new_key = key.replace('gamma', 'weight')
+            if 'beta' in key:
+                new_key = key.replace('beta', 'bias')
+            if new_key:
+                old_keys.append(key)
+                new_keys.append(new_key)
+        for old_key, new_key in zip(old_keys, new_keys):
+            state_dict[new_key] = state_dict.pop(old_key)
+
+        # copy state_dict so _load_from_state_dict can modify it
+        metadata = getattr(state_dict, '_metadata', None)
+        state_dict = state_dict.copy()
+        if metadata is not None:
+            state_dict._metadata = metadata
+
+        error_msgs = []
+
+        if load_state_fn is not None:
+            load_state_fn(
+                model,
+                state_dict,
+                prefix=start_prefix,
+                head_prefix_keys=head_prefix_keys,
+                local_metadata=None,
+                error_msgs=error_msgs)
+        else:
+
+            def load(module: nn.Module, prefix=''):
+                local_metadata = {} if metadata is None else metadata.get(
+                    prefix[:-1], {})
+                args = (state_dict, prefix, local_metadata, True, [], [],
+                        error_msgs)
+                module._load_from_state_dict(*args)
+                for name, child in module._modules.items():
+                    if child is not None:
+                        load(child, prefix + name + '.')
+
+            load(model, prefix=start_prefix)
+
+        return error_msgs
+
+    def _load_checkpoint(
+        model,
+        state_dict,
+        load_state_fn,
+        ignore_mismatched_sizes,
+        _fast_init,
+    ):
+        # Retrieve missing & unexpected_keys
+        model_state_dict = model.state_dict()
+        expected_keys = list(model_state_dict.keys())
+        keys_from_pretrained = list(state_dict.keys())
+
+        prefix = model.base_model_prefix
+
+        # during loading stage, base model prefix is complicated, should consider remove or add
+        if len(prefix) > 0:
+            # nlp: encoder, decoder
+            pretrained_has_prefix_module = any(
+                s.startswith(prefix) for s in keys_from_pretrained)
+            model_expects_prefix_module = any(
+                s.startswith(prefix) for s in expected_keys)
+        else:
+            # nlp:encoder-decoder, cv:backbone-head,
+            pretrained_has_prefix_module = False
+            model_expects_prefix_module = False
+
+        remove_prefix_from_model = not pretrained_has_prefix_module and model_expects_prefix_module
+        add_prefix_to_model = pretrained_has_prefix_module and not model_expects_prefix_module
+
+        if remove_prefix_from_model:
+            expected_keys_not_base_model_prefixed = [
+                s for s in expected_keys if not s.startswith(prefix)
+            ]
+            expected_keys = [
+                '.'.join(s.split('.')[1:]) if s.startswith(prefix) else s
+                for s in expected_keys
+            ]
+        elif add_prefix_to_model:
+            # backbone only
+            expected_keys = ['.'.join([prefix, s]) for s in expected_keys]
+            expected_keys_not_base_model_prefixed = []
+
+        missing_keys = list(set(expected_keys) - set(keys_from_pretrained))
+        unexpected_keys = list(set(keys_from_pretrained) - set(expected_keys))
+
+        # during loading stage head prefix is simple, add or not add
+        prefix_heads = model.head_prefix
+        expected_head_keys_without_head_prefix = []
+        missing_head_keys = []
+        unexpected_head_keys = []
+        pretrained_has_prefix_head = dict()
+        head_prefix_keys = dict()
+
+        # only for case of head mismatched with state-dict
+        if len(prefix_heads) > 0 and len(unexpected_keys) > 0:
+            if isinstance(prefix_heads, str):
+                prefix_heads = [prefix_heads]
+
+            # to double-check if head matched with state-dict
+            for prefix_head in prefix_heads:
+                pretrained_has_prefix_head[prefix_head] = any(
+                    s.startswith(prefix_head) for s in keys_from_pretrained)
+
+            for prefix_head in prefix_heads:
+                expected_keys_without_head_prefix = [
+                    '.'.join(s.split('.')[1:]) for s in expected_keys
+                    if s.startswith(prefix_head)
+                ]
+                expected_head_keys_without_head_prefix.extend(
+                    expected_keys_without_head_prefix)
+                head_prefix_keys[
+                    prefix_head] = expected_keys_without_head_prefix
+            unexpected_head_keys = list(
+                set(unexpected_keys)
+                - set(expected_head_keys_without_head_prefix))
+            unexpected_keys = list(
+                set(unexpected_keys)
+                - set(expected_head_keys_without_head_prefix))
+
+        _keys_to_ignore_on_load_missing = kwargs.pop(
+            '_keys_to_ignore_on_load_missing', None)
+        _keys_to_ignore_on_load_unexpected = kwargs.pop(
+            '_keys_to_ignore_on_load_unexpected', None)
+        # Some models may have keys that are not in the state by design, removing them before needlessly warning
+        # the user.
+        if _keys_to_ignore_on_load_missing is not None:
+            for pat in _keys_to_ignore_on_load_missing:
+                missing_keys = [
+                    k for k in missing_keys if re.search(pat, k) is None
+                ]
+
+        if _keys_to_ignore_on_load_unexpected is not None:
+            for pat in _keys_to_ignore_on_load_unexpected:
+                unexpected_keys = [
+                    k for k in unexpected_keys if re.search(pat, k) is None
+                ]
+
+        # retrieve uninitialized modules and initialize before maybe overriding that with the pretrained weights.
+        if _fast_init:
+            uninitialized_modules = retrieve_modules_from_names(
+                model,
+                missing_keys,
+                prefix=prefix,
+                add_prefix=add_prefix_to_model,
+                remove_prefix=remove_prefix_from_model)
+            for module in uninitialized_modules:
+                model._init_weights(module)
+
+        # Make sure we are able to load head correctly by revise state-dict
+        missing_head_keys_by_head = dict()
+        if len(head_prefix_keys) > 0:
+            for head_prefix in head_prefix_keys:
+                if not pretrained_has_prefix_head[head_prefix]:
+                    state_dict, missing_head_keys, missing_keys = _add_head_prefix_to_state_dict(
+                        state_dict, head_prefix, head_prefix_keys[head_prefix],
+                        missing_keys)
+                    missing_head_keys_by_head[head_prefix] = missing_head_keys
+
+        # Make sure we are able to load base models as well as derived models (with heads)
+        start_prefix = ''
+        model_to_load = model
+        heads_to_load = dict()
+        if len(model.base_model_prefix) > 0 and not hasattr(
+                model,
+                model.base_model_prefix) and pretrained_has_prefix_module:
+            start_prefix = model.base_model_prefix + '.'
+        if len(model.base_model_prefix) > 0 and hasattr(
+                model,
+                model.base_model_prefix) and not pretrained_has_prefix_module:
+            model_to_load = getattr(model, model.base_model_prefix)
+            for head_prefix in prefix_heads:
+                heads_to_load[head_prefix] = getattr(model, head_prefix)
+            if any(key in expected_keys_not_base_model_prefixed
+                   for key in keys_from_pretrained):
+                raise ValueError(
+                    'The state dictionary of the model you are trying to load is corrupted. Are you sure it was '
+                    'properly saved?')
+
+        # Whole checkpoint
+        mismatched_keys = _find_mismatched_keys(
+            state_dict,
+            model_state_dict,
+            keys_from_pretrained,
+            prefix,
+            add_prefix_to_model,
+            remove_prefix_from_model,
+            ignore_mismatched_sizes,
+        )
+        error_msgs = _load_state_dict_into_model(model_to_load, state_dict,
+                                                 start_prefix, load_state_fn)
+
+        if len(heads_to_load) > 0:
+            for head in heads_to_load:
+                local_error_msgs = _load_state_dict_into_model(
+                    heads_to_load[head], state_dict, head + '.', load_state_fn)
+                error_msgs.extend(local_error_msgs)
+
+        if len(error_msgs) > 0:
+            error_msg = '\n\t'.join(error_msgs)
+            raise RuntimeError(
+                f'Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}'
+            )
+
+        if len(unexpected_keys) > 0:
+            logger.warning(
+                f'Some weights of the model checkpoint were not used when'
+                f' initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are'
+                f' initializing {model.__class__.__name__} from the checkpoint of a model trained on another task or'
+                ' with another architecture (e.g. initializing a BertForTokenClassification model from a'
+                ' BertForPreTraining model).\n- This IS NOT expected if you are initializing'
+                f' {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly identical'
+                ' (initializing a BertForTokenClassification model from a BertForTokenClassification model).'
+            )
+        elif len(unexpected_head_keys) > 0:
+            logger.warning(
+                f'Some weights of the model checkpoint were not used when'
+                f' initializing {model.__class__.__name__}: {unexpected_head_keys}\n- This IS Not expected if you are'
+                f' initializing {model.__class__.__name__} from the checkpoint of a model with a same task while the'
+                ' structure is different (e.g. initializing a BertForTokenClassification model from a'
+                ' BertForTokenClassification model).')
+        else:
+            logger.info(
+                f'All model checkpoint weights were used when initializing {model.__class__.__name__}.\n'
+            )
+        if len(missing_keys) > 0:
+            logger.warning(
+                f'Some weights of {model.__class__.__name__} were not initialized from the model checkpoint'
+                f' and are newly initialized: {missing_keys}\nYou should probably'
+                ' TRAIN this model on a down-stream task to be able to use it for predictions and inference.'
+            )
+        elif len(mismatched_keys) == 0:
+            logger.info(
+                f'All the weights of {model.__class__.__name__} were initialized from the model checkpoint '
+                f'If your task is similar to the task the model of the checkpoint'
+                f' was trained on, you can already use {model.__class__.__name__} for predictions without further'
+                ' training.')
+        if len(mismatched_keys) > 0:
+            mismatched_warning = '\n'.join([
+                f'- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated'
+                for key, shape1, shape2 in mismatched_keys
+            ])
+            logger.warning(
+                f'Some weights of {model.__class__.__name__} were not initialized from the model checkpoint'
+                f' and are newly initialized because the shapes did not'
+                f' match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be able'
+                ' to use it for predictions and inference.')
+
+        return missing_keys, unexpected_keys, mismatched_keys, error_msgs
+
+    def retrieve_modules_from_names(model,
+                                    names,
+                                    prefix=None,
+                                    add_prefix=False,
+                                    remove_prefix=False):
+        module_keys = set(['.'.join(key.split('.')[:-1]) for key in names])
+
+        # torch.nn.ParameterList is a special case where two parameter keywords
+        # are appended to the module name, *e.g.* bert.special_embeddings.0
+        module_keys = module_keys.union(
+            set([
+                '.'.join(key.split('.')[:-2]) for key in names
+                if key[-1].isdigit()
+            ]))
+
+        retrieved_modules = []
+        # retrieve all modules that has at least one missing weight name
+        for name, module in model.named_modules():
+            if remove_prefix:
+                name = '.'.join(
+                    name.split('.')[1:]) if name.startswith(prefix) else name
+            elif add_prefix:
+                name = '.'.join([prefix, name]) if len(name) > 0 else prefix
+
+            if name in module_keys:
+                retrieved_modules.append(module)
+
+        return retrieved_modules
+
+    # TODO Sharded ckpt
+    ckpt_file = os.path.join(model_local_dir, ModelFile.TORCH_MODEL_BIN_FILE)
+    state_dict = torch.load(ckpt_file, map_location='cpu')
+    if default_dtype is not None:
+        torch.set_default_dtype(default_dtype)
+
+    missing_keys, unexpected_keys, mismatched_keys, error_msgs = _load_checkpoint(
+        model_to_load,
+        state_dict,
+        load_state_fn=load_state_fn,
+        ignore_mismatched_sizes=True,
+        _fast_init=True,
+    )
+
+    return {
+        'model': model_to_load,
+        'missing_keys': missing_keys,
+        'unexpected_keys': unexpected_keys,
+        'mismatched_keys': mismatched_keys,
+        'error_msgs': error_msgs,
+    }
+
+
 def save_configuration(target_folder, config: Dict):
+    from modelscope.utils.config import Config
+    if isinstance(config, Config):
+        config = config.to_dict()
     if ConfigFields.pipeline not in config:
         config[ConfigFields.pipeline] = {'type': config[ConfigFields.task]}
     cfg_str = json.dumps(config, indent=4, cls=JSONIteratorEncoder)
@@ -201,4 +586,4 @@ def save_pretrained(model,
     except Exception as e:
         raise Exception(
             f'During saving checkpoints, the error of "{type(e).__name__} '
-            f'with msg {e} throwed')
+            f'with msg {e} thrown')
diff --git a/modelscope/utils/chinese_utils.py b/modelscope/utils/chinese_utils.py
index 793c2050..86cf91a2 100644
--- a/modelscope/utils/chinese_utils.py
+++ b/modelscope/utils/chinese_utils.py
@@ -9,21 +9,12 @@ CHINESE_PUNCTUATION = '＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠
 ENGLISH_PUNCTUATION = string.punctuation
 
 
-def is_chinese_char(word: str):
-    chinese_punctuations = {
-        '，', '。', '；', '：'
-        '！', '？', '《', '》', '‘', '’', '“', '”', '（', '）', '【', '】'
-    }
-    return len(word) == 1 \
-        and ('\u4e00' <= word <= '\u9fa5' or word in chinese_punctuations)
-
-
 def remove_space_between_chinese_chars(decoded_str: str):
     old_word_list = decoded_str.split(' ')
     new_word_list = []
     start = -1
     for i, word in enumerate(old_word_list):
-        if is_chinese_char(word):
+        if _is_chinese_str(word):
             if start == -1:
                 start = i
         else:
@@ -39,10 +30,33 @@ def remove_space_between_chinese_chars(decoded_str: str):
 # add space for each chinese char
 def rebuild_chinese_str(string: str):
     return ' '.join(''.join([
-        f' {char} ' if is_chinese_char(char) else char for char in string
+        f' {char} '
+        if _is_chinese_char(char) or char in CHINESE_PUNCTUATION else char
+        for char in string
     ]).split())
 
 
+def _is_chinese_str(string: str) -> bool:
+    return all(
+        _is_chinese_char(cp) or cp in CHINESE_PUNCTUATION
+        or cp in ENGLISH_PUNCTUATION or cp for cp in string)
+
+
+def _is_chinese_char(cp: str) -> bool:
+    """Checks whether CP is the codepoint of a CJK character."""
+    cp = ord(cp)
+    if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF)
+            or (cp >= 0x20000 and cp <= 0x2A6DF)
+            or (cp >= 0x2A700 and cp <= 0x2B73F)
+            or (cp >= 0x2B740 and cp <= 0x2B81F)
+            or (cp >= 0x2B820 and cp <= 0x2CEAF)
+            or (cp >= 0xF900 and cp <= 0xFAFF)
+            or (cp >= 0x2F800 and cp <= 0x2FA1F)):
+        return True
+
+    return False
+
+
 def normalize_chinese_number(text):
     chinese_number = ['零', '一', '二', '三', '四', '五', '六', '七', '八', '九']
     new_text = ''
diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py
index 71d820e5..85cb8b77 100644
--- a/modelscope/utils/config.py
+++ b/modelscope/utils/config.py
@@ -3,7 +3,6 @@
 # https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/config.py
 
 import copy
-import dataclasses
 import os
 import os.path as osp
 import platform
@@ -11,7 +10,6 @@ import shutil
 import sys
 import tempfile
 import types
-from dataclasses import fields
 from pathlib import Path
 from types import FunctionType
 from typing import Dict, Union
@@ -36,9 +34,9 @@ class ConfigDict(addict.Dict):
     """ Dict which support get value through getattr
 
     Examples:
-    >>> cdict = ConfigDict({'a':1232})
-    >>> print(cdict.a)
-    1232
+        >>> cdict = ConfigDict({'a':1232})
+        >>> print(cdict.a)
+        >>> # 1232
     """
 
     def __missing__(self, name):
@@ -339,7 +337,7 @@ class Config:
         super(Config, self).__setattr__('_filename', _filename)
         super(Config, self).__setattr__('_text', _text)
 
-    def safe_get(self, key_chain: str, default=None):
+    def safe_get(self, key_chain: str, default=None, type_field='type'):
         """Get a value with a key-chain in str format, if key does not exist, the default value will be returned.
 
         This method is safe to call, and will not edit any value.
@@ -347,7 +345,9 @@ class Config:
         Args:
             key_chain: The input key chain, for example: 'train.hooks[0].type'
             default: The default value returned when any key does not exist, default None.
-
+            type_field: Get an object from a list or tuple for example by 'train.hooks.CheckPointHook', in which
+                'hooks' is a list, and 'CheckPointHook' is a value of the content of key `type_field`.
+                If there are multiple matched objects, the first element will be returned.
         Returns:
             The value, or the default value.
         """
@@ -359,7 +359,15 @@ class Config:
                 if '[' in key:
                     key, val = key.split('[')
                     val, _ = val.split(']')
-                _cfg_dict = getattr(_cfg_dict, key)
+
+                if isinstance(_cfg_dict, (list, tuple)):
+                    assert type_field is not None, 'Getting object without an index from a list or tuple ' \
+                                                   'needs an valid `type_field` param.'
+                    _sub_cfg_dict = list(
+                        filter(lambda sub: sub[type_field] == key, _cfg_dict))
+                    _cfg_dict = _sub_cfg_dict[0]
+                else:
+                    _cfg_dict = _cfg_dict[key]
                 if val is not None:
                     _cfg_dict = _cfg_dict[int(val)]
             return _cfg_dict
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 4a83b278..c2d0c6f5 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -23,11 +23,10 @@ class CVTasks(object):
     animal_recognition = 'animal-recognition'
     face_detection = 'face-detection'
     face_liveness = 'face-liveness'
+    face_quality_assessment = 'face-quality-assessment'
     card_detection = 'card-detection'
     face_recognition = 'face-recognition'
-    face_recognition_ood = 'face-recognition-ood'
     facial_expression_recognition = 'facial-expression-recognition'
-    facial_landmark_confidence = 'facial-landmark-confidence'
     face_processing_base = 'face-processing-base'
     face_attribute_recognition = 'face-attribute-recognition'
     face_2d_keypoints = 'face-2d-keypoints'
@@ -48,14 +47,18 @@ class CVTasks(object):
     image_object_detection = 'image-object-detection'
     video_object_detection = 'video-object-detection'
     image_fewshot_detection = 'image-fewshot-detection'
+    open_vocabulary_detection = 'open-vocabulary-detection'
+    object_detection_3d = 'object-detection-3d'
 
     image_segmentation = 'image-segmentation'
     semantic_segmentation = 'semantic-segmentation'
+    image_driving_perception = 'image-driving-perception'
     image_depth_estimation = 'image-depth-estimation'
     indoor_layout_estimation = 'indoor-layout-estimation'
     video_depth_estimation = 'video-depth-estimation'
     panorama_depth_estimation = 'panorama-depth-estimation'
     portrait_matting = 'portrait-matting'
+    universal_matting = 'universal-matting'
     text_driven_segmentation = 'text-driven-segmentation'
     shop_segmentation = 'shop-segmentation'
     hand_static = 'hand-static'
@@ -63,19 +66,23 @@ class CVTasks(object):
     face_emotion = 'face-emotion'
     product_segmentation = 'product-segmentation'
     image_matching = 'image-matching'
+    image_quality_assessment_degradation = 'image-quality-assessment-degradation'
 
     crowd_counting = 'crowd-counting'
 
     # image editing
     skin_retouching = 'skin-retouching'
     image_super_resolution = 'image-super-resolution'
+    image_debanding = 'image-debanding'
     image_colorization = 'image-colorization'
     image_color_enhancement = 'image-color-enhancement'
     image_denoising = 'image-denoising'
     image_deblurring = 'image-deblurring'
     image_portrait_enhancement = 'image-portrait-enhancement'
     image_inpainting = 'image-inpainting'
+    image_paintbyexample = 'image-paintbyexample'
     image_skychange = 'image-skychange'
+    image_demoireing = 'image-demoireing'
 
     # image generation
     image_to_image_translation = 'image-to-image-translation'
@@ -102,12 +109,15 @@ class CVTasks(object):
     video_object_segmentation = 'video-object-segmentation'
     referring_video_object_segmentation = 'referring-video-object-segmentation'
     video_human_matting = 'video-human-matting'
+    video_panoptic_segmentation = 'video-panoptic-segmentation'
 
     # video editing
     video_inpainting = 'video-inpainting'
     video_frame_interpolation = 'video-frame-interpolation'
     video_stabilization = 'video-stabilization'
     video_super_resolution = 'video-super-resolution'
+    video_deinterlace = 'video-deinterlace'
+    video_colorization = 'video-colorization'
 
     # reid and tracking
     video_single_object_tracking = 'video-single-object-tracking'
@@ -123,6 +133,25 @@ class CVTasks(object):
     # domain specific object detection
     domain_specific_object_detection = 'domain-specific-object-detection'
 
+    # content check
+    content_check = 'content-check'
+
+    # 3d face reconstruction
+    face_reconstruction = 'face-reconstruction'
+
+    # image quality assessment mos
+    image_quality_assessment_mos = 'image-quality-assessment-mos'
+    # motion generation
+    motion_generation = 'motion-generation'
+    # 3d reconstruction
+    nerf_recon_acc = 'nerf-recon-acc'
+
+    # vision efficient tuning
+    vision_efficient_tuning = 'vision-efficient-tuning'
+
+    # bad image detecting
+    bad_image_detecting = 'bad-image-detecting'
+
 
 class NLPTasks(object):
     # nlp tasks
@@ -140,8 +169,10 @@ class NLPTasks(object):
     zero_shot = 'zero-shot'
     translation = 'translation'
     token_classification = 'token-classification'
+    transformer_crf = 'transformer-crf'
     conversational = 'conversational'
     text_generation = 'text-generation'
+    fid_dialogue = 'fid-dialogue'
     text2text_generation = 'text2text-generation'
     task_oriented_conversation = 'task-oriented-conversation'
     dialog_intent_prediction = 'dialog-intent-prediction'
@@ -163,6 +194,10 @@ class NLPTasks(object):
     translation_evaluation = 'translation-evaluation'
     sudoku = 'sudoku'
     text2sql = 'text2sql'
+    siamese_uie = 'siamese-uie'
+    document_grounded_dialog_retrieval = 'document-grounded-dialog-retrieval'
+    document_grounded_dialog_rerank = 'document-grounded-dialog-rerank'
+    document_grounded_dialog_generate = 'document-grounded-dialog-generate'
 
 
 class AudioTasks(object):
@@ -177,6 +212,8 @@ class AudioTasks(object):
     inverse_text_processing = 'inverse-text-processing'
     punctuation = 'punctuation'
     speaker_verification = 'speaker-verification'
+    voice_activity_detection = 'voice-activity-detection'
+    language_model = 'language-model'
 
 
 class MultiModalTasks(object):
@@ -331,9 +368,11 @@ class ModelFile(object):
     ONNX_MODEL_FILE = 'model.onnx'
     LABEL_MAPPING = 'label_mapping.json'
     TRAIN_OUTPUT_DIR = 'output'
+    TRAIN_BEST_OUTPUT_DIR = 'output_best'
     TS_MODEL_FILE = 'model.ts'
     YAML_FILE = 'model.yaml'
     TOKENIZER_FOLDER = 'tokenizer'
+    CONFIG = 'config.json'
 
 
 class Invoke(object):
diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py
index 1d18434e..9bea68c0 100644
--- a/modelscope/utils/cv/image_utils.py
+++ b/modelscope/utils/cv/image_utils.py
@@ -494,6 +494,38 @@ def show_video_depth_estimation_result(depths, video_save_path):
     out.release()
 
 
+def show_image_driving_perception_result(img,
+                                         results,
+                                         out_file='result.jpg',
+                                         if_draw=[1, 1, 1]):
+    assert img.shape == (720, 1280,
+                         3), 'input image shape need fix to (720, 1280, 3)'
+    bboxes = results.get(OutputKeys.BOXES)[0]
+    if if_draw[0]:
+        for x in bboxes:
+            c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
+            cv2.rectangle(
+                img, c1, c2, [255, 255, 0], thickness=2, lineType=cv2.LINE_AA)
+
+    result = results.get(OutputKeys.MASKS)
+
+    color_area = np.zeros((result[0].shape[0], result[0].shape[1], 3),
+                          dtype=np.uint8)
+
+    if if_draw[1]:
+        color_area[result[0] == 1] = [0, 255, 0]
+    if if_draw[2]:
+        color_area[result[1] == 1] = [255, 0, 0]
+    color_seg = color_area
+
+    color_mask = np.mean(color_seg, 2)
+    msk_idx = color_mask != 0
+    img[msk_idx] = img[msk_idx] * 0.5 + color_seg[msk_idx] * 0.5
+    if out_file is not None:
+        cv2.imwrite(out_file, img[:, :, ::-1])
+    return img
+
+
 def masks_visualization(masks, palette):
     vis_masks = []
     for f in range(masks.shape[0]):
diff --git a/modelscope/utils/cv/motion_utils/__init__.py b/modelscope/utils/cv/motion_utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/utils/cv/motion_utils/motion_process.py b/modelscope/utils/cv/motion_utils/motion_process.py
new file mode 100644
index 00000000..30c37ac1
--- /dev/null
+++ b/modelscope/utils/cv/motion_utils/motion_process.py
@@ -0,0 +1,72 @@
+# This code is borrowed and modified from Actor,
+# made publicly available under MIT license at https://github.com/Mathux/ACTOR
+
+import torch
+
+
+def qinv(q):
+    assert q.shape[-1] == 4, 'q must be a tensor of shape (*, 4)'
+    mask = torch.ones_like(q)
+    mask[..., 1:] = -mask[..., 1:]
+    return q * mask
+
+
+def qrot(q, v):
+    """
+    Rotate vector(s) v about the rotation described by quaternion(s) q.
+    Expects a tensor of shape (*, 4) for q and a tensor of shape (*, 3) for v,
+    where * denotes any number of dimensions.
+    Returns a tensor of shape (*, 3).
+    """
+    assert q.shape[-1] == 4
+    assert v.shape[-1] == 3
+    assert q.shape[:-1] == v.shape[:-1]
+
+    original_shape = list(v.shape)
+    # print(q.shape)
+    q = q.contiguous().view(-1, 4)
+    v = v.contiguous().view(-1, 3)
+
+    qvec = q[:, 1:]
+    uv = torch.cross(qvec, v, dim=1)
+    uuv = torch.cross(qvec, uv, dim=1)
+    return (v + 2 * (q[:, :1] * uv + uuv)).view(original_shape)
+
+
+def recover_root_rot_pos(data):
+    rot_vel = data[..., 0]
+    r_rot_ang = torch.zeros_like(rot_vel).to(data.device)
+    '''Get Y-axis rotation from rotation velocity'''
+    r_rot_ang[..., 1:] = rot_vel[..., :-1]
+    r_rot_ang = torch.cumsum(r_rot_ang, dim=-1)
+
+    r_rot_quat = torch.zeros(data.shape[:-1] + (4, )).to(data.device)
+    r_rot_quat[..., 0] = torch.cos(r_rot_ang)
+    r_rot_quat[..., 2] = torch.sin(r_rot_ang)
+
+    r_pos = torch.zeros(data.shape[:-1] + (3, )).to(data.device)
+    r_pos[..., 1:, [0, 2]] = data[..., :-1, 1:3]
+    '''Add Y-axis rotation to root position'''
+    r_pos = qrot(qinv(r_rot_quat), r_pos)
+
+    r_pos = torch.cumsum(r_pos, dim=-2)
+
+    r_pos[..., 1] = data[..., 3]
+    return r_rot_quat, r_pos
+
+
+def recover_from_ric(data, joints_num):
+    r_rot_quat, r_pos = recover_root_rot_pos(data)
+    positions = data[..., 4:(joints_num - 1) * 3 + 4]
+    positions = positions.view(positions.shape[:-1] + (-1, 3))
+    '''Add Y-axis rotation to local joints'''
+    positions = qrot(
+        qinv(r_rot_quat[..., None, :]).expand(positions.shape[:-1] + (4, )),
+        positions)
+    '''Add root XZ to joints'''
+    positions[..., 0] += r_pos[..., 0:1]
+    positions[..., 2] += r_pos[..., 2:3]
+    '''Concate root and joints'''
+    positions = torch.cat([r_pos.unsqueeze(-2), positions], dim=-2)
+
+    return positions
diff --git a/modelscope/utils/cv/motion_utils/plot_script.py b/modelscope/utils/cv/motion_utils/plot_script.py
new file mode 100644
index 00000000..94aab9f6
--- /dev/null
+++ b/modelscope/utils/cv/motion_utils/plot_script.py
@@ -0,0 +1,122 @@
+# This code is borrowed and modified from Actor,
+# made publicly available under MIT license at https://github.com/Mathux/ACTOR
+
+import math
+from textwrap import wrap
+
+import matplotlib
+import matplotlib.pyplot as plt
+import mpl_toolkits.mplot3d.axes3d as p3
+import numpy as np
+from matplotlib.animation import FuncAnimation
+from mpl_toolkits.mplot3d.art3d import Poly3DCollection
+
+
+def list_cut_average(ll, intervals):
+    if intervals == 1:
+        return ll
+
+    bins = math.ceil(len(ll) * 1.0 / intervals)
+    ll_new = []
+    for i in range(bins):
+        l_low = intervals * i
+        l_high = l_low + intervals
+        l_high = l_high if l_high < len(ll) else len(ll)
+        ll_new.append(np.mean(ll[l_low:l_high]))
+    return ll_new
+
+
+def plot_3d_motion(save_path,
+                   kinematic_tree,
+                   joints,
+                   title,
+                   dataset,
+                   figsize=(3, 3),
+                   fps=120,
+                   radius=3,
+                   vis_mode='default',
+                   gt_frames=[]):
+    matplotlib.use('Agg')
+
+    title = '\n'.join(wrap(title, 30))
+
+    def init():
+        ax.set_xlim3d([-radius / 2, radius / 2])
+        ax.set_ylim3d([0, radius])
+        ax.set_zlim3d([-radius / 3., radius * 2 / 3.])
+        fig.suptitle(title, fontsize=10)
+        ax.grid(b=False)
+
+    def plot_xzPlane(minx, maxx, miny, minz, maxz):
+        verts = [[minx, miny, minz], [minx, miny, maxz], [maxx, miny, maxz],
+                 [maxx, miny, minz]]
+        xz_plane = Poly3DCollection([verts])
+        xz_plane.set_facecolor((0.5, 0.5, 0.5, 0.5))
+        ax.add_collection3d(xz_plane)
+
+    data = joints.copy().reshape(len(joints), -1, 3)
+
+    if dataset == 'kit':
+        data *= 0.003  # scale for visualization
+    elif dataset == 'humanml':
+        data *= 1.3  # scale for visualization
+    elif dataset in ['humanact12', 'uestc']:
+        data *= -1.5  # reverse axes, scale for visualization
+
+    fig = plt.figure(figsize=figsize)
+    plt.tight_layout()
+    ax = p3.Axes3D(fig)
+    init()
+    MINS = data.min(axis=0).min(axis=0)
+    MAXS = data.max(axis=0).max(axis=0)
+    colors_blue = ['#4D84AA', '#5B9965', '#61CEB9', '#34C1E2',
+                   '#80B79A']  # GT color
+    colors_orange = ['#DD5A37', '#D69E00', '#B75A39', '#FF6D00',
+                     '#DDB50E']  # Generation color
+    colors = colors_orange
+    if vis_mode == 'upper_body':  # lower body taken fixed to input motion
+        colors[0] = colors_blue[0]
+        colors[1] = colors_blue[1]
+    elif vis_mode == 'gt':
+        colors = colors_blue
+
+    frame_number = data.shape[0]
+    #     print(dataset.shape)
+
+    height_offset = MINS[1]
+    data[:, :, 1] -= height_offset
+    trajec = data[:, 0, [0, 2]]
+
+    data[..., 0] -= data[:, 0:1, 0]
+    data[..., 2] -= data[:, 0:1, 2]
+
+    def update(index):
+        ax.lines.clear()
+        ax.collections.clear()
+        ax.view_init(elev=120, azim=-90)
+        ax.dist = 7.5
+        plot_xzPlane(MINS[0] - trajec[index, 0], MAXS[0] - trajec[index, 0], 0,
+                     MINS[2] - trajec[index, 1], MAXS[2] - trajec[index, 1])
+
+        used_colors = colors_blue if index in gt_frames else colors
+        for i, (chain, color) in enumerate(zip(kinematic_tree, used_colors)):
+            if i < 5:
+                linewidth = 4.0
+            else:
+                linewidth = 2.0
+            ax.plot3D(
+                data[index, chain, 0],
+                data[index, chain, 1],
+                data[index, chain, 2],
+                linewidth=linewidth,
+                color=color)
+        plt.axis('off')
+        ax.set_xticklabels([])
+        ax.set_yticklabels([])
+        ax.set_zticklabels([])
+
+    ani = FuncAnimation(
+        fig, update, frames=frame_number, interval=1000 / fps, repeat=False)
+    ani.save(save_path, fps=fps)
+
+    plt.close()
diff --git a/modelscope/utils/cv/motion_utils/rotation_conversions.py b/modelscope/utils/cv/motion_utils/rotation_conversions.py
new file mode 100644
index 00000000..5f0ee947
--- /dev/null
+++ b/modelscope/utils/cv/motion_utils/rotation_conversions.py
@@ -0,0 +1,132 @@
+# This code is borrowed and modified from Actor,
+# made publicly available under MIT license at https://github.com/Mathux/ACTOR
+
+import functools
+
+import torch
+import torch.nn.functional as F
+
+
+def quaternion_to_matrix(quaternions):
+    """
+    Convert rotations given as quaternions to rotation matrices.
+
+    Args:
+        quaternions: quaternions with real part first,
+            as tensor of shape (..., 4).
+
+    Returns:
+        Rotation matrices as tensor of shape (..., 3, 3).
+    """
+    r, i, j, k = torch.unbind(quaternions, -1)
+    two_s = 2.0 / (quaternions * quaternions).sum(-1)
+
+    o = torch.stack(
+        (
+            1 - two_s * (j * j + k * k),
+            two_s * (i * j - k * r),
+            two_s * (i * k + j * r),
+            two_s * (i * j + k * r),
+            1 - two_s * (i * i + k * k),
+            two_s * (j * k - i * r),
+            two_s * (i * k - j * r),
+            two_s * (j * k + i * r),
+            1 - two_s * (i * i + j * j),
+        ),
+        -1,
+    )
+    return o.reshape(quaternions.shape[:-1] + (3, 3))
+
+
+def _axis_angle_rotation(axis: str, angle):
+    """
+    Return the rotation matrices for one of the rotations about an axis
+    of which Euler angles describe, for each value of the angle given.
+
+    Args:
+        axis: Axis label "X" or "Y or "Z".
+        angle: any shape tensor of Euler angles in radians
+
+    Returns:
+        Rotation matrices as tensor of shape (..., 3, 3).
+    """
+
+    cos = torch.cos(angle)
+    sin = torch.sin(angle)
+    one = torch.ones_like(angle)
+    zero = torch.zeros_like(angle)
+
+    if axis == 'X':
+        R_flat = (one, zero, zero, zero, cos, -sin, zero, sin, cos)
+    if axis == 'Y':
+        R_flat = (cos, zero, sin, zero, one, zero, -sin, zero, cos)
+    if axis == 'Z':
+        R_flat = (cos, -sin, zero, sin, cos, zero, zero, zero, one)
+
+    return torch.stack(R_flat, -1).reshape(angle.shape + (3, 3))
+
+
+def euler_angles_to_matrix(euler_angles, convention: str):
+    """
+    Convert rotations given as Euler angles in radians to rotation matrices.
+
+    Args:
+        euler_angles: Euler angles in radians as tensor of shape (..., 3).
+        convention: Convention string of three uppercase letters from
+            {"X", "Y", and "Z"}.
+
+    Returns:
+        Rotation matrices as tensor of shape (..., 3, 3).
+    """
+    if euler_angles.dim() == 0 or euler_angles.shape[-1] != 3:
+        raise ValueError('Invalid input euler angles.')
+    if len(convention) != 3:
+        raise ValueError('Convention must have 3 letters.')
+    if convention[1] in (convention[0], convention[2]):
+        raise ValueError(f'Invalid convention {convention}.')
+    for letter in convention:
+        if letter not in ('X', 'Y', 'Z'):
+            raise ValueError(f'Invalid letter {letter} in convention string.')
+    matrices = map(_axis_angle_rotation, convention,
+                   torch.unbind(euler_angles, -1))
+    return functools.reduce(torch.matmul, matrices)
+
+
+def axis_angle_to_matrix(axis_angle):
+    """
+    Convert rotations given as axis/angle to rotation matrices.
+
+    Args:
+        axis_angle: Rotations given as a vector in axis angle form,
+            as a tensor of shape (..., 3), where the magnitude is
+            the angle turned anticlockwise in radians around the
+            vector's direction.
+
+    Returns:
+        Rotation matrices as tensor of shape (..., 3, 3).
+    """
+    return quaternion_to_matrix(axis_angle_to_quaternion(axis_angle))
+
+
+def rotation_6d_to_matrix(d6: torch.Tensor) -> torch.Tensor:
+    """
+    Converts 6D rotation representation by Zhou et al. [1] to rotation matrix
+    using Gram--Schmidt orthogonalisation per Section B of [1].
+    Args:
+        d6: 6D rotation representation, of size (*, 6)
+
+    Returns:
+        batch of rotation matrices of size (*, 3, 3)
+
+    [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
+    On the Continuity of Rotation Representations in Neural Networks.
+    IEEE Conference on Computer Vision and Pattern Recognition, 2019.
+    Retrieved from http://arxiv.org/abs/1812.07035
+    """
+
+    a1, a2 = d6[..., :3], d6[..., 3:]
+    b1 = F.normalize(a1, dim=-1)
+    b2 = a2 - (b1 * a2).sum(-1, keepdim=True) * b1
+    b2 = F.normalize(b2, dim=-1)
+    b3 = torch.cross(b1, b2, dim=-1)
+    return torch.stack((b1, b2, b3), dim=-2)
diff --git a/modelscope/utils/data_collators.py b/modelscope/utils/data_collators.py
new file mode 100644
index 00000000..044b1993
--- /dev/null
+++ b/modelscope/utils/data_collators.py
@@ -0,0 +1,76 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Part of the implementation is borrowed from huggingface/transformers.
+
+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import Any, List, Optional, Tuple
+
+from .logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class RemoveColumnsCollator:
+    """Remove specified columns from the input mini-batch, and convert them to attributes.
+
+    For example: if columns_to_remove = ['id'], then user should call batch.id instead of batch['id'].
+
+    Args:
+        data_collator: An inner data collator to collate the mini-batch
+        columns_to_remove(`List[str]`): The redundant columns to be removed from the mini-batch
+        model_name(`Optional[str]`): An optional model name to print into log
+        description(`Optional[str]`): An optional description to print into log
+    """
+
+    def __init__(
+        self,
+        data_collator,
+        columns_to_remove: List[str],
+        model_name: Optional[str] = None,
+        description: Optional[str] = None,
+    ):
+        self.data_collator = data_collator
+        self.columns_to_remove = columns_to_remove
+        self.description = description
+        self.model_name = model_name
+        self.message_logged = False
+
+    def _remove_columns(self, feature: Mapping) -> Tuple[Mapping, Any]:
+        if not isinstance(feature, Mapping):
+            return feature, None
+        if not self.message_logged and self.model_name:
+            ignored_columns = list(
+                set(feature.keys()) - set(self.columns_to_remove))
+            if len(ignored_columns) > 0:
+                dset_description = '' if self.description is None else f'in the {self.description} set'
+                logger.info(
+                    f"The following columns {dset_description} don't have a corresponding argument in "
+                    f"`{self.model_name}.forward` and have been ignored: {', '.join(ignored_columns)}."
+                    f"Legal columns: {', '.join(self.columns_to_remove)}."
+                    f" If {', '.join(ignored_columns)} are not expected by `{self.model_name}.forward`, "
+                    ' you can safely ignore this message.')
+                self.message_logged = True
+        feature_clean = {
+            k: v
+            for k, v in feature.items() if k in self.columns_to_remove
+        }
+        feature_unused = {
+            k: v
+            for k, v in feature.items() if k not in self.columns_to_remove
+        }
+        return feature_clean, feature_unused
+
+    def __call__(self, features: List[Mapping]):
+        features_clean = []
+        features_unused = []
+        for feature in features:
+            feature, feature_unused = self._remove_columns(feature)
+            features_clean.append(feature)
+            features_unused.append(feature_unused)
+        data = OrderedDict(self.data_collator(features_clean))
+        if features_unused[0] is not None:
+            for key in features_unused[0].keys():
+                setattr(data, key, [
+                    feature_unused[key] for feature_unused in features_unused
+                ])
+        return data
diff --git a/modelscope/utils/data_utils.py b/modelscope/utils/data_utils.py
index 3a660122..424fb536 100644
--- a/modelscope/utils/data_utils.py
+++ b/modelscope/utils/data_utils.py
@@ -20,7 +20,15 @@ def to_device(batch, device, non_blocking=False):
             batch[idx] = to_device(batch[idx], device)
         return batch
     elif isinstance(batch, dict) or isinstance(batch, Mapping):
-        return type(batch)({k: to_device(v, device) for k, v in batch.items()})
+        if hasattr(batch, '__setitem__'):
+            # Reuse mini-batch to keep attributes for prediction.
+            for k, v in batch.items():
+                batch[k] = to_device(v, device)
+            return batch
+        else:
+            return type(batch)(
+                {k: to_device(v, device)
+                 for k, v in batch.items()})
     elif isinstance(batch, (tuple, list)):
         return type(batch)(to_device(v, device) for v in batch)
     elif isinstance(batch, torch.Tensor):
diff --git a/modelscope/utils/demo_utils.py b/modelscope/utils/demo_utils.py
index e57b3348..82bf1ada 100644
--- a/modelscope/utils/demo_utils.py
+++ b/modelscope/utils/demo_utils.py
@@ -2,7 +2,6 @@
 
 import io
 
-import cv2
 import json
 
 from modelscope.outputs import OutputKeys
@@ -265,6 +264,7 @@ def postprocess(req, resp):
         new_resp.get(output_key)
         if file_type == 'png' or file_type == 'jpg':
             content = new_resp.get(output_key)
+            import cv2
             _, img_encode = cv2.imencode('.' + file_type, content)
             img_bytes = img_encode.tobytes()
             return type(img_bytes)
diff --git a/modelscope/utils/device.py b/modelscope/utils/device.py
index 83faa261..47e08784 100644
--- a/modelscope/utils/device.py
+++ b/modelscope/utils/device.py
@@ -3,7 +3,6 @@ import os
 from contextlib import contextmanager
 
 from modelscope.utils.constant import Devices, Frameworks
-from modelscope.utils.import_utils import is_tf_available, is_torch_available
 from modelscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -50,11 +49,9 @@ def device_placement(framework, device_name='gpu:0'):
 
     Examples:
 
-    ```python
-    # Requests for using model on cuda:0 for gpu
-    with device_placement('pytorch', device='gpu:0'):
-        model = Model.from_pretrained(...)
-    ```
+        >>> # Requests for using model on cuda:0 for gpu
+        >>> with device_placement('pytorch', device='gpu:0'):
+        >>>     model = Model.from_pretrained(...)
     """
     device_type, device_id = verify_device(device_name)
 
diff --git a/modelscope/utils/error.py b/modelscope/utils/error.py
index fe6c6f93..44e6b238 100644
--- a/modelscope/utils/error.py
+++ b/modelscope/utils/error.py
@@ -2,7 +2,7 @@
 
 # docstyle-ignore
 AUDIO_IMPORT_ERROR = """
-Audio model import failed: {0}, if you want to use audio releated function, please execute
+Audio model import failed: {0}, if you want to use audio related function, please execute
 `pip install modelscope[audio] -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html`
 """
 
@@ -140,3 +140,16 @@ MEGATRON_UTIL_IMPORT_ERROR = """
 {0} requires the megatron_util library but it was not found in your environment. You can install it with pip:
 `pip install megatron_util -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html`
 """
+
+# docstyle-ignore
+TEXT2SQL_LGESQL_IMPORT_ERROR = """
+{0} requires the text2sql_lgesql library but it was not found in your environment. You can install it with pip:
+`pip install text2sql_lgesql -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html`
+"""
+
+# docstyle-ignore
+MPI4PY_IMPORT_ERROR = """
+{0} requires the mpi4py library but it was not found in your environment. You can install it with pip:
+`pip install mpi4py' and with following the instruction to install openmpi,
+https://docs.open-mpi.org/en/v5.0.x/installing-open-mpi/quickstart.html`
+"""
diff --git a/modelscope/utils/file_utils.py b/modelscope/utils/file_utils.py
index cf59dc57..09e3364e 100644
--- a/modelscope/utils/file_utils.py
+++ b/modelscope/utils/file_utils.py
@@ -1,7 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import inspect
-import os
 from pathlib import Path
 
 
diff --git a/modelscope/utils/import_utils.py b/modelscope/utils/import_utils.py
index 9f06d5d6..3517ea3d 100644
--- a/modelscope/utils/import_utils.py
+++ b/modelscope/utils/import_utils.py
@@ -301,6 +301,9 @@ REQUIREMENTS_MAAPING = OrderedDict([
     ('fasttext', (is_package_available('fasttext'), FASTTEXT_IMPORT_ERROR)),
     ('megatron_util', (is_package_available('megatron_util'),
                        MEGATRON_UTIL_IMPORT_ERROR)),
+    ('text2sql_lgesql', (is_package_available('text2sql_lgesql'),
+                         TEXT2SQL_LGESQL_IMPORT_ERROR)),
+    ('mpi4py', (is_package_available('mpi4py'), MPI4PY_IMPORT_ERROR)),
 ])
 
 SYSTEM_PACKAGE = set(['os', 'sys', 'typing'])
diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py
index bae2edac..0f10c1ce 100644
--- a/modelscope/utils/regress_test_utils.py
+++ b/modelscope/utils/regress_test_utils.py
@@ -20,7 +20,7 @@ import torch
 import torch.optim
 from torch import nn
 
-from modelscope.utils.service_utils import NumpyEncoder
+from .test_utils import compare_arguments_nested
 
 
 class RegressTool:
@@ -71,6 +71,7 @@ class RegressTool:
                                       module: nn.Module,
                                       file_name: str,
                                       compare_fn=None,
+                                      compare_model_output=True,
                                       **kwargs):
         """Monitor a pytorch module in a single forward.
 
@@ -78,6 +79,7 @@ class RegressTool:
             module: A torch module
             file_name: The file_name to store or load file
             compare_fn: A custom fn used to compare the results manually.
+            compare_model_output: Only compare the input module's output, skip all other tensors
 
         >>> def compare_fn(v1, v2, key, type):
         >>>     return None
@@ -120,17 +122,46 @@ class RegressTool:
             with open(baseline, 'rb') as f:
                 base = pickle.load(f)
 
-            class SafeNumpyEncoder(NumpyEncoder):
+            class SafeNumpyEncoder(json.JSONEncoder):
+
+                def parse_default(self, obj):
+                    if isinstance(obj, np.ndarray):
+                        return obj.tolist()
+
+                    if isinstance(obj, np.floating):
+                        return float(obj)
+
+                    if isinstance(obj, np.integer):
+                        return int(obj)
+
+                    return json.JSONEncoder.default(self, obj)
 
                 def default(self, obj):
                     try:
-                        return super().default(obj)
+                        return self.default(obj)
                     except Exception:
                         print(
                             f'Type {obj.__class__} cannot be serialized and printed'
                         )
                         return None
 
+            if compare_model_output:
+                print(
+                    'Ignore inner modules, only the output of the model will be verified.'
+                )
+                base = {
+                    key: value
+                    for key, value in base.items() if key == file_name
+                }
+                for key, value in base.items():
+                    value['input'] = {'args': None, 'kwargs': None}
+                io_json = {
+                    key: value
+                    for key, value in io_json.items() if key == file_name
+                }
+                for key, value in io_json.items():
+                    value['input'] = {'args': None, 'kwargs': None}
+
             print(f'baseline: {json.dumps(base, cls=SafeNumpyEncoder)}')
             print(f'latest  : {json.dumps(io_json, cls=SafeNumpyEncoder)}')
             if not compare_io_and_print(base, io_json, compare_fn, **kwargs):
@@ -326,10 +357,75 @@ class MsRegressTool(RegressTool):
 
             def lazy_stop_callback():
 
-                from modelscope.trainers.hooks.hook import Hook, Priority
+                class EarlyStopHook:
+                    PRIORITY = 90
 
-                class EarlyStopHook(Hook):
-                    PRIORITY = Priority.VERY_LOW
+                    def before_run(self, trainer):
+                        pass
+
+                    def after_run(self, trainer):
+                        pass
+
+                    def before_epoch(self, trainer):
+                        pass
+
+                    def after_epoch(self, trainer):
+                        pass
+
+                    def before_iter(self, trainer):
+                        pass
+
+                    def before_train_epoch(self, trainer):
+                        self.before_epoch(trainer)
+
+                    def before_val_epoch(self, trainer):
+                        self.before_epoch(trainer)
+
+                    def after_train_epoch(self, trainer):
+                        self.after_epoch(trainer)
+
+                    def after_val_epoch(self, trainer):
+                        self.after_epoch(trainer)
+
+                    def before_train_iter(self, trainer):
+                        self.before_iter(trainer)
+
+                    def before_val_iter(self, trainer):
+                        self.before_iter(trainer)
+
+                    def after_train_iter(self, trainer):
+                        self.after_iter(trainer)
+
+                    def after_val_iter(self, trainer):
+                        self.after_iter(trainer)
+
+                    def every_n_epochs(self, trainer, n):
+                        return (trainer.epoch + 1) % n == 0 if n > 0 else False
+
+                    def every_n_inner_iters(self, runner, n):
+                        return (runner.inner_iter
+                                + 1) % n == 0 if n > 0 else False
+
+                    def every_n_iters(self, trainer, n):
+                        return (trainer.iter + 1) % n == 0 if n > 0 else False
+
+                    def end_of_epoch(self, trainer):
+                        return trainer.inner_iter + 1 == trainer.iters_per_epoch
+
+                    def is_last_epoch(self, trainer):
+                        return trainer.epoch + 1 == trainer.max_epochs
+
+                    def is_last_iter(self, trainer):
+                        return trainer.iter + 1 == trainer.max_iters
+
+                    def get_triggered_stages(self):
+                        return []
+
+                    def state_dict(self):
+                        return {}
+
+                    def load_state_dict(self, state_dict):
+                        pass
 
                     def after_iter(self, trainer):
                         raise MsRegressTool.EarlyStopError('Test finished.')
@@ -526,92 +622,6 @@ def intercept_module(module: nn.Module,
         intercept_module(module, io_json, full_name, restore)
 
 
-def compare_arguments_nested(print_content,
-                             arg1,
-                             arg2,
-                             rtol=1.e-3,
-                             atol=1.e-8,
-                             ignore_unknown_type=True):
-    type1 = type(arg1)
-    type2 = type(arg2)
-    if type1.__name__ != type2.__name__:
-        if print_content is not None:
-            print(
-                f'{print_content}, type not equal:{type1.__name__} and {type2.__name__}'
-            )
-        return False
-
-    if arg1 is None:
-        return True
-    elif isinstance(arg1, (int, str, bool, np.bool, np.integer, np.str)):
-        if arg1 != arg2:
-            if print_content is not None:
-                print(f'{print_content}, arg1:{arg1}, arg2:{arg2}')
-            return False
-        return True
-    elif isinstance(arg1, (float, np.floating)):
-        if not np.isclose(arg1, arg2, rtol=rtol, atol=atol, equal_nan=True):
-            if print_content is not None:
-                print(f'{print_content}, arg1:{arg1}, arg2:{arg2}')
-            return False
-        return True
-    elif isinstance(arg1, (tuple, list)):
-        if len(arg1) != len(arg2):
-            if print_content is not None:
-                print(
-                    f'{print_content}, length is not equal:{len(arg1)}, {len(arg2)}'
-                )
-            return False
-        if not all([
-                compare_arguments_nested(
-                    None, sub_arg1, sub_arg2, rtol=rtol, atol=atol)
-                for sub_arg1, sub_arg2 in zip(arg1, arg2)
-        ]):
-            if print_content is not None:
-                print(f'{print_content}')
-            return False
-        return True
-    elif isinstance(arg1, Mapping):
-        keys1 = arg1.keys()
-        keys2 = arg2.keys()
-        if len(keys1) != len(keys2):
-            if print_content is not None:
-                print(
-                    f'{print_content}, key length is not equal:{len(keys1)}, {len(keys2)}'
-                )
-            return False
-        if len(set(keys1) - set(keys2)) > 0:
-            if print_content is not None:
-                print(f'{print_content}, key diff:{set(keys1) - set(keys2)}')
-            return False
-        if not all([
-                compare_arguments_nested(
-                    None, arg1[key], arg2[key], rtol=rtol, atol=atol)
-                for key in keys1
-        ]):
-            if print_content is not None:
-                print(f'{print_content}')
-            return False
-        return True
-    elif isinstance(arg1, np.ndarray):
-        arg1 = np.where(np.equal(arg1, None), np.NaN,
-                        arg1).astype(dtype=np.float)
-        arg2 = np.where(np.equal(arg2, None), np.NaN,
-                        arg2).astype(dtype=np.float)
-        if not all(
-                np.isclose(arg1, arg2, rtol=rtol, atol=atol,
-                           equal_nan=True).flatten()):
-            if print_content is not None:
-                print(f'{print_content}')
-            return False
-        return True
-    else:
-        if ignore_unknown_type:
-            return True
-        else:
-            raise ValueError(f'type not supported: {type1}')
-
-
 def compare_io_and_print(baseline_json, io_json, compare_fn=None, **kwargs):
     if compare_fn is None:
 
diff --git a/modelscope/utils/service_utils.py b/modelscope/utils/service_utils.py
index 29c111f8..6e7c0fc1 100644
--- a/modelscope/utils/service_utils.py
+++ b/modelscope/utils/service_utils.py
@@ -5,12 +5,9 @@ from io import BytesIO
 import json
 import numpy as np
 import requests
-from PIL import Image
 
 from modelscope.outputs import TASK_OUTPUTS, OutputKeys
 from modelscope.pipeline_inputs import TASK_INPUTS, InputType
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks, TasksIODescriptions
 
 
 # service data decoder func decodes data from network and convert it to pipeline's input
@@ -91,12 +88,14 @@ def decode_base64_to_binary(encoding):
 
 
 def decode_base64_to_image(encoding):
+    from PIL import Image
     content = encoding.split(';')[1]
     image_encoded = content.split(',')[1]
     return Image.open(BytesIO(base64.b64decode(image_encoded)))
 
 
 def encode_array_to_img_base64(image_array):
+    from PIL import Image
     with BytesIO() as output_bytes:
         pil_image = Image.fromarray(image_array.astype(np.uint8))
         pil_image.save(output_bytes, 'PNG')
diff --git a/modelscope/utils/task_utils.py b/modelscope/utils/task_utils.py
new file mode 100644
index 00000000..07d3838e
--- /dev/null
+++ b/modelscope/utils/task_utils.py
@@ -0,0 +1,90 @@
+from modelscope.metainfo import TaskModels
+from modelscope.utils import registry
+from modelscope.utils.constant import Tasks
+
+SUB_TASKS = 'sub_tasks'
+PARENT_TASK = 'parent_task'
+TASK_MODEL = 'task_model'
+
+DEFAULT_TASKS_LEVEL = {
+    Tasks.text_classification: {
+        SUB_TASKS: [
+            Tasks.text_classification,
+            Tasks.sentence_similarity,
+            Tasks.sentiment_classification,
+            Tasks.sentiment_analysis,
+            Tasks.nli,
+        ],
+        TASK_MODEL:
+        TaskModels.text_classification,
+    },
+    Tasks.token_classification: {
+        SUB_TASKS: [
+            Tasks.token_classification,
+            Tasks.named_entity_recognition,
+            Tasks.word_segmentation,
+            Tasks.part_of_speech,
+        ],
+        TASK_MODEL:
+        TaskModels.text_classification,
+    },
+    Tasks.token_classification: {
+        SUB_TASKS: [
+            Tasks.token_classification,
+            Tasks.named_entity_recognition,
+            Tasks.word_segmentation,
+            Tasks.part_of_speech,
+        ],
+        TASK_MODEL:
+        TaskModels.text_classification,
+    },
+    Tasks.text_generation: {
+        SUB_TASKS: [
+            Tasks.text_generation,
+            Tasks.text2text_generation,
+        ],
+        TASK_MODEL: TaskModels.text_generation,
+    },
+    Tasks.information_extraction: {
+        SUB_TASKS: [
+            Tasks.information_extraction,
+            Tasks.relation_extraction,
+        ],
+        TASK_MODEL: TaskModels.information_extraction,
+    },
+    Tasks.fill_mask: {
+        SUB_TASKS: [
+            Tasks.fill_mask,
+        ],
+        TASK_MODEL: TaskModels.fill_mask,
+    },
+    Tasks.text_ranking: {
+        SUB_TASKS: [
+            Tasks.text_ranking,
+        ],
+        TASK_MODEL: TaskModels.text_ranking,
+    }
+    # TODO: add other tasks with their sub tasks in different domains
+}
+
+
+def _inverted_index(forward_index):
+    inverted_index = dict()
+    for index in forward_index:
+        for item in forward_index[index][SUB_TASKS]:
+            inverted_index[item] = {
+                PARENT_TASK: index,
+                TASK_MODEL: forward_index[index][TASK_MODEL],
+            }
+    return inverted_index
+
+
+INVERTED_TASKS_LEVEL = _inverted_index(DEFAULT_TASKS_LEVEL)
+
+
+def get_task_by_subtask_name(group_key):
+    if group_key in INVERTED_TASKS_LEVEL:
+        return INVERTED_TASKS_LEVEL[group_key][
+            PARENT_TASK], INVERTED_TASKS_LEVEL[group_key][TASK_MODEL]
+    else:
+        return group_key, None
diff --git a/modelscope/utils/test_utils.py b/modelscope/utils/test_utils.py
index 76759e34..9c3e1e8b 100644
--- a/modelscope/utils/test_utils.py
+++ b/modelscope/utils/test_utils.py
@@ -12,13 +12,12 @@ import tarfile
 import tempfile
 import unittest
 from collections import OrderedDict
+from collections.abc import Mapping
 
+import numpy as np
 import requests
-import torch
-from torch.utils.data import Dataset
 
 from modelscope.utils.import_utils import is_tf_available, is_torch_available
-from modelscope.utils.torch_utils import _find_free_port
 
 TEST_LEVEL = 2
 TEST_LEVEL_STR = 'TEST_LEVEL'
@@ -49,7 +48,7 @@ def set_test_level(level: int):
     TEST_LEVEL = level
 
 
-class DummyTorchDataset(Dataset):
+class DummyTorchDataset:
 
     def __init__(self, feat, label, num) -> None:
         self.feat = feat
@@ -57,6 +56,7 @@ class DummyTorchDataset(Dataset):
         self.num = num
 
     def __getitem__(self, index):
+        import torch
         return {
             'feat': torch.Tensor(self.feat),
             'labels': torch.Tensor(self.label)
@@ -119,6 +119,92 @@ def get_case_model_info():
     return model_cases
 
 
+def compare_arguments_nested(print_content,
+                             arg1,
+                             arg2,
+                             rtol=1.e-3,
+                             atol=1.e-8,
+                             ignore_unknown_type=True):
+    type1 = type(arg1)
+    type2 = type(arg2)
+    if type1.__name__ != type2.__name__:
+        if print_content is not None:
+            print(
+                f'{print_content}, type not equal:{type1.__name__} and {type2.__name__}'
+            )
+        return False
+
+    if arg1 is None:
+        return True
+    elif isinstance(arg1, (int, str, bool, np.bool, np.integer, np.str)):
+        if arg1 != arg2:
+            if print_content is not None:
+                print(f'{print_content}, arg1:{arg1}, arg2:{arg2}')
+            return False
+        return True
+    elif isinstance(arg1, (float, np.floating)):
+        if not np.isclose(arg1, arg2, rtol=rtol, atol=atol, equal_nan=True):
+            if print_content is not None:
+                print(f'{print_content}, arg1:{arg1}, arg2:{arg2}')
+            return False
+        return True
+    elif isinstance(arg1, (tuple, list)):
+        if len(arg1) != len(arg2):
+            if print_content is not None:
+                print(
+                    f'{print_content}, length is not equal:{len(arg1)}, {len(arg2)}'
+                )
+            return False
+        if not all([
+                compare_arguments_nested(
+                    None, sub_arg1, sub_arg2, rtol=rtol, atol=atol)
+                for sub_arg1, sub_arg2 in zip(arg1, arg2)
+        ]):
+            if print_content is not None:
+                print(f'{print_content}')
+            return False
+        return True
+    elif isinstance(arg1, Mapping):
+        keys1 = arg1.keys()
+        keys2 = arg2.keys()
+        if len(keys1) != len(keys2):
+            if print_content is not None:
+                print(
+                    f'{print_content}, key length is not equal:{len(keys1)}, {len(keys2)}'
+                )
+            return False
+        if len(set(keys1) - set(keys2)) > 0:
+            if print_content is not None:
+                print(f'{print_content}, key diff:{set(keys1) - set(keys2)}')
+            return False
+        if not all([
+                compare_arguments_nested(
+                    None, arg1[key], arg2[key], rtol=rtol, atol=atol)
+                for key in keys1
+        ]):
+            if print_content is not None:
+                print(f'{print_content}')
+            return False
+        return True
+    elif isinstance(arg1, np.ndarray):
+        arg1 = np.where(np.equal(arg1, None), np.NaN,
+                        arg1).astype(dtype=np.float)
+        arg2 = np.where(np.equal(arg2, None), np.NaN,
+                        arg2).astype(dtype=np.float)
+        if not all(
+                np.isclose(arg1, arg2, rtol=rtol, atol=atol,
+                           equal_nan=True).flatten()):
+            if print_content is not None:
+                print(f'{print_content}')
+            return False
+        return True
+    else:
+        if ignore_unknown_type:
+            return True
+        else:
+            raise ValueError(f'type not supported: {type1}')
+
+
 _DIST_SCRIPT_TEMPLATE = """
 import ast
 import argparse
@@ -156,31 +242,31 @@ if __name__ == '__main__':
 class DistributedTestCase(unittest.TestCase):
     """Distributed TestCase for test function with distributed mode.
     Examples:
-        import torch
-        from torch import distributed as dist
-        from modelscope.utils.torch_utils import init_dist
+        >>> import torch
+        >>> from torch import distributed as dist
+        >>> from modelscope.utils.torch_utils import init_dist
 
-        def _test_func(*args, **kwargs):
-            init_dist(launcher='pytorch')
-            rank = dist.get_rank()
-            if rank == 0:
-                value = torch.tensor(1.0).cuda()
-            else:
-                value = torch.tensor(2.0).cuda()
-            dist.all_reduce(value)
-            return value.cpu().numpy()
+        >>> def _test_func(*args, **kwargs):
+        >>>     init_dist(launcher='pytorch')
+        >>>     rank = dist.get_rank()
+        >>>     if rank == 0:
+        >>>         value = torch.tensor(1.0).cuda()
+        >>>     else:
+        >>>         value = torch.tensor(2.0).cuda()
+        >>>     dist.all_reduce(value)
+        >>>     return value.cpu().numpy()
 
-        class DistTest(DistributedTestCase):
-            def test_function_dist(self):
-                args = ()  # args should be python builtin type
-                kwargs = {}  # kwargs should be python builtin type
-                self.start(
-                    _test_func,
-                    num_gpus=2,
-                    assert_callback=lambda x: self.assertEqual(x, 3.0),
-                    *args,
-                    **kwargs,
-                )
+        >>> class DistTest(DistributedTestCase):
+        >>>     def test_function_dist(self):
+        >>>         args = ()  # args should be python builtin type
+        >>>         kwargs = {}  # kwargs should be python builtin type
+        >>>         self.start(
+        >>>             _test_func,
+        >>>             num_gpus=2,
+        >>>             assert_callback=lambda x: self.assertEqual(x, 3.0),
+        >>>             *args,
+        >>>             **kwargs,
+        >>>         )
     """
 
     def _start(self,
@@ -263,6 +349,7 @@ class DistributedTestCase(unittest.TestCase):
               save_all_ranks=False,
               *args,
               **kwargs):
+        from .torch_utils import _find_free_port
         ip = socket.gethostbyname(socket.gethostname())
         dist_start_cmd = '%s -m torch.distributed.launch --nproc_per_node=%d --master_addr=\'%s\' --master_port=%s' % (
             sys.executable, num_gpus, ip, _find_free_port())
diff --git a/modelscope/utils/type_assert.py b/modelscope/utils/type_assert.py
index f732a81a..5a65db22 100644
--- a/modelscope/utils/type_assert.py
+++ b/modelscope/utils/type_assert.py
@@ -7,24 +7,24 @@ from inspect import signature
 def type_assert(*ty_args, **ty_kwargs):
     """a decorator which is used to check the types of arguments in a function or class
     Examples:
-    >>> @type_assert(str)
-    ... def main(a: str, b: list):
-    ...     print(a, b)
-    >>> main(1)
-    Argument a must be a str
+        >>> @type_assert(str)
+        ... def main(a: str, b: list):
+        ...     print(a, b)
+        >>> main(1)
+        Argument a must be a str
 
-    >>> @type_assert(str, (int, str))
-    ... def main(a: str, b: int | str):
-    ...     print(a, b)
-    >>> main('1', [1])
-    Argument b must be (<class 'int'>, <class 'str'>)
+        >>> @type_assert(str, (int, str))
+        ... def main(a: str, b: int | str):
+        ...     print(a, b)
+        >>> main('1', [1])
+        Argument b must be (<class 'int'>, <class 'str'>)
 
-    >>> @type_assert(str, (int, str))
-    ... class A:
-    ...     def __init__(self, a: str, b: int | str)
-    ...         print(a, b)
-    >>> a = A('1', [1])
-    Argument b must be (<class 'int'>, <class 'str'>)
+        >>> @type_assert(str, (int, str))
+        ... class A:
+        ...     def __init__(self, a: str, b: int | str)
+        ...         print(a, b)
+        >>> a = A('1', [1])
+        Argument b must be (<class 'int'>, <class 'str'>)
     """
 
     def decorate(func):
diff --git a/modelscope/version.py b/modelscope/version.py
index 1f4b62e7..4fa90b93 100644
--- a/modelscope/version.py
+++ b/modelscope/version.py
@@ -1,5 +1,5 @@
 # Make sure to modify __release_datetime__ to release time when making official release.
-__version__ = '1.2.0'
+__version__ = '1.3.0'
 # default release datetime for branches under active development is set
 # to be a time far-far-away-into-the-future
 __release_datetime__ = '2099-10-13 08:56:12'
diff --git a/requirements/audio.txt b/requirements/audio.txt
index 983fd70f..331c334b 100644
--- a/requirements/audio.txt
+++ b/requirements/audio.txt
@@ -1,45 +1,4 @@
-bitstring
-easyasr>=0.0.2
-espnet==202204
-funasr>=0.1.6
-funtextprocessing>=0.1.1
-greenlet>=1.1.2
-h5py
-inflect
-jedi>=0.18.1
-keras
-kwsbp>=0.0.2
-librosa
-lxml
-matplotlib
-MinDAEC
-mir_eval>=0.7
-msgpack>=1.0.4
-nara_wpe
-nltk
-# tensorflow 1.15 requires numpy<=1.18
-numpy<=1.18
-parso>=0.8.3
-pexpect>=4.8.0
-pickleshare>=0.7.5
-prompt-toolkit>=3.0.30
-# protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged.
-protobuf>3,<3.21.0
-ptflops
-ptyprocess>=0.7.0
-py_sound_connect>=0.1
-pygments>=2.12.0
-pysptk>=0.1.15,<0.2.0
-pytorch_wavelets
-PyWavelets>=1.0.0
-rotary_embedding_torch>=0.1.5
-scikit-learn
-SoundFile>0.10
-sox
-speechbrain>=0.5
-torchaudio
-tqdm
-traitlets>=5.3.0
-ttsfrd>=0.1.1
-unidecode
-wcwidth>=0.2.5
+-r audio/audio_asr.txt
+-r audio/audio_kws.txt
+-r audio/audio_signal.txt
+-r audio/audio_tts.txt
diff --git a/requirements/audio/audio_asr.txt b/requirements/audio/audio_asr.txt
new file mode 100644
index 00000000..2dc2f9b7
--- /dev/null
+++ b/requirements/audio/audio_asr.txt
@@ -0,0 +1,2 @@
+easyasr>=0.0.2
+funasr>=0.2.0
diff --git a/requirements/audio/audio_kws.txt b/requirements/audio/audio_kws.txt
new file mode 100644
index 00000000..12b73bea
--- /dev/null
+++ b/requirements/audio/audio_kws.txt
@@ -0,0 +1,8 @@
+kaldiio
+kwsbp>=0.0.2
+matplotlib
+numpy
+py_sound_connect>=0.1
+scipy
+SoundFile>0.10
+tensorboardX
diff --git a/requirements/audio/audio_signal.txt b/requirements/audio/audio_signal.txt
new file mode 100644
index 00000000..6082a2e1
--- /dev/null
+++ b/requirements/audio/audio_signal.txt
@@ -0,0 +1,11 @@
+hyperpyyaml
+librosa
+MinDAEC
+mir_eval>=0.7
+numpy
+rotary_embedding_torch>=0.1.5
+scipy
+SoundFile>0.10
+speechbrain>=0.5.7
+torchaudio
+tqdm
diff --git a/requirements/audio/audio_tts.txt b/requirements/audio/audio_tts.txt
new file mode 100644
index 00000000..b9974294
--- /dev/null
+++ b/requirements/audio/audio_tts.txt
@@ -0,0 +1,27 @@
+bitstring
+greenlet>=1.1.2
+inflect
+jedi>=0.18.1
+librosa
+lxml
+matplotlib
+msgpack>=1.0.4
+parso>=0.8.3
+pexpect>=4.8.0
+pickleshare>=0.7.5
+prompt-toolkit>=3.0.30
+protobuf
+ptflops
+ptyprocess>=0.7.0
+pygments>=2.12.0
+pysptk>=0.1.15,<0.2.0
+pytorch_wavelets
+PyWavelets>=1.0.0
+scikit-learn
+sox
+tensorboardx
+tqdm
+traitlets>=5.3.0
+ttsfrd>=0.1.1
+unidecode
+wcwidth>=0.2.5
diff --git a/requirements/cv.txt b/requirements/cv.txt
index 17bd6ecf..8cf58f77 100644
--- a/requirements/cv.txt
+++ b/requirements/cv.txt
@@ -1,9 +1,14 @@
+accelerate
 albumentations>=1.0.3
 av>=9.2.0
 bmt_clipit>=1.0
+chumpy
 clip>=1.0
+ddpm_guided_diffusion
+diffusers
 easydict
 easyrobust
+face_alignment>=1.3.5
 fairscale>=0.4.1
 fastai>=1.0.51
 ffmpeg>=1.4
@@ -19,26 +24,38 @@ lpips
 ml_collections
 mmcls>=0.21.0
 mmdet>=2.25.0
+# mmdet3d-1.0.0rc6 remove networkx and numba version restriction
+mmdet3d==1.0.0a1
+mmsegmentation
 moviepy>=1.0.3
-networkx>=2.5
+nerfacc==0.2.2
+networkx
 numba
+omegaconf
 onnxruntime>=1.10
+open-clip-torch>=2.7.0
 opencv-python
 pai-easycv>=0.8
+paint_ldm
 pandas
 panopticapi
 plyfile>=0.7.4
 psutil
+PyMCubes
+pytorch-lightning
 regex
 scikit-image>=0.19.3
 scikit-learn>=0.20.1
 shapely
 shotdetect_scenedetect_lgss
+smplx
 tensorflow-estimator>=1.15.1
 tf_slim
 timm>=0.4.9
 torchmetrics>=0.6.2
+torchsummary>=1.5.1
 torchvision
+transformers>=4.26.0
 ujson
 utils
 videofeatures_clipit>=1.0
diff --git a/requirements/framework.txt b/requirements/framework.txt
index d5b4cefb..9a6a8998 100644
--- a/requirements/framework.txt
+++ b/requirements/framework.txt
@@ -1,7 +1,6 @@
 addict
 attrs
-datasets>=2.7.0
-easydict
+datasets>=2.7.0,<=2.8.0
 einops
 filelock>=3.3.0
 gast>=0.2.2
@@ -9,12 +8,11 @@ jsonplus
 numpy
 oss2
 Pillow>=6.2.0
-# for pyarrow 9.0.0 event_loop core dump
+# pyarrow 9.0.0 introduced event_loop core dump
 pyarrow>=6.0.0,!=9.0.0
 pyyaml
 requests
 scipy
 setuptools
-tensorboard
 tqdm>=4.64.0
 yapf
diff --git a/requirements/nlp.txt b/requirements/nlp.txt
index b566c5b6..2e22ef8f 100644
--- a/requirements/nlp.txt
+++ b/requirements/nlp.txt
@@ -20,7 +20,6 @@ seqeval
 spacy>=2.3.5
 subword_nmt>=0.3.8
 termcolor
-text2sql_lgesql
 tokenizers
 transformers>=4.12.0
 zhconv
diff --git a/setup.cfg b/setup.cfg
index 6a5a66e5..bfee5eec 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -24,5 +24,8 @@ select = B,C,E,F,P,T4,W,B9
 ignore = F401,F405,F821,W503,E251
 exclude = docs/src,*.pyi,.git
 
+[darglint]
+ignore=DAR101
+
 [easy_install]
 index-url=https://pypi.tuna.tsinghua.edu.cn/simple
diff --git a/setup.py b/setup.py
index 210c211c..07831d36 100644
--- a/setup.py
+++ b/setup.py
@@ -83,7 +83,9 @@ def parse_requirements(fname='requirements.txt', with_version=True):
         if line.startswith('-r '):
             # Allow specifying requirements in other files
             target = line.split(' ')[1]
-            for info in parse_require_file(target):
+            relative_base = os.path.dirname(fname)
+            absolute_target = os.path.join(relative_base, target)
+            for info in parse_require_file(absolute_target):
                 yield info
         else:
             info = {'line': line}
@@ -186,7 +188,10 @@ if __name__ == '__main__':
         # result in mac/windows compatibility problems
         if field != Fields.audio:
             all_requires.append(extra_requires[field])
-
+    for subfiled in ['asr', 'kws', 'signal', 'tts']:
+        filed_name = f'audio_{subfiled}'
+        extra_requires[filed_name], _ = parse_requirements(
+            f'requirements/audio/{filed_name}.txt')
     extra_requires['all'] = all_requires
 
     setup(
diff --git a/tests/export/test_export_csanmt_model.py b/tests/export/test_export_csanmt_model.py
new file mode 100644
index 00000000..84e1d7c0
--- /dev/null
+++ b/tests/export/test_export_csanmt_model.py
@@ -0,0 +1,40 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
+from tensorflow.keras.preprocessing import image
+
+from modelscope.exporters import TfModelExporter
+from modelscope.models import Model
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import compare_arguments_nested, test_level
+
+
+class TestExportTfModel(unittest.TestCase):
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir)
+        super().tearDown()
+
+    @unittest.skipUnless(test_level() >= 2,
+                         'test with numpy version == 1.18.1')
+    def test_export_csanmt(self):
+        model = Model.from_pretrained('damo/nlp_csanmt_translation_en2zh_base')
+        print(
+            TfModelExporter.from_model(model).export_saved_model(
+                output_dir=self.tmp_dir))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/export/test_export_sbert_sequence_classification.py b/tests/export/test_export_sbert_sequence_classification.py
index dc02cf18..8ef27efd 100644
--- a/tests/export/test_export_sbert_sequence_classification.py
+++ b/tests/export/test_export_sbert_sequence_classification.py
@@ -38,7 +38,7 @@ class TestExportSbertSequenceClassification(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_export_bert_sequence_classification(self):
         model = Model.from_pretrained(
-            self.model_id_bert, task=Tasks.text_classification)
+            self.model_id_bert, task=Tasks.text_classification, num_labels=2)
         print(
             Exporter.from_model(model).export_onnx(
                 shape=(2, 256), output_dir=self.tmp_dir))
diff --git a/tests/export/test_export_sbert_zero_shot_classification.py b/tests/export/test_export_sbert_zero_shot_classification.py
new file mode 100644
index 00000000..52d1cba4
--- /dev/null
+++ b/tests/export/test_export_sbert_zero_shot_classification.py
@@ -0,0 +1,47 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+from collections import OrderedDict
+
+from modelscope.exporters import Exporter, TorchModelExporter
+from modelscope.models import Model
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class TestExportSbertZeroShotClassification(unittest.TestCase):
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+        self.model_id = 'damo/nlp_structbert_zero-shot-classification_chinese-base'
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir)
+        super().tearDown()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_export_sbert_sequence_classification(self):
+        model = Model.from_pretrained(self.model_id)
+        print(
+            Exporter.from_model(model).export_onnx(
+                candidate_labels=[
+                    '文化', '体育', '娱乐', '财经', '家居', '汽车', '教育', '科技', '军事'
+                ],
+                hypothesis_template='这篇文章的标题是{}',
+                output_dir=self.tmp_dir))
+        print(
+            Exporter.from_model(model).export_torch_script(
+                candidate_labels=[
+                    '文化', '体育', '娱乐', '财经', '家居', '汽车', '教育', '科技', '军事'
+                ],
+                hypothesis_template='这篇文章的标题是{}',
+                output_dir=self.tmp_dir))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/export/test_export_tf_model.py b/tests/export/test_export_tf_model.py
index 723c3d1d..8a2a87ce 100644
--- a/tests/export/test_export_tf_model.py
+++ b/tests/export/test_export_tf_model.py
@@ -10,7 +10,6 @@ from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
 from tensorflow.keras.preprocessing import image
 
 from modelscope.exporters import TfModelExporter
-from modelscope.utils.regress_test_utils import compare_arguments_nested
 from modelscope.utils.test_utils import test_level
 
 
diff --git a/tests/hub/test_hub_operation.py b/tests/hub/test_hub_operation.py
index 44f3eea1..6d1347c2 100644
--- a/tests/hub/test_hub_operation.py
+++ b/tests/hub/test_hub_operation.py
@@ -49,7 +49,7 @@ class HubOperationTest(unittest.TestCase):
         repo.tag_and_push(self.revision, 'Test revision')
 
     def test_model_repo_creation(self):
-        # change to proper model names before use
+        # change to proper model names before use.
         try:
             info = self.api.get_model(model_id=self.model_id)
             assert info['Name'] == self.model_name
diff --git a/tests/models/test_base_torch.py b/tests/models/test_base_torch.py
index c147259b..40d1413a 100644
--- a/tests/models/test_base_torch.py
+++ b/tests/models/test_base_torch.py
@@ -1,5 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+import os
+import shutil
+import tempfile
 import unittest
 
 import numpy as np
@@ -12,6 +15,16 @@ from modelscope.models.base import TorchModel
 
 class TorchBaseTest(unittest.TestCase):
 
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir)
+        super().tearDown()
+
     def test_custom_model(self):
 
         class MyTorchModel(TorchModel):
@@ -55,6 +68,18 @@ class TorchBaseTest(unittest.TestCase):
         self.assertEqual((1, 20, 2, 2), out.shape)
         self.assertTrue(np.all(out.detach().numpy() > (add_bias - 10)))
 
+    def test_save_pretrained(self):
+        model = TorchModel.from_pretrained(
+            'damo/nlp_structbert_sentence-similarity_chinese-tiny')
+        save_path = os.path.join(self.tmp_dir, 'test_save_pretrained')
+        model.save_pretrained(
+            save_path, save_checkpoint_names='pytorch_model.bin')
+        self.assertTrue(
+            os.path.isfile(os.path.join(save_path, 'pytorch_model.bin')))
+        self.assertTrue(
+            os.path.isfile(os.path.join(save_path, 'configuration.json')))
+        self.assertTrue(os.path.isfile(os.path.join(save_path, 'vocab.txt')))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py
index 6cc1dc51..2bea0c4c 100644
--- a/tests/msdatasets/test_ms_dataset.py
+++ b/tests/msdatasets/test_ms_dataset.py
@@ -4,6 +4,7 @@ import unittest
 
 from modelscope.models import Model
 from modelscope.msdatasets import MsDataset
+from modelscope.msdatasets.audio.asr_dataset import ASRDataset
 from modelscope.preprocessors import TextClassificationTransformersPreprocessor
 from modelscope.preprocessors.base import Preprocessor
 from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DownloadMode
@@ -111,6 +112,12 @@ class MsDatasetTest(unittest.TestCase):
             drop_remainder=True)
         print(next(iter(tf_dataset)))
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_to_dataset_asr(self):
+        ms_ds_asr = ASRDataset.load(
+            'speech_asr_aishell1_trainsets', namespace='speech_asr')
+        print(next(iter(ms_ds_asr['train'])))
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     @require_torch
     def test_to_torch_dataset_img(self):
@@ -197,7 +204,7 @@ class MsDatasetTest(unittest.TestCase):
         assert isinstance(data_example['Noisy Image:FILE:Object'],
                           PngImageFile)
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_to_ms_dataset(self):
         """Test case for converting huggingface dataset to `MsDataset` instance."""
         from datasets.load import load_dataset
diff --git a/tests/pipelines/test_abnormal_object_detection.py b/tests/pipelines/test_abnormal_object_detection.py
new file mode 100644
index 00000000..fbce51c6
--- /dev/null
+++ b/tests/pipelines/test_abnormal_object_detection.py
@@ -0,0 +1,29 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_object_detection
+        self.model_id = 'damo/cv_resnet50_object-detection_maskscoring'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_abnormal_object_detection(self):
+        input_location = 'data/test/images/image_detection.jpg'
+        object_detect = pipeline(self.task, model=self.model_id)
+        result = object_detect(input_location)
+        print(result)
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_adaint_image_color_enhance.py b/tests/pipelines/test_adaint_image_color_enhance.py
new file mode 100644
index 00000000..e36a85ec
--- /dev/null
+++ b/tests/pipelines/test_adaint_image_color_enhance.py
@@ -0,0 +1,50 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+import cv2
+import torch
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class AdaIntImageColorEnhanceTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_adaint_image-color-enhance-models'
+        self.task = Tasks.image_color_enhancement
+
+    def pipeline_inference(self, pipeline: Pipeline, input_location: str):
+        result = pipeline(input_location)
+        if result is not None:
+            cv2.imwrite('result.png', result[OutputKeys.OUTPUT_IMG])
+            print(f'Output written to {osp.abspath("result.png")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest only')
+    def test_run_modelhub(self):
+        img_color_enhance = pipeline(
+            Tasks.image_color_enhancement, model=self.model_id)
+        self.pipeline_inference(img_color_enhance,
+                                'data/test/images/image_color_enhance.png')
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest only')
+    def test_run_modelhub_default_model(self):
+        img_color_enhance = pipeline(Tasks.image_color_enhancement)
+        self.pipeline_inference(img_color_enhance,
+                                'data/test/images/image_color_enhance.png')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest only')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_bad_image_detecting.py b/tests/pipelines/test_bad_image_detecting.py
new file mode 100644
index 00000000..728da8d1
--- /dev/null
+++ b/tests/pipelines/test_bad_image_detecting.py
@@ -0,0 +1,67 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.cv import BadImageDetecingPipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class BadImageDetectingTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.bad_image_detecting
+        self.model_id = 'damo/cv_mobilenet-v2_bad-image-detecting'
+        self.test_img = 'data/test/images/dogs.jpg'
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        pipeline = BadImageDetecingPipeline(cache_path)
+        pipeline.group_key = self.task
+        out = pipeline(input=self.test_img)
+        labels = out[OutputKeys.LABELS]
+        scores = out[OutputKeys.SCORES]
+        print('pipeline: the out_label is {}'.format(labels))
+        print('pipeline: the out_score is {}'.format(scores))
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        pipeline_ins = pipeline(task=Tasks.bad_image_detecting, model=model)
+        out = pipeline_ins(input=self.test_img)
+        labels = out[OutputKeys.LABELS]
+        scores = out[OutputKeys.SCORES]
+        print('pipeline: the out_label is {}'.format(labels))
+        print('pipeline: the out_score is {}'.format(scores))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.bad_image_detecting, model=self.model_id)
+        out = pipeline_ins(input=self.test_img)
+        labels = out[OutputKeys.LABELS]
+        scores = out[OutputKeys.SCORES]
+        print('pipeline: the out_label is {}'.format(labels))
+        print('pipeline: the out_score is {}'.format(scores))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.bad_image_detecting)
+        out = pipeline_ins(input=self.test_img)
+        labels = out[OutputKeys.LABELS]
+        scores = out[OutputKeys.SCORES]
+        print('pipeline: the out_label is {}'.format(labels))
+        print('pipeline: the out_score is {}'.format(scores))
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_base.py b/tests/pipelines/test_base.py
index f913f4c0..75aa8afc 100644
--- a/tests/pipelines/test_base.py
+++ b/tests/pipelines/test_base.py
@@ -54,6 +54,79 @@ class CustomPipelineTest(unittest.TestCase):
         with self.assertRaises(TypeError):
             CustomPipeline1()
 
+    def test_batch(self):
+        import torch
+
+        dummy_task = 'dummy-task'
+        dummy_module = 'custom-batch'
+
+        @PIPELINES.register_module(
+            group_key=dummy_task, module_name=dummy_module)
+        class CustomBatchPipeline(Pipeline):
+
+            def __init__(self,
+                         config_file: str = None,
+                         model=None,
+                         preprocessor=None,
+                         **kwargs):
+                super().__init__(config_file, model, preprocessor, **kwargs)
+
+            def _batch(self, sample_list):
+                sample_batch = {'img': [], 'url': []}
+                for sample in sample_list:
+                    resized_img = torch.from_numpy(
+                        np.array(sample['img'].resize((640, 640))))
+                    sample_batch['img'].append(torch.unsqueeze(resized_img, 0))
+                    sample_batch['url'].append(sample['url'])
+
+                sample_batch['img'] = torch.concat(sample_batch['img'])
+                return sample_batch
+
+            def preprocess(self, input: Union[str,
+                                              'PIL.Image']) -> Dict[str, Any]:
+                """ Provide default implementation based on preprocess_cfg and user can reimplement it
+
+                """
+                if not isinstance(input, Image.Image):
+                    from modelscope.preprocessors import load_image
+                    data_dict = {'img': load_image(input), 'url': input}
+                else:
+                    data_dict = {'img': input}
+                return data_dict
+
+            def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+                """ Provide default implementation using self.model and user can reimplement it
+                """
+                inputs['img'] += 1
+                return inputs
+
+            def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+                inputs['url'] += 'dummy_end'
+                return inputs
+
+        self.assertTrue(dummy_module in PIPELINES.modules[dummy_task])
+        add_default_pipeline_info(dummy_task, dummy_module, overwrite=True)
+        pipe = pipeline(
+            task=dummy_task, pipeline_name=dummy_module, model=self.model_dir)
+
+        img_url = 'data/test/images/dogs.jpg'
+        output = pipe([img_url for _ in range(9)], batch_size=2)
+        for out in output:
+            self.assertEqual(out['url'], img_url + 'dummy_end')
+            self.assertEqual(out['img'].shape, (640, 640, 3))
+
+        pipe_nocollate = pipeline(
+            task=dummy_task,
+            pipeline_name=dummy_module,
+            model=self.model_dir,
+            auto_collate=False)
+
+        img_url = 'data/test/images/dogs.jpg'
+        output = pipe_nocollate([img_url for _ in range(9)], batch_size=2)
+        for out in output:
+            self.assertEqual(out['url'], img_url + 'dummy_end')
+            self.assertEqual(out['img'].shape, (640, 640, 3))
+
     def test_custom(self):
         dummy_task = 'dummy-task'
 
diff --git a/tests/pipelines/test_body_3d_keypoints_hdformer.py b/tests/pipelines/test_body_3d_keypoints_hdformer.py
new file mode 100644
index 00000000..2ebbc95b
--- /dev/null
+++ b/tests/pipelines/test_body_3d_keypoints_hdformer.py
@@ -0,0 +1,50 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import cv2
+import numpy as np
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class Body3DKeypointsHDFormerTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_hdformer_body-3d-keypoints_video'
+        self.test_video = 'data/test/videos/Walking.54138969.mp4'
+        self.task = Tasks.body_3d_keypoints
+
+    def pipeline_inference(self, pipeline: Pipeline, pipeline_input):
+        output = pipeline(pipeline_input, output_video='./result.mp4')
+        poses = np.array(output[OutputKeys.KEYPOINTS])
+        print(f'result 3d points shape {poses.shape}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub_with_video_file(self):
+        body_3d_keypoints = pipeline(
+            Tasks.body_3d_keypoints, model=self.model_id)
+        pipeline_input = self.test_video
+        self.pipeline_inference(
+            body_3d_keypoints, pipeline_input=pipeline_input)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub_with_video_stream(self):
+        body_3d_keypoints = pipeline(Tasks.body_3d_keypoints)
+        cap = cv2.VideoCapture(self.test_video)
+        if not cap.isOpened():
+            raise Exception('modelscope error: %s cannot be decoded by OpenCV.'
+                            % (self.test_video))
+        self.pipeline_inference(body_3d_keypoints, pipeline_input=cap)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_cartoon_stable_diffusion.py b/tests/pipelines/test_cartoon_stable_diffusion.py
new file mode 100644
index 00000000..751c7ea8
--- /dev/null
+++ b/tests/pipelines/test_cartoon_stable_diffusion.py
@@ -0,0 +1,86 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+
+import cv2
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class CartoonStableDiffusionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.text_to_image_synthesis
+        self.model_id = 'damo/cv_cartoon_stable_diffusion_design'
+        self.model_id_illu = 'damo/cv_cartoon_stable_diffusion_illustration'
+        self.model_id_watercolor = 'damo/cv_cartoon_stable_diffusion_watercolor'
+        self.model_id_flat = 'damo/cv_cartoon_stable_diffusion_flat'
+        self.model_id_clipart = 'damo/cv_cartoon_stable_diffusion_clipart'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_default(self):
+        pipe = pipeline(
+            task=self.task, model=self.model_id, model_revision='v1.0.0')
+        output = pipe(
+            {'text': 'sks style, a portrait painting of Johnny Depp'})
+        cv2.imwrite('result_design.png', output['output_imgs'][0])
+        print('Image saved to result_design.png')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_illustration(self):
+        pipe = pipeline(
+            task=self.task, model=self.model_id_illu, model_revision='v1.0.0')
+        output = pipe(
+            {'text': 'sks style, a portrait painting of Johnny Depp'})
+        cv2.imwrite('result_illu.png', output['output_imgs'][0])
+        print('Image saved to result_illu.png')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_watercolor(self):
+        pipe = pipeline(
+            task=self.task,
+            model=self.model_id_watercolor,
+            model_revision='v1.0.0')
+        output = pipe(
+            {'text': 'sks style, a portrait painting of Johnny Depp'})
+        cv2.imwrite('result_watercolor.png', output['output_imgs'][0])
+        print('Image saved to result_watercolor.png')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_flat(self):
+        pipe = pipeline(
+            task=self.task, model=self.model_id_flat, model_revision='v1.0.0')
+        output = pipe(
+            {'text': 'sks style, a portrait painting of Johnny Depp'})
+        cv2.imwrite('result_flat.png', output['output_imgs'][0])
+        print('Image saved to result_flat.png')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_clipart(self):
+        pipe = pipeline(
+            task=self.task,
+            model=self.model_id_clipart,
+            model_revision='v1.0.0')
+        output = pipe(
+            {'text': 'archer style, a portrait painting of Johnny Depp'})
+        cv2.imwrite('result_clipart.png', output['output_imgs'][0])
+        print('Image saved to result_clipart.png')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_eulerasolver(self):
+        from diffusers.schedulers import EulerAncestralDiscreteScheduler
+        pipe = pipeline(
+            task=self.task, model=self.model_id, model_revision='v1.0.0')
+        pipe.pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(
+            pipe.pipeline.scheduler.config)
+        output = pipe(
+            {'text': 'sks style, a portrait painting of Johnny Depp'})
+        cv2.imwrite('result_design2.png', output['output_imgs'][0])
+        print('Image saved to result_design2.png')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_content_check.py b/tests/pipelines/test_content_check.py
new file mode 100644
index 00000000..c68af257
--- /dev/null
+++ b/tests/pipelines/test_content_check.py
@@ -0,0 +1,29 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class ContentCheckTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_classification
+        self.model_id = 'damo/cv_resnet50_image-classification_cc'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run(self):
+        content_check_func = pipeline(self.task, model=self.model_id)
+        result = content_check_func('data/test/images/content_check.jpg')
+        print(result)
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_csanmt_translation.py b/tests/pipelines/test_csanmt_translation.py
index 74e12bb6..d989a6c4 100644
--- a/tests/pipelines/test_csanmt_translation.py
+++ b/tests/pipelines/test_csanmt_translation.py
@@ -26,6 +26,15 @@ class TranslationTest(unittest.TestCase, DemoCompatibilityCheck):
         pipeline_ins = pipeline(self.task, model=model_id)
         print(pipeline_ins(input=inputs))
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name_for_en2zh_batch(self):
+        model_id = 'damo/nlp_csanmt_translation_en2zh'
+        inputs = 'Elon Musk, co-founder and chief executive officer of Tesla Motors.' + \
+            '<SENT_SPLIT>' + "Alibaba Group's mission is to let the world have no difficult business" + \
+            '<SENT_SPLIT>' + 'Beijing is the capital of China.'
+        pipeline_ins = pipeline(self.task, model=model_id)
+        print(pipeline_ins(input=inputs))
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name_for_en2zh_base(self):
         model_id = 'damo/nlp_csanmt_translation_en2zh_base'
@@ -47,6 +56,13 @@ class TranslationTest(unittest.TestCase, DemoCompatibilityCheck):
         pipeline_ins = pipeline(self.task, model=model_id)
         print(pipeline_ins(input=inputs))
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name_for_en2ru(self):
+        model_id = 'damo/nlp_csanmt_translation_en2ru_base'
+        inputs = 'When I was in my 20s, I saw my very first psychotherapy client.'
+        pipeline_ins = pipeline(self.task, model=model_id)
+        print(pipeline_ins(input=inputs))
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name_for_fr2en(self):
         model_id = 'damo/nlp_csanmt_translation_fr2en'
@@ -61,6 +77,13 @@ class TranslationTest(unittest.TestCase, DemoCompatibilityCheck):
         pipeline_ins = pipeline(self.task, model=model_id)
         print(pipeline_ins(input=inputs))
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name_for_ru2en(self):
+        model_id = 'damo/nlp_csanmt_translation_ru2en_base'
+        inputs = 'Это всего лишь пример.'
+        pipeline_ins = pipeline(self.task, model=model_id)
+        print(pipeline_ins(input=inputs))
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         inputs = '声明补充说，沃伦的同事都深感震惊，并且希望他能够投案自首。'
diff --git a/tests/pipelines/test_ddpm_semantic_segmentation.py b/tests/pipelines/test_ddpm_semantic_segmentation.py
new file mode 100644
index 00000000..554636c4
--- /dev/null
+++ b/tests/pipelines/test_ddpm_semantic_segmentation.py
@@ -0,0 +1,34 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class DDPMImageSemanticSegmentationTest(unittest.TestCase,
+                                        DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_segmentation
+        self.model_id = 'damo/cv_diffusion_image-segmentation'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_ddpm_image_semantic_segmentation(self):
+        input_location = 'data/test/images/image_ffhq34_00041527.png'
+
+        pp = pipeline(Tasks.semantic_segmentation, model=self.model_id)
+        result = pp(input_location)
+        if result:
+            print(result)
+        else:
+            raise ValueError('process error')
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_deeplpf_image_color_enhance.py b/tests/pipelines/test_deeplpf_image_color_enhance.py
new file mode 100644
index 00000000..08b1a357
--- /dev/null
+++ b/tests/pipelines/test_deeplpf_image_color_enhance.py
@@ -0,0 +1,46 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+import cv2
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class DeepLPFImageColorEnhanceTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_deeplpfnet_image-color-enhance-models'
+        self.task = Tasks.image_color_enhancement
+
+    def pipeline_inference(self, pipeline: Pipeline, input_location: str):
+        result = pipeline(input_location)
+        if result is not None:
+            cv2.imwrite('result.png', result[OutputKeys.OUTPUT_IMG])
+            print(f'Output written to {osp.abspath("result.png")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        img_color_enhance = pipeline(
+            Tasks.image_color_enhancement, model=self.model_id)
+        self.pipeline_inference(img_color_enhance,
+                                'data/test/images/image_color_enhance.png')
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_modelhub_default_model(self):
+        img_color_enhance = pipeline(Tasks.image_color_enhancement)
+        self.pipeline_inference(img_color_enhance,
+                                'data/test/images/image_color_enhance.png')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_document_grounded_dialog_generate.py b/tests/pipelines/test_document_grounded_dialog_generate.py
new file mode 100644
index 00000000..da23fe19
--- /dev/null
+++ b/tests/pipelines/test_document_grounded_dialog_generate.py
@@ -0,0 +1,88 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path
+import unittest
+from threading import Thread
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.pipelines import pipeline
+from modelscope.preprocessors.nlp import \
+    DocumentGroundedDialogGeneratePreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class DocumentGroundedDialogGenerateTest(unittest.TestCase,
+                                         DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.document_grounded_dialog_generate
+        self.model_id = 'DAMO_ConvAI/nlp_convai_generation_pretrain'
+
+    param = {
+        'query': [
+            '<last_turn>：我想知道孩子如果出现阑尾炎的话会怎么样？',
+            '<last_turn>：好像是从肚脐开始，然后到右下方<system>您可以描述一下孩子的情况吗？<user>我想知道孩子如果出现阑尾炎的话会怎么样？',
+        ],
+        'context': [
+            ['c1', 'c2', 'c3', 'c4', 'c5'],
+            ['c1', 'c2', 'c3', 'c4', 'c5'],
+        ],
+        'label': [
+            '<response>您可以描述一下孩子的情况吗？',
+            '<response>那还有没有烦躁或无精打采的表现呢？',
+        ]
+    }
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id, revision='v1.0.0')
+        preprocessor = DocumentGroundedDialogGeneratePreprocessor(
+            model_dir=cache_path)
+        pipeline_ins = pipeline(
+            Tasks.document_grounded_dialog_generate,
+            model=cache_path,
+            preprocessor=preprocessor)
+        print(pipeline_ins(self.param))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download_with_multithreads(self):
+        cache_path = snapshot_download(self.model_id, revision='v1.0.0')
+        pl = pipeline(
+            Tasks.document_grounded_dialog_generate, model=cache_path)
+
+        def print_func(pl, i):
+            result = pl(self.param)
+            print(i, result)
+
+        procs = []
+        for i in range(5):
+            proc = Thread(target=print_func, args=(pl, i))
+            procs.append(proc)
+            proc.start()
+        for proc in procs:
+            proc.join()
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id, revision='v1.0.0')
+
+        preprocessor = DocumentGroundedDialogGeneratePreprocessor(
+            model_dir=model.model_dir)
+        pipeline_ins = pipeline(
+            Tasks.document_grounded_dialog_generate,
+            model=model,
+            preprocessor=preprocessor)
+        print(pipeline_ins(self.param))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(
+            task=Tasks.document_grounded_dialog_generate,
+            model_revision='v1.0.0')
+        print(pipeline_ins(self.param))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_document_grounded_dialog_rerank.py b/tests/pipelines/test_document_grounded_dialog_rerank.py
new file mode 100644
index 00000000..00dd8587
--- /dev/null
+++ b/tests/pipelines/test_document_grounded_dialog_rerank.py
@@ -0,0 +1,51 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import json
+import torch
+
+from modelscope.models import Model
+from modelscope.models.nlp import DocumentGroundedDialogRerankModel
+from modelscope.msdatasets import MsDataset
+from modelscope.pipelines.nlp import DocumentGroundedDialogRerankPipeline
+from modelscope.preprocessors.nlp import \
+    DocumentGroundedDialogRerankPreprocessor
+from modelscope.utils.constant import DownloadMode, Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class DocumentGroundedDialogRerankTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.task = Tasks.document_grounded_dialog_rerank
+        self.model_id = 'DAMO_ConvAI/nlp_convai_ranking_pretrain'
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run(self):
+        args = {
+            'output': '../../../result.json',
+            'max_batch_size': 64,
+            'exclude_instances': '',
+            'include_passages': False,
+            'do_lower_case': True,
+            'max_seq_length': 512,
+            'query_length': 195,
+            'tokenizer_resize': True,
+            'model_resize': True,
+            'kilt_data': True
+        }
+        model = Model.from_pretrained(self.model_id, revision='v1.0.0', **args)
+        mypreprocessor = DocumentGroundedDialogRerankPreprocessor(
+            model.model_dir, **args)
+        pipeline_ins = DocumentGroundedDialogRerankPipeline(
+            model=model, preprocessor=mypreprocessor, **args)
+        dataset = MsDataset.load(
+            'DAMO_ConvAI/FrDoc2BotRerank',
+            download_mode=DownloadMode.FORCE_REDOWNLOAD,
+            split='test')[:2]
+        # print(dataset)
+        pipeline_ins(dataset)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_document_grounded_dialog_retrieval.py b/tests/pipelines/test_document_grounded_dialog_retrieval.py
new file mode 100644
index 00000000..6bcca369
--- /dev/null
+++ b/tests/pipelines/test_document_grounded_dialog_retrieval.py
@@ -0,0 +1,84 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path
+import unittest
+from threading import Thread
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.pipelines import pipeline
+from modelscope.preprocessors.nlp import \
+    DocumentGroundedDialogRetrievalPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class DocumentGroundedDialogRetrievalTest(unittest.TestCase,
+                                          DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.document_grounded_dialog_retrieval
+        self.model_id = 'DAMO_ConvAI/nlp_convai_retrieval_pretrain'
+
+    param = {
+        'query': [
+            '<last_turn>我想知道孩子如果出现阑尾炎的话会怎么样',
+            '<last_turn>好像是从肚脐开始，然后到右下方<system>您可以描述一下孩子的情况吗？<user>我想知道孩子如果出现阑尾炎的话会怎么样？',
+        ],
+        'positive': ['阑尾炎', '肚脐开始'],
+        'negative': [
+            '肠胃炎',
+            '肚脐为止',
+        ]
+    }
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id, revision='v1.0.0')
+        preprocessor = DocumentGroundedDialogRetrievalPreprocessor(
+            model_dir=cache_path)
+        pipeline_ins = pipeline(
+            Tasks.document_grounded_dialog_retrieval,
+            model=cache_path,
+            preprocessor=preprocessor)
+        print(pipeline_ins(self.param))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download_with_multithreads(self):
+        cache_path = snapshot_download(self.model_id, revision='v1.0.0')
+        pl = pipeline(
+            Tasks.document_grounded_dialog_retrieval, model=cache_path)
+
+        def print_func(pl, i):
+            result = pl(self.param)
+            print(i, result)
+
+        procs = []
+        for i in range(5):
+            proc = Thread(target=print_func, args=(pl, i))
+            procs.append(proc)
+            proc.start()
+        for proc in procs:
+            proc.join()
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id, revision='v1.0.0')
+        preprocessor = DocumentGroundedDialogRetrievalPreprocessor(
+            model_dir=model.model_dir)
+        pipeline_ins = pipeline(
+            Tasks.document_grounded_dialog_retrieval,
+            model=model,
+            preprocessor=preprocessor)
+        print(pipeline_ins(self.param))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(
+            task=Tasks.document_grounded_dialog_retrieval,
+            model_revision='v1.0.0')
+        print(pipeline_ins(self.param))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_face_liveness_xc.py b/tests/pipelines/test_face_liveness_xc.py
new file mode 100644
index 00000000..91b46e01
--- /dev/null
+++ b/tests/pipelines/test_face_liveness_xc.py
@@ -0,0 +1,38 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+import cv2
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import draw_face_detection_no_lm_result
+from modelscope.utils.test_utils import test_level
+
+
+class FaceLivenessXcTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_manual_face-liveness_flxc'
+        self.img_path = 'data/test/images/face_liveness_rgb.png'
+
+    def show_result(self, img_path, detection_result):
+        img = draw_face_detection_no_lm_result(img_path, detection_result)
+        cv2.imwrite('result.png', img)
+        print(f'output written to {osp.abspath("result.png")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        face_detection = pipeline(Tasks.face_liveness, model=self.model_id)
+        result = face_detection(self.img_path)
+        self.show_result(self.img_path, result)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_default_model(self):
+        face_detection = pipeline(Tasks.face_liveness)
+        result = face_detection(self.img_path)
+        self.show_result(self.img_path, result)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_face_quality_assessment.py b/tests/pipelines/test_face_quality_assessment.py
new file mode 100644
index 00000000..33938b37
--- /dev/null
+++ b/tests/pipelines/test_face_quality_assessment.py
@@ -0,0 +1,39 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+import cv2
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import draw_face_detection_no_lm_result
+from modelscope.utils.test_utils import test_level
+
+
+class FaceQualityAssessmentTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_manual_face-quality-assessment_fqa'
+        self.img_path = 'data/test/images/face_recognition_1.png'
+
+    def show_result(self, img_path, detection_result):
+        img = draw_face_detection_no_lm_result(img_path, detection_result)
+        cv2.imwrite('result.png', img)
+        print(f'output written to {osp.abspath("result.png")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        face_detection = pipeline(
+            Tasks.face_quality_assessment, model=self.model_id)
+        result = face_detection(self.img_path)
+        self.show_result(self.img_path, result)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_default_model(self):
+        face_detection = pipeline(Tasks.face_quality_assessment)
+        result = face_detection(self.img_path)
+        self.show_result(self.img_path, result)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_face_recognition_onnx_fm.py b/tests/pipelines/test_face_recognition_onnx_fm.py
new file mode 100644
index 00000000..4fd6565c
--- /dev/null
+++ b/tests/pipelines/test_face_recognition_onnx_fm.py
@@ -0,0 +1,37 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import numpy as np
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class FmFaceRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.face_recognition
+        self.model_id = 'damo/cv_manual_face-recognition_frfm'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_face_compare(self):
+        img1 = 'data/test/images/face_recognition_1.png'
+        img2 = 'data/test/images/face_recognition_2.png'
+
+        face_recognition = pipeline(
+            Tasks.face_recognition, model=self.model_id)
+        emb1 = face_recognition(img1)[OutputKeys.IMG_EMBEDDING]
+        emb2 = face_recognition(img2)[OutputKeys.IMG_EMBEDDING]
+        sim = np.dot(emb1[0], emb2[0])
+        print(f'Cos similarity={sim:.3f}, img1:{img1}  img2:{img2}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_face_recognition_onnx_ir.py b/tests/pipelines/test_face_recognition_onnx_ir.py
new file mode 100644
index 00000000..12f82aa3
--- /dev/null
+++ b/tests/pipelines/test_face_recognition_onnx_ir.py
@@ -0,0 +1,37 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import numpy as np
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class IrFaceRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.face_recognition
+        self.model_id = 'damo/cv_manual_face-recognition_frir'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_face_compare(self):
+        img1 = 'data/test/images/ir_face_recognition_1.png'
+        img2 = 'data/test/images/ir_face_recognition_2.png'
+
+        face_recognition = pipeline(
+            Tasks.face_recognition, model=self.model_id)
+        emb1 = face_recognition(img1)[OutputKeys.IMG_EMBEDDING]
+        emb2 = face_recognition(img2)[OutputKeys.IMG_EMBEDDING]
+        sim = np.dot(emb1[0], emb2[0])
+        print(f'Cos similarity={sim:.3f}, img1:{img1}  img2:{img2}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_face_recognition_ood.py b/tests/pipelines/test_face_recognition_ood.py
index 06325e3b..041fd352 100644
--- a/tests/pipelines/test_face_recognition_ood.py
+++ b/tests/pipelines/test_face_recognition_ood.py
@@ -13,7 +13,7 @@ from modelscope.utils.test_utils import test_level
 class FaceRecognitionOodTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
-        self.task = Tasks.face_recognition_ood
+        self.task = Tasks.face_recognition
         self.model_id = 'damo/cv_ir_face-recognition-ood_rts'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@@ -21,8 +21,7 @@ class FaceRecognitionOodTest(unittest.TestCase, DemoCompatibilityCheck):
         img1 = 'data/test/images/face_recognition_1.png'
         img2 = 'data/test/images/face_recognition_2.png'
 
-        face_recognition = pipeline(
-            Tasks.face_recognition_ood, model=self.model_id)
+        face_recognition = pipeline(self.task, model=self.model_id)
         result1 = face_recognition(img1)
         emb1 = result1[OutputKeys.IMG_EMBEDDING]
         score1 = result1[OutputKeys.SCORES][0][0]
diff --git a/tests/pipelines/test_face_reconstruction.py b/tests/pipelines/test_face_reconstruction.py
new file mode 100644
index 00000000..d4370da3
--- /dev/null
+++ b/tests/pipelines/test_face_reconstruction.py
@@ -0,0 +1,52 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import sys
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models.cv.face_reconstruction.utils import write_obj
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+sys.path.append('.')
+
+
+class FaceReconstructionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.face_reconstruction
+        self.model_id = 'damo/cv_resnet50_face-reconstruction'
+        self.test_image = 'data/test/images/face_reconstruction.jpg'
+
+    def pipeline_inference(self, pipeline: Pipeline, input_location: str):
+        result = pipeline(input_location)
+        mesh = result[OutputKeys.OUTPUT]
+        write_obj('result_face_reconstruction.obj', mesh)
+        print(
+            f'Output written to {osp.abspath("result_face_reconstruction.obj")}'
+        )
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        model_dir = snapshot_download(self.model_id)
+        face_reconstruction = pipeline(
+            Tasks.face_reconstruction, model=model_dir)
+        self.pipeline_inference(face_reconstruction, self.test_image)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        face_reconstruction = pipeline(
+            Tasks.face_reconstruction, model=self.model_id)
+        self.pipeline_inference(face_reconstruction, self.test_image)
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_facial_landmark_confidence.py b/tests/pipelines/test_facial_landmark_confidence.py
index 7b5fc99f..dde1fcf0 100644
--- a/tests/pipelines/test_facial_landmark_confidence.py
+++ b/tests/pipelines/test_facial_landmark_confidence.py
@@ -25,7 +25,7 @@ class FacialLandmarkConfidenceTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub(self):
-        flcm = pipeline(Tasks.facial_landmark_confidence, model=self.model_id)
+        flcm = pipeline(Tasks.face_2d_keypoints, model=self.model_id)
         img_path = 'data/test/images/face_recognition_1.png'
         result = flcm(img_path)
         self.show_result(img_path, result)
diff --git a/tests/pipelines/test_faq_question_answering.py b/tests/pipelines/test_faq_question_answering.py
index 20c21755..dc29e385 100644
--- a/tests/pipelines/test_faq_question_answering.py
+++ b/tests/pipelines/test_faq_question_answering.py
@@ -21,6 +21,8 @@ class FaqQuestionAnsweringTest(unittest.TestCase, DemoCompatibilityCheck):
     def setUp(self) -> None:
         self.task = Tasks.faq_question_answering
         self.model_id = 'damo/nlp_structbert_faq-question-answering_chinese-base'
+        self.mgimn_model_id = 'damo/nlp_mgimn_faq-question-answering_chinese-base'
+        self.model_id_multilingual = 'damo/nlp_faq-question-answering_multilingual-base'
 
     param = {
         'query_set': ['如何使用优惠券', '在哪里领券', '在哪里领券'],
@@ -75,11 +77,28 @@ class FaqQuestionAnsweringTest(unittest.TestCase, DemoCompatibilityCheck):
         result = pipeline_ins(self.param)
         print(result)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_multilingual_model(self):
+        pipeline_ins = pipeline(
+            task=Tasks.faq_question_answering,
+            model=self.model_id_multilingual,
+            model_revision='v1.0.0')
+        result = pipeline_ins(self.param)
+        print(result)
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.faq_question_answering)
         print(pipeline_ins(self.param, max_seq_length=20))
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_mgimn_model(self):
+        pipeline_ins = pipeline(
+            task=Tasks.faq_question_answering,
+            model=self.mgimn_model_id,
+            model_revision='v1.0.0')
+        print(pipeline_ins(self.param, max_seq_length=20))
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_sentence_embedding(self):
         pipeline_ins = pipeline(task=Tasks.faq_question_answering)
diff --git a/tests/pipelines/test_feature_extraction.py b/tests/pipelines/test_feature_extraction.py
index 6bad602a..da6be1c0 100644
--- a/tests/pipelines/test_feature_extraction.py
+++ b/tests/pipelines/test_feature_extraction.py
@@ -5,7 +5,7 @@ import numpy as np
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp import FeatureExtractionModel
+from modelscope.models.nlp import ModelForFeatureExtraction
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import FeatureExtractionPipeline
@@ -28,7 +28,7 @@ class FeatureExtractionTaskModelTest(unittest.TestCase,
     def test_run_with_direct_file_download(self):
         cache_path = snapshot_download(self.model_id)
         tokenizer = FillMaskTransformersPreprocessor(cache_path, padding=False)
-        model = FeatureExtractionModel.from_pretrained(self.model_id)
+        model = ModelForFeatureExtraction.from_pretrained(self.model_id)
         pipeline1 = FeatureExtractionPipeline(model, preprocessor=tokenizer)
         pipeline2 = pipeline(
             Tasks.feature_extraction, model=model, preprocessor=tokenizer)
diff --git a/tests/pipelines/test_gridvlp_classification.py b/tests/pipelines/test_gridvlp_classification.py
new file mode 100644
index 00000000..18c6c582
--- /dev/null
+++ b/tests/pipelines/test_gridvlp_classification.py
@@ -0,0 +1,71 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.pipelines.multi_modal.gridvlp_pipeline import (
+    GridVlpClassificationPipeline, GridVlpEmbeddingPipeline)
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class GridVlpClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.model_id = 'rgtjf1/multi-modal_gridvlp_classification_chinese-base-ecom-cate'
+
+    text = '女装快干弹力轻型短裤448575'
+    image = 'https://yejiabo-public.oss-cn-zhangjiakou.aliyuncs.com/alinlp/clothes.png'
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_pipeline(self):
+
+        gridvlp_classification_pipeline = GridVlpClassificationPipeline(
+            'rgtjf1/multi-modal_gridvlp_classification_chinese-base-ecom-cate')
+        input_params = {'text': self.text, 'image': self.image}
+        inputs = gridvlp_classification_pipeline.preprocess(input_params)
+        outputs = gridvlp_classification_pipeline.forward(inputs)
+        print(f'text: {self.text}\nimage: {self.image}\n'
+              f'outputs: {outputs}')
+
+        gridvlp_classification_pipeline = GridVlpClassificationPipeline(
+            'rgtjf1/multi-modal_gridvlp_classification_chinese-base-ecom-cate-large'
+        )
+        input_params = {'text': self.text, 'image': self.image}
+        inputs = gridvlp_classification_pipeline.preprocess(input_params)
+        outputs = gridvlp_classification_pipeline.forward(inputs)
+        print(f'text: {self.text}\nimage: {self.image}\n'
+              f'outputs: {outputs}')
+
+        gridvlp_classification_pipeline = GridVlpClassificationPipeline(
+            'rgtjf1/multi-modal_gridvlp_classification_chinese-base-ecom-brand'
+        )
+        input_params = {'text': self.text, 'image': self.image}
+        inputs = gridvlp_classification_pipeline.preprocess(input_params)
+        outputs = gridvlp_classification_pipeline.forward(inputs)
+        print(f'text: {self.text}\nimage: {self.image}\n'
+              f'outputs: {outputs}')
+
+        gridvlp_classification_pipeline = GridVlpClassificationPipeline(
+            'rgtjf1/multi-modal_gridvlp_classification_chinese-base-similarity'
+        )
+        input_params = {'text': self.text, 'image': self.image}
+        inputs = gridvlp_classification_pipeline.preprocess(input_params)
+        outputs = gridvlp_classification_pipeline.forward(inputs)
+        print(f'text: {self.text}\nimage: {self.image}\n'
+              f'outputs: {outputs}')
+
+        gridvlp_embedding_pipeline = GridVlpEmbeddingPipeline(
+            'rgtjf1/multi-modal_gridvlp_classification_chinese-base-ecom-embedding'
+        )
+        input_params = {'text': self.text, 'image': self.image}
+        inputs = gridvlp_embedding_pipeline.preprocess(input_params)
+        outputs = gridvlp_embedding_pipeline.forward(inputs)
+        print(f'text: {self.text}\nimage: {self.image}\n'
+              f'outputs shape: {outputs.shape}')
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_image_debanding.py b/tests/pipelines/test_image_debanding.py
new file mode 100644
index 00000000..105d1f45
--- /dev/null
+++ b/tests/pipelines/test_image_debanding.py
@@ -0,0 +1,45 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+import cv2
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class ImageColorEnhanceTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_rrdb_image-debanding'
+        self.task = Tasks.image_debanding
+
+    def pipeline_inference(self, pipeline: Pipeline, input_location: str):
+        result = pipeline(input_location)
+        if result is not None:
+            cv2.imwrite('result.png', result[OutputKeys.OUTPUT_IMG])
+            print(f'Output written to {osp.abspath("result.png")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        img_debanding = pipeline(Tasks.image_debanding, model=self.model_id)
+        self.pipeline_inference(img_debanding,
+                                'data/test/images/image_debanding.png')
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_modelhub_default_model(self):
+        img_debanding = pipeline(Tasks.image_debanding)
+        self.pipeline_inference(img_debanding,
+                                'data/test/images/image_debanding.png')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_image_deblur.py b/tests/pipelines/test_image_deblur.py
index 476263af..fc9d0101 100644
--- a/tests/pipelines/test_image_deblur.py
+++ b/tests/pipelines/test_image_deblur.py
@@ -18,7 +18,7 @@ class ImageDenoiseTest(unittest.TestCase, DemoCompatibilityCheck):
         self.task = Tasks.image_deblurring
         self.model_id = 'damo/cv_nafnet_image-deblur_gopro'
 
-    demo_image_path = 'data/test/images/blurry.jpg'
+    demo_image_path = 'data/test/images/GOPR0384_11_00-000001.png'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_by_direct_model_download(self):
diff --git a/tests/pipelines/test_image_defrcn_fewshot.py b/tests/pipelines/test_image_defrcn_fewshot.py
index 4658206a..d2ecde13 100644
--- a/tests/pipelines/test_image_defrcn_fewshot.py
+++ b/tests/pipelines/test_image_defrcn_fewshot.py
@@ -29,28 +29,33 @@ class ImageDefrcnFewShotTest(unittest.TestCase, DemoCompatibilityCheck):
 
         self.task = Tasks.image_fewshot_detection
         self.model_id = 'damo/cv_resnet101_detection_fewshot-defrcn'
-        self.image = 'data/test/images/image_voc2007_000001.jpg'
+        self.image = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_voc2007_000001.jpg'
+        self.revision = 'v1.1.0'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
-        model = Model.from_pretrained(self.model_id)
-        pipeline_defrcn = pipeline(task=self.task, model=model)
+        model = Model.from_pretrained(self.model_id, revision=self.revision)
+        pipeline_defrcn = pipeline(
+            task=self.task, model=model, model_revision=self.revision)
         print(pipeline_defrcn(input=self.image)[OutputKeys.LABELS])
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_name(self):
-        pipeline_defrcn = pipeline(task=self.task, model=self.model_id)
+        pipeline_defrcn = pipeline(
+            task=self.task, model=self.model_id, model_revision=self.revision)
         print(pipeline_defrcn(input=self.image)[OutputKeys.LABELS])
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
-        pipeline_defrcn = pipeline(task=self.task)
+        pipeline_defrcn = pipeline(
+            task=self.task, model_revision=self.revision)
         print(pipeline_defrcn(input=self.image)[OutputKeys.LABELS])
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_by_direct_model_download(self):
-        cache_path = snapshot_download(self.model_id)
-        pipeline_defrcn = pipeline(self.task, model=cache_path)
+        cache_path = snapshot_download(self.model_id, revision=self.revision)
+        pipeline_defrcn = pipeline(
+            self.task, model=cache_path, model_revision=self.revision)
         print(pipeline_defrcn(input=self.image)[OutputKeys.LABELS])
 
     @unittest.skip('demo compatibility test is only enabled on a needed-basis')
diff --git a/tests/pipelines/test_image_depth_estimation_bts.py b/tests/pipelines/test_image_depth_estimation_bts.py
new file mode 100644
index 00000000..084b176c
--- /dev/null
+++ b/tests/pipelines/test_image_depth_estimation_bts.py
@@ -0,0 +1,54 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import cv2
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class ImageDepthEstimationBtsTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_depth_estimation
+        self.model_id = 'damo/cv_densenet161_image-depth-estimation_bts'
+        self.image = 'data/test/images/image_depth_estimation.jpg'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        pipeline_bts = pipeline(task=self.task, model=model)
+        result = pipeline_bts(input=self.image)
+        depth_vis = result[OutputKeys.DEPTHS_COLOR]
+        cv2.imwrite('result_modelhub.jpg', depth_vis)
+        print('Test run with model from modelhub ok.')
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_bts = pipeline(task=self.task, model=self.model_id)
+        result = pipeline_bts(input=self.image)
+        depth_vis = result[OutputKeys.DEPTHS_COLOR]
+        cv2.imwrite('result_modelname.jpg', depth_vis)
+        print('Test run with model name ok.')
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        pipeline_bts = pipeline(self.task, model=cache_path)
+        result = pipeline_bts(input=self.image)
+        depth_vis = result[OutputKeys.DEPTHS_COLOR]
+        cv2.imwrite('result_snapshot.jpg', depth_vis)
+        print('Test run with snapshot ok.')
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_image_driving_perception.py b/tests/pipelines/test_image_driving_perception.py
new file mode 100644
index 00000000..2f28b7d3
--- /dev/null
+++ b/tests/pipelines/test_image_driving_perception.py
@@ -0,0 +1,68 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import os.path as osp
+import unittest
+
+import cv2
+
+import modelscope
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.cv.image_driving_perception import YOLOPv2
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.cv import ImageDrivingPerceptionPipeline
+from modelscope.preprocessors.image import LoadImage
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import \
+    show_image_driving_perception_result
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class ImageDrivingPerceptionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_yolopv2_image-driving-perception_bdd100k'
+        self.img_path = 'data/test/images/image_driving_perception.jpg'
+
+    def pipeline_inference(self, pipeline: Pipeline, img_path: str):
+        result = pipeline(img_path)
+        img = LoadImage.convert_to_ndarray(img_path)
+        img = cv2.resize(img, (1280, 720), interpolation=cv2.INTER_LINEAR)
+        show_image_driving_perception_result(
+            img, result, out_file='result.jpg', if_draw=[1, 1, 1])
+        print(f'Output written to {osp.abspath("result.jpg")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        image_driving_perception_pipeline = pipeline(
+            Tasks.image_driving_perception, model=self.model_id)
+        self.pipeline_inference(image_driving_perception_pipeline,
+                                self.img_path)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        image_driving_perception_pipeline = pipeline(
+            task=Tasks.image_driving_perception, model=model)
+        self.pipeline_inference(image_driving_perception_pipeline,
+                                self.img_path)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        model = YOLOPv2(cache_path)
+        image_driving_perception_pipeline = ImageDrivingPerceptionPipeline(
+            model, preprocessor=None)
+        self.pipeline_inference(image_driving_perception_pipeline,
+                                self.img_path)
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_image_human_parsing.py b/tests/pipelines/test_image_human_parsing.py
new file mode 100644
index 00000000..77d75862
--- /dev/null
+++ b/tests/pipelines/test_image_human_parsing.py
@@ -0,0 +1,48 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class ImageHumanParsingTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_segmentation
+        self.model_id_single = 'damo/cv_resnet101_image-single-human-parsing'
+        self.model_id_multiple = 'damo/cv_resnet101_image-multiple-human-parsing'
+
+    image_single = 'data/test/images/image_single_human_parsing.jpg'
+    image_multiple = 'data/test/images/image_multiple_human_parsing.jpg'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_parsing = pipeline(
+            task=Tasks.image_segmentation, model=self.model_id_single)
+        print(pipeline_parsing(input=self.image_single)[OutputKeys.LABELS])
+        pipeline_parsing = pipeline(
+            task=Tasks.image_segmentation, model=self.model_id_multiple)
+        print(pipeline_parsing(input=self.image_multiple)[OutputKeys.LABELS])
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id_single)
+        pipeline_parsing = pipeline(
+            task=Tasks.image_segmentation, model=model, preprocessor=None)
+        print(pipeline_parsing(input=self.image_single)[OutputKeys.LABELS])
+        model = Model.from_pretrained(self.model_id_multiple)
+        pipeline_parsing = pipeline(
+            task=Tasks.image_segmentation, model=model, preprocessor=None)
+        print(pipeline_parsing(input=self.image_multiple)[OutputKeys.LABELS])
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_image_inpainting_sdv2.py b/tests/pipelines/test_image_inpainting_sdv2.py
new file mode 100644
index 00000000..81002ce8
--- /dev/null
+++ b/tests/pipelines/test_image_inpainting_sdv2.py
@@ -0,0 +1,59 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import tempfile
+import unittest
+
+import cv2
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.cv import ImageInpaintingSDV2Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class ImageInpaintingSDV2Test(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_inpainting
+        self.model_id = 'damo/cv_stable-diffusion-v2_image-inpainting_base'
+        self.input_location = 'data/test/images/image_inpainting/image_inpainting_1.png'
+        self.input_mask_location = 'data/test/images/image_inpainting/image_inpainting_mask_1.png'
+        self.prompt = 'background'
+
+        self.input = {
+            'image': self.input_location,
+            'mask': self.input_mask_location,
+            'prompt': self.prompt
+        }
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        output_image_path = tempfile.NamedTemporaryFile(suffix='.png').name
+        cache_path = snapshot_download(self.model_id)
+        pipeline = ImageInpaintingSDV2Pipeline(cache_path)
+        pipeline.group_key = self.task
+        output = pipeline(input=self.input)[OutputKeys.OUTPUT_IMG]
+        cv2.imwrite(output_image_path, output)
+        print(
+            'pipeline: the output image path is {}'.format(output_image_path))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        output_image_path = tempfile.NamedTemporaryFile(suffix='.png').name
+        pipeline_ins = pipeline(
+            task=Tasks.image_inpainting, model=self.model_id)
+        output = pipeline_ins(input=self.input)[OutputKeys.OUTPUT_IMG]
+        cv2.imwrite(output_image_path, output)
+        print(
+            'pipeline: the output image path is {}'.format(output_image_path))
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_image_open_vocabulary_detection.py b/tests/pipelines/test_image_open_vocabulary_detection.py
new file mode 100644
index 00000000..28ae636e
--- /dev/null
+++ b/tests/pipelines/test_image_open_vocabulary_detection.py
@@ -0,0 +1,83 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import unittest
+
+import cv2
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import draw_box
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+
+logger = get_logger()
+
+
+class ImageOpenVocabularyDetectionTest(unittest.TestCase,
+                                       DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        os.system(
+            'pip install tensorflow==2.9.2 -i https://pypi.tuna.tsinghua.edu.cn/simple'
+        )
+        logger.info('upgrade tensorflow finished')
+
+        self.task = Tasks.open_vocabulary_detection
+        self.model_id = 'damo/cv_resnet152_open-vocabulary-detection_vild'
+        self.image = 'data/test/images/image_open_vocabulary_detection.jpg'
+        self.category_names = ';'.join([
+            'flipflop', 'street sign', 'bracelet', 'necklace', 'shorts',
+            'floral camisole', 'orange shirt', 'purple dress', 'yellow tee',
+            'green umbrella', 'pink striped umbrella', 'transparent umbrella',
+            'plain pink umbrella', 'blue patterned umbrella', 'koala',
+            'electric box', 'car', 'pole'
+        ])
+        self.input = {'img': self.image, 'category_names': self.category_names}
+
+    def tearDown(self) -> None:
+        os.system(
+            'pip install tensorflow-gpu==1.15 -i https://pypi.tuna.tsinghua.edu.cn/simple'
+        )
+        logger.info('degrade tensorflow finished')
+        return super().tearDown()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        vild_pipeline = pipeline(task=self.task, model=model)
+        result = vild_pipeline(input=self.input)
+        image = cv2.imread(self.image)
+        draw_box(image, result[OutputKeys.BOXES][0, :])
+        cv2.imwrite('result_modelhub.jpg', image)
+        print('Test run with model from modelhub ok.')
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        vild_pipeline = pipeline(task=self.task, model=self.model_id)
+        result = vild_pipeline(self.input)
+        image = cv2.imread(self.image)
+        draw_box(image, result[OutputKeys.BOXES][0, :])
+        cv2.imwrite('result_modelname.jpg', image)
+        print('Test run with model name ok.')
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        vild_pipeline = pipeline(self.task, model=cache_path)
+        result = vild_pipeline(input=self.input)
+        image = cv2.imread(self.image)
+        draw_box(image, result[OutputKeys.BOXES][0, :])
+        cv2.imwrite('result_snapshot.jpg', image)
+        print('Test run with snapshot ok.')
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_image_paint_by_example.py b/tests/pipelines/test_image_paint_by_example.py
new file mode 100644
index 00000000..70ee62f1
--- /dev/null
+++ b/tests/pipelines/test_image_paint_by_example.py
@@ -0,0 +1,65 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import cv2
+import torch
+from PIL import Image
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+
+logger = get_logger()
+
+
+class ImagePaintbyexampleTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.input_location = 'data/test/images/image_paint_by_example/image/example_1.png'
+        self.input_mask_location = 'data/test/images/image_paint_by_example/mask/example_1.png'
+        self.reference_location = 'data/test/images/image_paint_by_example/reference/example_1.jpg'
+        self.model_id = 'damo/cv_stable-diffusion_paint-by-example'
+        self.input = {
+            'img': self.input_location,
+            'mask': self.input_mask_location,
+            'reference': self.reference_location
+        }
+
+    def save_result(self, result):
+        vis_img = result[OutputKeys.OUTPUT_IMG]
+        vis_img.save('result.png')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest')
+    def test_paintbyexample(self):
+        paintbyexample = pipeline(
+            Tasks.image_paintbyexample, model=self.model_id)
+        result = paintbyexample(self.input)
+        if result:
+            self.save_result(result)
+        else:
+            raise ValueError('process error')
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest')
+    def test_paintbyexample_with_image(self):
+        paintbyexample = pipeline(
+            Tasks.image_paintbyexample, model=self.model_id)
+        img = Image.open(self.input_location)
+        mask = Image.open(self.input_mask_location)
+        reference = Image.open(self.reference_location)
+        result = paintbyexample({
+            'img': img,
+            'mask': mask,
+            'reference': reference
+        })
+        if result:
+            self.save_result(result)
+        else:
+            raise ValueError('process error')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_image_quality_assessment_degradation.py b/tests/pipelines/test_image_quality_assessment_degradation.py
new file mode 100644
index 00000000..cb0f24c7
--- /dev/null
+++ b/tests/pipelines/test_image_quality_assessment_degradation.py
@@ -0,0 +1,63 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import sys
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.cv import ImageQualityAssessmentDegradationPipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+sys.path.insert(0, '.')
+
+
+class ImageQualityAssessmentDegradationTest(unittest.TestCase,
+                                            DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_quality_assessment_degradation
+        self.model_id = 'damo/cv_resnet50_image-quality-assessment_degradation'
+        self.test_img = 'data/test/images/dogs.jpg'
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        pipeline = ImageQualityAssessmentDegradationPipeline(cache_path)
+        pipeline.group_key = self.task
+        out_path = pipeline(input=self.test_img)[OutputKeys.SCORES]
+        print('pipeline: the out_path is {}'.format(out_path))
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        pipeline_ins = pipeline(
+            task=Tasks.image_quality_assessment_degradation, model=model)
+        out_path = pipeline_ins(input=self.test_img)[OutputKeys.SCORES]
+        print('pipeline: the out_path is {}'.format(out_path))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.image_quality_assessment_degradation,
+            model=self.model_id)
+        out_path = pipeline_ins(input=self.test_img)[OutputKeys.SCORES]
+        print('pipeline: the out_path is {}'.format(out_path))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(
+            task=Tasks.image_quality_assessment_degradation)
+        out_path = pipeline_ins(input=self.test_img)[OutputKeys.SCORES]
+        print('pipeline: the out_path is {}'.format(out_path))
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_image_quality_assessment_mos.py b/tests/pipelines/test_image_quality_assessment_mos.py
new file mode 100644
index 00000000..608be8f8
--- /dev/null
+++ b/tests/pipelines/test_image_quality_assessment_mos.py
@@ -0,0 +1,56 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.cv import ImageQualityAssessmentMosPipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class ImageQualityAssessmentMosTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_quality_assessment_mos
+        self.model_id = 'damo/cv_resnet_image-quality-assessment-mos_youtubeUGC'
+        self.test_img = 'data/test/images/dogs.jpg'
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        pipeline = ImageQualityAssessmentMosPipeline(cache_path)
+        pipeline.group_key = self.task
+        out_path = pipeline(input=self.test_img)[OutputKeys.SCORE]
+        print('pipeline: the out_path is {}'.format(out_path))
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        pipeline_ins = pipeline(
+            task=Tasks.image_quality_assessment_mos, model=model)
+        out_path = pipeline_ins(input=self.test_img)[OutputKeys.SCORE]
+        print('pipeline: the out_path is {}'.format(out_path))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.image_quality_assessment_mos, model=self.model_id)
+        out_path = pipeline_ins(input=self.test_img)[OutputKeys.SCORE]
+        print('pipeline: the out_path is {}'.format(out_path))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.image_quality_assessment_mos)
+        out_path = pipeline_ins(input=self.test_img)[OutputKeys.SCORE]
+        print('pipeline: the out_path is {}'.format(out_path))
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_image_restoration.py b/tests/pipelines/test_image_restoration.py
new file mode 100644
index 00000000..baffa7d5
--- /dev/null
+++ b/tests/pipelines/test_image_restoration.py
@@ -0,0 +1,33 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class ImageRestorationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_demoireing
+        self.model_id = 'damo/cv_uhdm_image-demoireing'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_image_demoireing(self):
+        input_location = 'data/test/images/image_moire.jpg'
+        model_id = 'damo/cv_uhdm_image-demoireing'
+        image_demoire = pipeline(Tasks.image_demoireing, model=model_id)
+        result = image_demoire(input_location)
+        from PIL import Image
+        Image.fromarray(result[OutputKeys.OUTPUT_IMG]).save(input_location
+                                                            + '_demoire.jpg')
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_image_structured_model_probing.py b/tests/pipelines/test_image_structured_model_probing.py
new file mode 100644
index 00000000..563e131c
--- /dev/null
+++ b/tests/pipelines/test_image_structured_model_probing.py
@@ -0,0 +1,29 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class ImageStructuredModelProbingTest(unittest.TestCase,
+                                      DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_classification
+        self.model_id = 'damo/structured_model_probing'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+
+        recognition_pipeline = pipeline(self.task, self.model_id)
+        file_name = 'data/test/images/image_structured_model_probing_test_image.jpg'
+        result = recognition_pipeline(file_name)
+
+        print(f'recognition output: {result}.')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_interactive_translation_pipeline.py b/tests/pipelines/test_interactive_translation_pipeline.py
new file mode 100644
index 00000000..b973250a
--- /dev/null
+++ b/tests/pipelines/test_interactive_translation_pipeline.py
@@ -0,0 +1,37 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class InteractiveTranslationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.translation
+        self.model_id = 'damo/nlp_imt_translation_zh2en'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_imt_model_name_for_zh2en(self):
+        inputs = '声明补充说，沃伦的同事都深感震惊，并且希望他能够投案自首。'
+        prefix = 'The statement ad'
+        pipeline_ins = pipeline(self.task, model=self.model_id)
+        print(pipeline_ins(inputs + '<PREFIX_SPLIT>' + prefix))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_imt_model_name_for_en2zh(self):
+        model_id = 'damo/nlp_imt_translation_en2zh'
+        inputs = 'Elon Musk, co-founder and chief executive officer of Tesla Motors.'
+        prefix = '特斯拉汽车公司'
+        pipeline_ins = pipeline(self.task, model=model_id)
+        print(pipeline_ins(inputs + '<PREFIX_SPLIT>' + prefix))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_key_word_spotting.py b/tests/pipelines/test_key_word_spotting.py
index f31d212b..85f3370f 100644
--- a/tests/pipelines/test_key_word_spotting.py
+++ b/tests/pipelines/test_key_word_spotting.py
@@ -153,9 +153,35 @@ class KeyWordSpottingTest(unittest.TestCase, DemoCompatibilityCheck):
                     'fa_per_hour': 0.0
                 }]
             }
+        },
+        'test_run_with_all_models': {
+            'checking_item': [OutputKeys.KWS_LIST, 0, 'keyword'],
+            'checking_value': '小云小云',
+            'example': {
+                'kws_type':
+                'wav',
+                'kws_list': [{
+                    'keyword': '小云小云',
+                    'offset': 5.76,
+                    'length': 9.132938,
+                    'confidence': 0.990368
+                }],
+                'wav_count':
+                1
+            }
         }
     }
 
+    all_models_info = [{
+        'model_id': 'damo/speech_charctc_kws_phone-xiaoyun-commands',
+        'wav_path': 'data/test/audios/kws_xiaoyunxiaoyun.wav',
+        'keywords': '小云小云'
+    }, {
+        'model_id': 'damo/speech_charctc_kws_phone-xiaoyun',
+        'wav_path': 'data/test/audios/kws_xiaoyunxiaoyun.wav',
+        'keywords': '小云小云'
+    }]
+
     def setUp(self) -> None:
         self.model_id = 'damo/speech_charctc_kws_phone-xiaoyun'
         self.workspace = os.path.join(os.getcwd(), '.tmp')
@@ -296,6 +322,19 @@ class KeyWordSpottingTest(unittest.TestCase, DemoCompatibilityCheck):
             model_id=self.model_id, audio_in=audio_list)
         self.check_result('test_run_with_roc', kws_result)
 
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_all_models(self):
+        logger.info('test_run_with_all_models')
+        for item in self.all_models_info:
+            model_id = item['model_id']
+            wav_path = item['wav_path']
+            keywords = item['keywords']
+
+            logger.info('run with model_id:' + model_id)
+            kws_result = self.run_pipeline(
+                model_id=model_id, audio_in=wav_path, keywords=keywords)
+            self.check_result('test_run_with_all_models', kws_result)
+
     @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
         self.compatibility_check()
diff --git a/tests/pipelines/test_mobile_image_super_resolution.py b/tests/pipelines/test_mobile_image_super_resolution.py
new file mode 100644
index 00000000..2cc7adf0
--- /dev/null
+++ b/tests/pipelines/test_mobile_image_super_resolution.py
@@ -0,0 +1,47 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+import cv2
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class MobileImageSuperResolutionTest(unittest.TestCase,
+                                     DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_ecbsr_image-super-resolution_mobile'
+        self.img = 'data/test/images/butterfly_lrx2_y.png'
+        self.task = Tasks.image_super_resolution
+
+    def pipeline_inference(self, pipeline: Pipeline, img: str):
+        result = pipeline(img)
+        if result is not None:
+            cv2.imwrite('result.png', result[OutputKeys.OUTPUT_IMG])
+            print(f'Output written to {osp.abspath("result.png")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        super_resolution = pipeline(
+            Tasks.image_super_resolution, model=self.model_id)
+
+        self.pipeline_inference(super_resolution, self.img)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_modelhub_default_model(self):
+        super_resolution = pipeline(Tasks.image_super_resolution)
+        self.pipeline_inference(super_resolution, self.img)
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_motion_generation.py b/tests/pipelines/test_motion_generation.py
new file mode 100644
index 00000000..7938611c
--- /dev/null
+++ b/tests/pipelines/test_motion_generation.py
@@ -0,0 +1,32 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class MDMMotionGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.motion_generation
+        self.model_id = 'damo/cv_mdm_motion-generation'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run(self):
+        motion_generation_pipline = pipeline(self.task, model=self.model_id)
+        result = motion_generation_pipline(
+            'the person walked forward and is picking up his toolbox')
+        print('motion generation data shape:',
+              result[OutputKeys.KEYPOINTS].shape)
+        print('motion generation video file:', result[OutputKeys.OUTPUT_VIDEO])
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_msrresnetlite_video_super_resolution.py b/tests/pipelines/test_msrresnetlite_video_super_resolution.py
new file mode 100644
index 00000000..d79e9702
--- /dev/null
+++ b/tests/pipelines/test_msrresnetlite_video_super_resolution.py
@@ -0,0 +1,59 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.cv import VideoSuperResolutionPipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class MSRResNetLiteVSRTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.video_super_resolution
+        self.model_id = 'damo/cv_msrresnet_video-super-resolution_lite'
+        self.test_video = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/videos/000.mp4'
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        pipeline = VideoSuperResolutionPipeline(cache_path)
+        pipeline.group_key = self.task
+        out_video_path = pipeline(
+            input=self.test_video)[OutputKeys.OUTPUT_VIDEO]
+        print('pipeline: the output video path is {}'.format(out_video_path))
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        pipeline_ins = pipeline(task=Tasks.video_super_resolution, model=model)
+        out_video_path = pipeline_ins(
+            input=self.test_video)[OutputKeys.OUTPUT_VIDEO]
+        print('pipeline: the output video path is {}'.format(out_video_path))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.video_super_resolution, model=self.model_id)
+        out_video_path = pipeline_ins(
+            input=self.test_video)[OutputKeys.OUTPUT_VIDEO]
+        print('pipeline: the output video path is {}'.format(out_video_path))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.video_super_resolution)
+        out_video_path = pipeline_ins(
+            input=self.test_video)[OutputKeys.OUTPUT_VIDEO]
+        print('pipeline: the output video path is {}'.format(out_video_path))
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_multilingual_named_entity_recognition.py b/tests/pipelines/test_multilingual_named_entity_recognition.py
index 5ed019d9..ec134023 100644
--- a/tests/pipelines/test_multilingual_named_entity_recognition.py
+++ b/tests/pipelines/test_multilingual_named_entity_recognition.py
@@ -3,8 +3,7 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp import (LSTMCRFForNamedEntityRecognition,
-                                   TransformerCRFForNamedEntityRecognition)
+from modelscope.models.nlp import ModelForTokenClassificationWithCRF
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline
 from modelscope.preprocessors import NERPreprocessorThai, NERPreprocessorViet
@@ -33,8 +32,7 @@ class MultilingualNamedEntityRecognitionTest(unittest.TestCase,
     def test_run_tcrf_by_direct_model_download_thai(self):
         cache_path = snapshot_download(self.thai_tcrf_model_id)
         tokenizer = NERPreprocessorThai(cache_path)
-        model = TransformerCRFForNamedEntityRecognition(
-            cache_path, tokenizer=tokenizer)
+        model = ModelForTokenClassificationWithCRF.from_pretrained(cache_path)
         pipeline1 = NamedEntityRecognitionPipeline(
             model, preprocessor=tokenizer)
         pipeline2 = pipeline(
@@ -73,8 +71,7 @@ class MultilingualNamedEntityRecognitionTest(unittest.TestCase,
     def test_run_tcrf_by_direct_model_download_viet(self):
         cache_path = snapshot_download(self.viet_tcrf_model_id)
         tokenizer = NERPreprocessorViet(cache_path)
-        model = TransformerCRFForNamedEntityRecognition(
-            cache_path, tokenizer=tokenizer)
+        model = ModelForTokenClassificationWithCRF.from_pretrained(cache_path)
         pipeline1 = NamedEntityRecognitionPipeline(
             model, preprocessor=tokenizer)
         pipeline2 = pipeline(
diff --git a/tests/pipelines/test_multilingual_word_segmentation.py b/tests/pipelines/test_multilingual_word_segmentation.py
index da54fe02..f10e6d98 100644
--- a/tests/pipelines/test_multilingual_word_segmentation.py
+++ b/tests/pipelines/test_multilingual_word_segmentation.py
@@ -3,7 +3,7 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp import TransformerCRFForWordSegmentation
+from modelscope.models.nlp import ModelForTokenClassificationWithCRF
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import WordSegmentationThaiPipeline
 from modelscope.preprocessors import WordSegmentationPreprocessorThai
@@ -26,7 +26,7 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
     def test_run_by_direct_model_download(self):
         cache_path = snapshot_download(self.model_id)
         tokenizer = WordSegmentationPreprocessorThai(cache_path)
-        model = TransformerCRFForWordSegmentation.from_pretrained(cache_path)
+        model = ModelForTokenClassificationWithCRF.from_pretrained(cache_path)
         pipeline1 = WordSegmentationThaiPipeline(model, preprocessor=tokenizer)
         pipeline2 = pipeline(
             Tasks.word_segmentation, model=model, preprocessor=tokenizer)
diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py
index a7c790ef..175e9261 100644
--- a/tests/pipelines/test_named_entity_recognition.py
+++ b/tests/pipelines/test_named_entity_recognition.py
@@ -3,8 +3,8 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp import (LSTMCRFForNamedEntityRecognition,
-                                   TransformerCRFForNamedEntityRecognition)
+from modelscope.models.nlp import (LSTMForTokenClassificationWithCRF,
+                                   ModelForTokenClassificationWithCRF)
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline
 from modelscope.preprocessors import \
@@ -278,8 +278,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
     def test_run_tcrf_by_direct_model_download(self):
         cache_path = snapshot_download(self.tcrf_model_id)
         tokenizer = TokenClassificationTransformersPreprocessor(cache_path)
-        model = TransformerCRFForNamedEntityRecognition(
-            cache_path, tokenizer=tokenizer)
+        model = ModelForTokenClassificationWithCRF.from_pretrained(cache_path)
         pipeline1 = NamedEntityRecognitionPipeline(
             model, preprocessor=tokenizer)
         pipeline2 = pipeline(
@@ -295,8 +294,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
     def test_run_lcrf_by_direct_model_download(self):
         cache_path = snapshot_download(self.lcrf_model_id)
         tokenizer = TokenClassificationTransformersPreprocessor(cache_path)
-        model = LSTMCRFForNamedEntityRecognition(
-            cache_path, tokenizer=tokenizer)
+        model = LSTMForTokenClassificationWithCRF.from_pretrained(cache_path)
         pipeline1 = NamedEntityRecognitionPipeline(
             model, preprocessor=tokenizer)
         pipeline2 = pipeline(
diff --git a/tests/pipelines/test_nerf_recon_acc.py b/tests/pipelines/test_nerf_recon_acc.py
new file mode 100644
index 00000000..502d153e
--- /dev/null
+++ b/tests/pipelines/test_nerf_recon_acc.py
@@ -0,0 +1,64 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import torch
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class NeRFReconAccTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.image_face_fusion
+        self.model_id = 'damo/cv_nerf-3d-reconstruction-accelerate_damo'
+        self.video_path = 'data/test/videos/video_nerf_recon_test.mp4'
+        self.data_dir = 'data/test/videos/nerf_dir'
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest only')
+    def test_run_by_direct_model_download(self):
+        snapshot_path = snapshot_download(self.model_id)
+        print('snapshot_path: {}'.format(snapshot_path))
+        nerf_recon_acc = pipeline(
+            Tasks.nerf_recon_acc,
+            model=snapshot_path,
+        )
+
+        result = nerf_recon_acc(
+            dict(data_dir=self.data_dir, video_input_path=self.video_path))
+        print(result[OutputKeys.OUTPUT_VIDEO])
+        print('facefusion.test_run_direct_model_download done')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest only')
+    def test_run_modelhub(self):
+        nerf_recon_acc = pipeline(Tasks.nerf_recon_acc, model=self.model_id)
+
+        result = nerf_recon_acc(
+            dict(data_dir=self.data_dir, video_input_path=self.video_path))
+        print(result[OutputKeys.OUTPUT_VIDEO])
+        print('facefusion.test_run_modelhub done')
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest only')
+    def test_run_modelhub_default_model(self):
+        nerf_recon_acc = pipeline(Tasks.nerf_recon_acc)
+
+        result = nerf_recon_acc(
+            dict(data_dir=self.data_dir, video_input_path=self.video_path))
+        print(result[OutputKeys.OUTPUT_VIDEO])
+        print('facefusion.test_run_modelhub_default_model done')
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    @unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest only')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py
index 94689e96..9d985d25 100644
--- a/tests/pipelines/test_nli.py
+++ b/tests/pipelines/test_nli.py
@@ -17,6 +17,7 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck):
     def setUp(self) -> None:
         self.task = Tasks.nli
         self.model_id = 'damo/nlp_structbert_nli_chinese-base'
+        self.model_id_fact_checking = 'damo/nlp_structbert_fact-checking_chinese-base'
 
     sentence1 = '四川商务职业学院和四川财经职业学院哪个好？'
     sentence2 = '四川商务职业学院商务管理在哪个校区？'
@@ -52,6 +53,14 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck):
                 compare_fn=IgnoreKeyFn('.*intermediate_act_fn')):
             print(pipeline_ins(input=(self.sentence1, self.sentence2)))
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_fact_checking_model(self):
+        pipeline_ins = pipeline(
+            task=Tasks.nli,
+            model=self.model_id_fact_checking,
+            model_revision='v1.0.1')
+        print(pipeline_ins(input=(self.sentence1, self.sentence2)))
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.nli)
diff --git a/tests/pipelines/test_object_detecion_3d.py b/tests/pipelines/test_object_detecion_3d.py
new file mode 100644
index 00000000..bb0eebda
--- /dev/null
+++ b/tests/pipelines/test_object_detecion_3d.py
@@ -0,0 +1,57 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import os.path as osp
+import unittest
+
+import cv2
+
+from modelscope.msdatasets import MsDataset
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class ObjectDetection3DTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.object_detection_3d
+        self.model_id = 'damo/cv_object-detection-3d_depe'
+        ms_ds_nuscenes = MsDataset.load('nuScenes_mini', namespace='shaoxuan')
+
+        data_path = ms_ds_nuscenes.config_kwargs['split_config']
+        val_dir = data_path['validation']
+        self.val_root = val_dir + '/' + os.listdir(val_dir)[0] + '/'
+
+    def pipeline_inference(self, pipeline: Pipeline, sample_idx: int):
+        input_dict = {'data_root': self.val_root, 'sample_idx': sample_idx}
+
+        result = pipeline(input_dict, save_path='./depe_result')
+        if result is not None:
+            cv2.imwrite('result.jpg', result[OutputKeys.OUTPUT_IMG])
+            print(f'Output written to {osp.abspath("result.jpg")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        idx = 10
+        detect = pipeline(
+            self.task,
+            model=self.model_id,
+        )
+        self.pipeline_inference(detect, idx)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_modelhub_default_model(self):
+        idx = 0
+        detect = pipeline(self.task)
+        self.pipeline_inference(detect, idx)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_object_detection.py b/tests/pipelines/test_object_detection.py
index 64766c77..e4bf6b54 100644
--- a/tests/pipelines/test_object_detection.py
+++ b/tests/pipelines/test_object_detection.py
@@ -59,6 +59,15 @@ class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
         image_object_detection_auto.show_result(test_image, result,
                                                 'auto_demo_ret.jpg')
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_image_object_detection_dino_pipeline(self):
+        model_id = 'damo/cv_swinl_image-object-detection_dino'
+        test_image = 'data/test/images/image_detection.jpg'
+        image_object_detection_dino = pipeline(
+            Tasks.image_object_detection, model=model_id)
+        result = image_object_detection_dino(test_image)
+        print(result)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_ocr_detection.py b/tests/pipelines/test_ocr_detection.py
index f1c20f47..243e274b 100644
--- a/tests/pipelines/test_ocr_detection.py
+++ b/tests/pipelines/test_ocr_detection.py
@@ -13,6 +13,7 @@ class OCRDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
     def setUp(self) -> None:
         self.model_id = 'damo/cv_resnet18_ocr-detection-line-level_damo'
         self.model_id_vlpt = 'damo/cv_resnet50_ocr-detection-vlpt'
+        self.model_id_db = 'damo/cv_resnet18_ocr-detection-db-line-level_damo'
         self.test_image = 'data/test/images/ocr_detection.jpg'
         self.test_image_vlpt = 'data/test/images/ocr_detection_vlpt.jpg'
         self.task = Tasks.ocr_detection
@@ -32,6 +33,11 @@ class OCRDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
         ocr_detection = pipeline(Tasks.ocr_detection, model=self.model_id_vlpt)
         self.pipeline_inference(ocr_detection, self.test_image_vlpt)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_db_with_model_from_modelhub(self):
+        ocr_detection = pipeline(Tasks.ocr_detection, model=self.model_id_db)
+        self.pipeline_inference(ocr_detection, self.test_image)
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
         ocr_detection = pipeline(Tasks.ocr_detection)
diff --git a/tests/pipelines/test_ocr_recognition.py b/tests/pipelines/test_ocr_recognition.py
index 8d48dd7a..372a4bc4 100644
--- a/tests/pipelines/test_ocr_recognition.py
+++ b/tests/pipelines/test_ocr_recognition.py
@@ -13,7 +13,7 @@ from modelscope.utils.test_utils import test_level
 class OCRRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
-        self.model_id = 'damo/cv_convnextTiny_ocr-recognition-general_damo'
+        self.model_id = 'damo/cv_crnn_ocr-recognition-general_damo'
         self.test_image = 'data/test/images/ocr_recognition.jpg'
         self.task = Tasks.ocr_recognition
 
@@ -23,18 +23,25 @@ class OCRRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
-        ocr_recognition = pipeline(Tasks.ocr_recognition, model=self.model_id)
+        ocr_recognition = pipeline(
+            Tasks.ocr_recognition,
+            model=self.model_id,
+            model_revision='v1.0.0')
         self.pipeline_inference(ocr_recognition, self.test_image)
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_from_modelhub_PILinput(self):
-        ocr_recognition = pipeline(Tasks.ocr_recognition, model=self.model_id)
+        ocr_recognition = pipeline(
+            Tasks.ocr_recognition,
+            model=self.model_id,
+            model_revision='v1.0.0')
         imagePIL = PIL.Image.open(self.test_image)
         self.pipeline_inference(ocr_recognition, imagePIL)
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
-        ocr_recognition = pipeline(Tasks.ocr_recognition)
+        ocr_recognition = pipeline(
+            Tasks.ocr_recognition, model_revision='v2.0.0')
         self.pipeline_inference(ocr_recognition, self.test_image)
 
     @unittest.skip('demo compatibility test is only enabled on a needed-basis')
diff --git a/tests/pipelines/test_part_of_speech.py b/tests/pipelines/test_part_of_speech.py
index 359503b7..87c7c93a 100644
--- a/tests/pipelines/test_part_of_speech.py
+++ b/tests/pipelines/test_part_of_speech.py
@@ -4,8 +4,8 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp import (LSTMCRFForPartOfSpeech,
-                                   TokenClassificationModel)
+from modelscope.models.nlp import (LSTMForTokenClassificationWithCRF,
+                                   ModelForTokenClassification)
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import TokenClassificationPipeline
 from modelscope.preprocessors import \
@@ -23,7 +23,7 @@ class PartOfSpeechTest(unittest.TestCase):
     def test_run_by_direct_model_download(self):
         cache_path = snapshot_download(self.model_id)
         tokenizer = TokenClassificationTransformersPreprocessor(cache_path)
-        model = TokenClassificationModel.from_pretrained(cache_path)
+        model = ModelForTokenClassification.from_pretrained(cache_path)
         pipeline1 = TokenClassificationPipeline(model, preprocessor=tokenizer)
         pipeline2 = pipeline(
             Tasks.part_of_speech, model=model, preprocessor=tokenizer)
@@ -36,7 +36,7 @@ class PartOfSpeechTest(unittest.TestCase):
     def test_run_lstmcrf_news_by_direct_model_download(self):
         cache_path = snapshot_download(self.lstmcrf_news_model_id)
         tokenizer = TokenClassificationTransformersPreprocessor(cache_path)
-        model = LSTMCRFForPartOfSpeech.from_pretrained(cache_path)
+        model = LSTMForTokenClassificationWithCRF.from_pretrained(cache_path)
         pipeline1 = TokenClassificationPipeline(model, preprocessor=tokenizer)
         pipeline2 = pipeline(
             Tasks.part_of_speech, model=model, preprocessor=tokenizer)
diff --git a/tests/pipelines/test_person_image_cartoon.py b/tests/pipelines/test_person_image_cartoon.py
index b8549f4f..1dfaf519 100644
--- a/tests/pipelines/test_person_image_cartoon.py
+++ b/tests/pipelines/test_person_image_cartoon.py
@@ -20,6 +20,9 @@ class ImageCartoonTest(unittest.TestCase, DemoCompatibilityCheck):
         self.model_id_handdrawn = 'damo/cv_unet_person-image-cartoon-handdrawn_compound-models'
         self.model_id_sketch = 'damo/cv_unet_person-image-cartoon-sketch_compound-models'
         self.model_id_artstyle = 'damo/cv_unet_person-image-cartoon-artstyle_compound-models'
+        self.model_id_design = 'damo/cv_unet_person-image-cartoon-sd-design_compound-models'
+        self.model_id_illu = 'damo/cv_unet_person-image-cartoon-sd-illustration_compound-models'
+
         self.task = Tasks.image_portrait_stylization
         self.test_image = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_cartoon.png'
 
@@ -59,6 +62,22 @@ class ImageCartoonTest(unittest.TestCase, DemoCompatibilityCheck):
             Tasks.image_portrait_stylization, model=self.model_id_artstyle)
         self.pipeline_inference(img_cartoon, self.test_image)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub_design(self):
+        img_cartoon = pipeline(
+            Tasks.image_portrait_stylization,
+            model=self.model_id_design,
+            model_revision='v1.0.0')
+        self.pipeline_inference(img_cartoon, self.test_image)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub_illustration(self):
+        img_cartoon = pipeline(
+            Tasks.image_portrait_stylization,
+            model=self.model_id_illu,
+            model_revision='v1.0.0')
+        self.pipeline_inference(img_cartoon, self.test_image)
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
         img_cartoon = pipeline(Tasks.image_portrait_stylization)
diff --git a/tests/pipelines/test_plug_dialogue.py b/tests/pipelines/test_plug_dialogue.py
new file mode 100644
index 00000000..4d76b61c
--- /dev/null
+++ b/tests/pipelines/test_plug_dialogue.py
@@ -0,0 +1,45 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class PlugDialogueTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    know_list = [
+        '唐代诗人李白（701年—762年12月）,▂字太白,▂号青莲居士,▂又号“谪仙人”,▂唐代伟大的浪漫主义诗人,▂被后人誉为“诗仙”,▂与杜甫并称为“李杜”,▂为了与另两位诗人李商隐与杜牧即“小李杜”区别,▂杜甫与李白',
+        '白词”享有极为崇高的地位。李白▂主要成就▂创造了古代积极浪漫主义文学高峰、为唐诗的繁荣与发展打开了新局面、开创了中国古典诗歌的黄金时代',
+        '李白（701年—762年），字太白，号青莲居士，又号“谪仙人”。是唐代伟大的浪漫主义诗人，被后人誉为“诗仙”。与杜甫并称为“李杜”，为了与另两位诗人李商隐与杜牧即“小李杜”区别，杜甫与',
+    ]
+    input = {
+        'history': '你好[SEP]你好，我是小达，很高兴认识你！[SEP]李白是谁',
+        'knowledge': '[SEP]'.join(know_list),
+        'bot_profile':
+        '我是小达;我是女生;我是单身;我今年21岁;我生日是2001年11月11日;我是天蝎座;我现在在复旦大学上学;我家现在常住上海',
+        'user_profile': '你是小明'
+    }
+
+    def setUp(self) -> None:
+        self.task = Tasks.fid_dialogue
+        self.model_id = 'damo/plug-dialogue'
+        self.model_revision = 'v1.0.1'
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_pipeline(self):
+        pipeline_ins = pipeline(
+            task=self.task,
+            model=self.model_id,
+            model_revision=self.model_revision)
+        result = pipeline_ins(self.input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_realtime_object_detection.py b/tests/pipelines/test_realtime_object_detection.py
index e04f6b5c..498c09d8 100644
--- a/tests/pipelines/test_realtime_object_detection.py
+++ b/tests/pipelines/test_realtime_object_detection.py
@@ -8,42 +8,41 @@ from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.cv.image_utils import realtime_object_detection_bbox_vis
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.logger import get_logger
 from modelscope.utils.test_utils import test_level
 
+logger = get_logger()
+
 
 class RealtimeObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
-        self.model_id = 'damo/cv_cspnet_image-object-detection_yolox'
-        self.model_nano_id = 'damo/cv_cspnet_image-object-detection_yolox_nano_coco'
+        self.easycv_small_model_id = 'damo/cv_cspnet_image-object-detection_yolox'
+        self.easycv_nano_model_id = 'damo/cv_cspnet_image-object-detection_yolox_nano_coco'
         self.test_image = 'data/test/images/keypoints_detect/000000438862.jpg'
         self.task = Tasks.image_object_detection
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_run_modelhub(self):
+    def test_run_easycv_yolox(self):
         realtime_object_detection = pipeline(
-            Tasks.image_object_detection, model=self.model_id)
+            Tasks.image_object_detection, model=self.easycv_small_model_id)
 
         image = cv2.imread(self.test_image)
         result = realtime_object_detection(image)
         if result:
-            bboxes = result[OutputKeys.BOXES].astype(int)
-            image = realtime_object_detection_bbox_vis(image, bboxes)
-            cv2.imwrite('rt_obj_out.jpg', image)
+            logger.info(result)
         else:
             raise ValueError('process error')
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_run_nano(self):
+    def test_run_easycv_yolox_nano(self):
         realtime_object_detection = pipeline(
-            Tasks.image_object_detection, model=self.model_nano_id)
+            Tasks.image_object_detection, model=self.easycv_nano_model_id)
 
         image = cv2.imread(self.test_image)
         result = realtime_object_detection(image)
         if result:
-            bboxes = result[OutputKeys.BOXES].astype(int)
-            image = realtime_object_detection_bbox_vis(image, bboxes)
-            cv2.imwrite('rtnano_obj_out.jpg', image)
+            logger.info(result)
         else:
             raise ValueError('process error')
 
diff --git a/tests/pipelines/test_relation_extraction.py b/tests/pipelines/test_relation_extraction.py
index b7bbe131..17ab61fc 100644
--- a/tests/pipelines/test_relation_extraction.py
+++ b/tests/pipelines/test_relation_extraction.py
@@ -3,7 +3,7 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp import InformationExtractionModel
+from modelscope.models.nlp import ModelForInformationExtraction
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import InformationExtractionPipeline
 from modelscope.preprocessors import RelationExtractionTransformersPreprocessor
@@ -24,7 +24,7 @@ class RelationExtractionTest(unittest.TestCase, DemoCompatibilityCheck):
     def test_run_by_direct_model_download(self):
         cache_path = snapshot_download(self.model_id)
         tokenizer = RelationExtractionTransformersPreprocessor(cache_path)
-        model = InformationExtractionModel.from_pretrained(cache_path)
+        model = ModelForInformationExtraction.from_pretrained(cache_path)
         pipeline1 = InformationExtractionPipeline(
             model, preprocessor=tokenizer)
         pipeline2 = pipeline(
diff --git a/tests/pipelines/test_sentence_embedding.py b/tests/pipelines/test_sentence_embedding.py
index 06be4850..13260132 100644
--- a/tests/pipelines/test_sentence_embedding.py
+++ b/tests/pipelines/test_sentence_embedding.py
@@ -14,9 +14,13 @@ from modelscope.utils.test_utils import test_level
 
 class SentenceEmbeddingTest(unittest.TestCase):
     model_id = 'damo/nlp_corom_sentence-embedding_english-base'
+    tiny_model_id = 'damo/nlp_corom_sentence-embedding_english-tiny'
     ecom_base_model_id = 'damo/nlp_corom_sentence-embedding_chinese-base-ecom'
+    ecom_tiny_model_id = 'damo/nlp_corom_sentence-embedding_chinese-tiny-ecom'
     medical_base_model_id = 'damo/nlp_corom_sentence-embedding_chinese-base-medical'
+    medical_tiny_model_id = 'damo/nlp_corom_sentence-embedding_chinese-tiny-medical'
     general_base_model_id = 'damo/nlp_corom_sentence-embedding_chinese-base'
+    general_tiny_model_id = 'damo/nlp_corom_sentence-embedding_chinese-tiny'
 
     inputs = {
         'source_sentence': ["how long it take to get a master's degree"],
@@ -51,6 +55,25 @@ class SentenceEmbeddingTest(unittest.TestCase):
         ]
     }
 
+    general_inputs1 = {
+        'source_sentence': ['功和功率的区别'],
+        'sentences_to_compare': [
+            '功反映做功多少，功率反映做功快慢。',
+            '什么是有功功率和无功功率?无功功率有什么用什么是有功功率和无功功率?无功功率有什么用电力系统中的电源是由发电机产生的三相正弦交流电,在交>流电路中,由电源供给负载的电功率有两种;一种是有功功率,一种是无功功率。',
+            '优质解答在物理学中,用电功率表示消耗电能的快慢．电功率用P表示,它的单位是瓦特（Watt）,简称瓦（Wa）符号是W.电流在单位时间内做的功叫做电功率 以灯泡为例,电功率越大\
+             ,灯泡越亮.灯泡的亮暗由电功率（实际功率）决定,不由通过的电流、电压、电能决定!',
+        ]
+    }
+
+    general_inputs2 = {
+        'source_sentence': [
+            '功反映做功多少，功率反映做功快慢。',
+            '什么是有功功率和无功功率?无功功率有什么用什么是有功功率和无功功率?无功功率有什么用电力系统中的电源是由发电机产生的三相正弦交流电,在交>流电路中,由电源供给负载的电功率有两种;一种是有功功率,一种是无功功率。',
+            '优质解答在物理学中,用电功率表示消耗电能的快慢．电功率用P表示,它的单位是瓦特（Watt）,简称瓦（Wa）符号是W.电流在单位时间内做的功叫做电功率 以灯泡为例,电功率越大\
+             ,灯泡越亮.灯泡的亮暗由电功率（实际功率）决定,不由通过的电流、电压、电能决定!',
+        ]
+    }
+
     ecom_inputs1 = {
         'source_sentence': ['毛绒玩具'],
         'sentences_to_compare': ['大熊泰迪熊猫毛绒玩具公仔布娃娃抱抱熊', '背心式狗狗牵引绳']
@@ -144,6 +167,9 @@ class SentenceEmbeddingTest(unittest.TestCase):
         pipeline_ins = pipeline(
             task=Tasks.sentence_embedding, model=self.model_id)
         print(pipeline_ins(input=self.inputs))
+        pipeline_ins = pipeline(
+            task=Tasks.sentence_embedding, model=self.tiny_model_id)
+        print(pipeline_ins(input=self.inputs))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
@@ -155,12 +181,18 @@ class SentenceEmbeddingTest(unittest.TestCase):
         pipeline_ins = pipeline(
             task=Tasks.sentence_embedding, model=self.ecom_base_model_id)
         print(pipeline_ins(input=self.ecom_inputs2))
+        pipeline_ins = pipeline(
+            task=Tasks.sentence_embedding, model=self.ecom_tiny_model_id)
+        print(pipeline_ins(input=self.ecom_inputs2))
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_medical_model_with_model_name(self):
         pipeline_ins = pipeline(
             task=Tasks.sentence_embedding, model=self.medical_base_model_id)
         print(pipeline_ins(input=self.medical_inputs1))
+        pipeline_ins = pipeline(
+            task=Tasks.sentence_embedding, model=self.medical_tiny_model_id)
+        print(pipeline_ins(input=self.medical_inputs1))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_el_model(self):
@@ -168,6 +200,17 @@ class SentenceEmbeddingTest(unittest.TestCase):
             task=Tasks.sentence_embedding, model=self.el_model_id)
         print(pipeline_ins(input=self.el_inputs))
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_general_model_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.sentence_embedding, model=self.general_base_model_id)
+        print(pipeline_ins(input=self.general_inputs1))
+        print(pipeline_ins(input=self.general_inputs2))
+        pipeline_ins = pipeline(
+            task=Tasks.sentence_embedding, model=self.general_tiny_model_id)
+        print(pipeline_ins(input=self.general_inputs1))
+        print(pipeline_ins(input=self.general_inputs2))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py
index 486fadfa..846b72c3 100644
--- a/tests/pipelines/test_sentence_similarity.py
+++ b/tests/pipelines/test_sentence_similarity.py
@@ -18,6 +18,7 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck):
     def setUp(self) -> None:
         self.task = Tasks.sentence_similarity
         self.model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
+        self.model_id_retail = 'damo/nlp_structbert_sentence-similarity_chinese-retail-base'
 
     sentence1 = '今天气温比昨天高么？'
     sentence2 = '今天湿度比昨天高么？'
@@ -81,6 +82,14 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck):
                 compare_fn=IgnoreKeyFn('.*intermediate_act_fn')):
             print(pipeline_ins(input=(self.sentence1, self.sentence2)))
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_retail_similarity_model(self):
+        pipeline_ins = pipeline(
+            task=Tasks.sentence_similarity,
+            model=self.model_id_retail,
+            model_revision='v1.0.0')
+        print(pipeline_ins(input=(self.sentence1, self.sentence2)))
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.sentence_similarity)
diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py
index e0f823be..278f34a8 100644
--- a/tests/pipelines/test_sentiment_classification.py
+++ b/tests/pipelines/test_sentiment_classification.py
@@ -3,8 +3,8 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp.task_models.sequence_classification import \
-    SequenceClassificationModel
+from modelscope.models.nlp.task_models.text_classification import \
+    ModelForTextClassification
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import TextClassificationPipeline
 from modelscope.preprocessors import TextClassificationTransformersPreprocessor
@@ -26,7 +26,7 @@ class SentimentClassificationTaskModelTest(unittest.TestCase,
     def test_run_with_direct_file_download(self):
         cache_path = snapshot_download(self.model_id)
         tokenizer = TextClassificationTransformersPreprocessor(cache_path)
-        model = SequenceClassificationModel.from_pretrained(
+        model = ModelForTextClassification.from_pretrained(
             self.model_id, num_labels=2)
         pipeline1 = TextClassificationPipeline(model, preprocessor=tokenizer)
         pipeline2 = pipeline(
@@ -46,7 +46,7 @@ class SentimentClassificationTaskModelTest(unittest.TestCase,
             preprocessor=tokenizer)
         print(pipeline_ins(input=self.sentence1))
         self.assertTrue(
-            isinstance(pipeline_ins.model, SequenceClassificationModel))
+            isinstance(pipeline_ins.model, ModelForTextClassification))
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
@@ -54,14 +54,14 @@ class SentimentClassificationTaskModelTest(unittest.TestCase,
             task=Tasks.text_classification, model=self.model_id)
         print(pipeline_ins(input=self.sentence1))
         self.assertTrue(
-            isinstance(pipeline_ins.model, SequenceClassificationModel))
+            isinstance(pipeline_ins.model, ModelForTextClassification))
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.text_classification)
         print(pipeline_ins(input=self.sentence1))
         self.assertTrue(
-            isinstance(pipeline_ins.model, SequenceClassificationModel))
+            isinstance(pipeline_ins.model, ModelForTextClassification))
 
     @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
diff --git a/tests/pipelines/test_siamese_uie.py b/tests/pipelines/test_siamese_uie.py
new file mode 100644
index 00000000..9097813c
--- /dev/null
+++ b/tests/pipelines/test_siamese_uie.py
@@ -0,0 +1,76 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import json
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.nlp import SiameseUieModel
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.nlp import SiameseUiePipeline
+from modelscope.preprocessors import SiameseUiePreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool
+from modelscope.utils.test_utils import test_level
+
+
+class ZeroShotClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.siamese_uie
+        self.model_id = 'damo/nlp_structbert_siamese-uie_chinese-base'
+
+    sentence = '1944年毕业于北大的名古屋铁道会长谷口清太郎等人在日本积极筹资，共筹款2.7亿日元，参加捐款的日本企业有69家。'
+    schema = {'人物': None, '地理位置': None, '组织机构': None}
+    regress_tool = MsRegressTool(baseline=False)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_direct_file_download(self):
+        cache_path = snapshot_download(self.model_id)
+        tokenizer = SiameseUiePreprocessor(cache_path)
+        model = SiameseUieModel.from_pretrained(cache_path)
+        pipeline1 = SiameseUiePipeline(
+            model, preprocessor=tokenizer, model_revision='v1.0')
+        pipeline2 = pipeline(
+            Tasks.siamese_uie,
+            model=model,
+            preprocessor=tokenizer,
+            model_revision='v1.0')
+
+        print(
+            f'sentence: {self.sentence}\n'
+            f'pipeline1:{pipeline1(input=self.sentence, schema=json.dumps(self.schema, ensure_ascii=False))}'
+        )
+        print(f'sentence: {self.sentence}\n'
+              f'pipeline2: {pipeline2(self.sentence, schema=self.schema)}')
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        tokenizer = SiameseUiePreprocessor(model.model_dir)
+        pipeline_ins = pipeline(
+            task=Tasks.siamese_uie,
+            model=model,
+            preprocessor=tokenizer,
+            model_revision='v1.0')
+        print(pipeline_ins(input=self.sentence, schema=self.schema))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.siamese_uie, model=self.model_id, model_revision='v1.0')
+        print(pipeline_ins(input=self.sentence, schema=self.schema))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.siamese_uie, model_revision='v1.0')
+        print(pipeline_ins(input=self.sentence, schema=self.schema))
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_speaker_verification.py b/tests/pipelines/test_speaker_verification.py
new file mode 100644
index 00000000..addb9058
--- /dev/null
+++ b/tests/pipelines/test_speaker_verification.py
@@ -0,0 +1,49 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os.path
+import unittest
+from typing import Any, Dict, List
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+
+logger = get_logger()
+
+SPEAKER1_A_EN_16K_WAV = 'data/test/audios/speaker1_a_en_16k.wav'
+SPEAKER1_B_EN_16K_WAV = 'data/test/audios/speaker1_b_en_16k.wav'
+SPEAKER2_A_EN_16K_WAV = 'data/test/audios/speaker2_a_en_16k.wav'
+
+
+class SpeakerVerificationTest(unittest.TestCase, DemoCompatibilityCheck):
+    ecapatdnn_voxceleb_16k_model_id = 'damo/speech_ecapa-tdnn_sv_en_voxceleb_16k'
+
+    def setUp(self) -> None:
+        self.task = Tasks.speaker_verification
+
+    def run_pipeline(self, model_id: str, audios: List[str]) -> Dict[str, Any]:
+        p = pipeline(task=self.task, model=model_id)
+        result = p(audios)
+        return result
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_speaker_verification_ecapatdnn_voxceleb_16k(self):
+        logger.info(
+            'Run speaker verification for ecapatdnn_voxceleb_16k model')
+
+        result = self.run_pipeline(
+            model_id=self.ecapatdnn_voxceleb_16k_model_id,
+            audios=[SPEAKER1_A_EN_16K_WAV, SPEAKER2_A_EN_16K_WAV])
+        print(result)
+        self.assertTrue(OutputKeys.SCORE in result)
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_text2text_generation.py b/tests/pipelines/test_text2text_generation.py
index 6ce6a9b3..40576a29 100644
--- a/tests/pipelines/test_text2text_generation.py
+++ b/tests/pipelines/test_text2text_generation.py
@@ -19,6 +19,8 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
         self.input_generate = '中国的首都位于<extra_id_0>。'
         self.model_id_translate = 'damo/t5-translate-base-test'
         self.input_translate = 'My name is Wolfgang and I live in Berlin'
+        self.model_id_rewriting = 'damo/nlp_mt5_dialogue-rewriting_chinese-base'
+        self.input_rewriting = '杨阳胖吗[SEP]我一个同学叫杨阳[SEP]他多少斤'
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_T5(self):
@@ -48,6 +50,14 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck):
             task=Tasks.text2text_generation, model=self.model_id_translate)
         print(pipeline_ins(self.input_translate))
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_rewriting_model(self):
+        pipeline_ins = pipeline(
+            task=Tasks.text2text_generation,
+            model=self.model_id_rewriting,
+            model_revision='v1.0.1')
+        print(pipeline_ins(self.input_rewriting))
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_pipeline_with_model_id_batch(self):
         pipeline_ins = pipeline(
diff --git a/tests/pipelines/test_text_ranking.py b/tests/pipelines/test_text_ranking.py
index 40c4da58..26197410 100644
--- a/tests/pipelines/test_text_ranking.py
+++ b/tests/pipelines/test_text_ranking.py
@@ -14,6 +14,7 @@ from modelscope.utils.test_utils import test_level
 
 class TextRankingTest(unittest.TestCase):
     base_model_id = 'damo/nlp_corom_passage-ranking_english-base'
+    tiny_model_id = 'damo/nlp_corom_passage-ranking_english-tiny'
     inputs = {
         'source_sentence': ["how long it take to get a master's degree"],
         'sentences_to_compare': [
@@ -25,6 +26,7 @@ class TextRankingTest(unittest.TestCase):
     }
 
     chinese_base_model_id = 'damo/nlp_rom_passage-ranking_chinese-base'
+    chinese_tiny_model_id = 'damo/nlp_corom_passage-ranking_chinese-tiny'
     chinese_inputs = {
         'source_sentence': ['功和功率的区别'],
         'sentences_to_compare': [
@@ -36,12 +38,14 @@ class TextRankingTest(unittest.TestCase):
     }
 
     ecom_base_model_id = 'damo/nlp_corom_passage-ranking_chinese-base-ecom'
+    ecom_tiny_model_id = 'damo/nlp_corom_passage-ranking_chinese-tiny-ecom'
     ecom_inputs = {
         'source_sentence': ['毛绒玩具'],
         'sentences_to_compare': ['大熊泰迪熊猫毛绒玩具公仔布娃娃抱抱熊', '背心式狗狗牵引绳']
     }
 
     medical_base_model_id = 'damo/nlp_corom_passage-ranking_chinese-base-medical'
+    medical_tiny_model_id = 'damo/nlp_corom_passage-ranking_chinese-tiny-medical'
     medical_inputs = {
         'source_sentence': ['肠道不适可以服用益生菌吗'],
         'sentences_to_compare': ['肠胃不好能吃益生菌,益生菌有调节肠胃道菌群的作用', '身体发烧应该多喝水']
@@ -86,6 +90,9 @@ class TextRankingTest(unittest.TestCase):
         pipeline_ins = pipeline(
             task=Tasks.text_ranking, model=self.base_model_id)
         print(pipeline_ins(input=self.inputs))
+        pipeline_ins = pipeline(
+            task=Tasks.text_ranking, model=self.tiny_model_id)
+        print(pipeline_ins(input=self.inputs))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
@@ -97,18 +104,27 @@ class TextRankingTest(unittest.TestCase):
         pipeline_ins = pipeline(
             task=Tasks.text_ranking, model=self.chinese_base_model_id)
         print(pipeline_ins(input=self.chinese_inputs))
+        pipeline_ins = pipeline(
+            task=Tasks.text_ranking, model=self.chinese_tiny_model_id)
+        print(pipeline_ins(input=self.chinese_inputs))
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_ecom_model_with_model_name(self):
         pipeline_ins = pipeline(
             task=Tasks.text_ranking, model=self.ecom_base_model_id)
         print(pipeline_ins(input=self.ecom_inputs))
+        pipeline_tiny_ins = pipeline(
+            task=Tasks.text_ranking, model=self.ecom_tiny_model_id)
+        print(pipeline_tiny_ins(input=self.ecom_inputs))
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_medical_model_with_model_name(self):
         pipeline_ins = pipeline(
             task=Tasks.text_ranking, model=self.medical_base_model_id)
         print(pipeline_ins(input=self.medical_inputs))
+        pipeline_ins = pipeline(
+            task=Tasks.text_ranking, model=self.medical_tiny_model_id)
+        print(pipeline_ins(input=self.medical_inputs))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_el_model(self):
diff --git a/tests/pipelines/test_tinynas_detection.py b/tests/pipelines/test_tinynas_detection.py
index a73e7b0c..4c3735dc 100644
--- a/tests/pipelines/test_tinynas_detection.py
+++ b/tests/pipelines/test_tinynas_detection.py
@@ -153,6 +153,49 @@ class TinynasObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
             OutputKeys.LABELS in result) and (OutputKeys.BOXES in result)
         print('results: ', result)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_phone_detection_damoyolo(self):
+        tinynas_object_detection = pipeline(
+            Tasks.domain_specific_object_detection,
+            model='damo/cv_tinynas_object-detection_damoyolo_phone')
+        result = tinynas_object_detection('data/test/images/image_phone.jpg')
+        assert result and (OutputKeys.SCORES in result) and (
+            OutputKeys.LABELS in result) and (OutputKeys.BOXES in result)
+        print('results: ', result)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_phone_detection_damoyolo_with_image(self):
+        tinynas_object_detection = pipeline(
+            Tasks.domain_specific_object_detection,
+            model='damo/cv_tinynas_object-detection_damoyolo_phone')
+        img = Image.open('data/test/images/image_phone.jpg')
+        result = tinynas_object_detection(img)
+        assert result and (OutputKeys.SCORES in result) and (
+            OutputKeys.LABELS in result) and (OutputKeys.BOXES in result)
+        print('results: ', result)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_head_detection_damoyolo(self):
+        tinynas_object_detection = pipeline(
+            Tasks.domain_specific_object_detection,
+            model='damo/cv_tinynas_head-detection_damoyolo')
+        result = tinynas_object_detection(
+            'data/test/images/image_detection.jpg')
+        assert result and (OutputKeys.SCORES in result) and (
+            OutputKeys.LABELS in result) and (OutputKeys.BOXES in result)
+        print('results: ', result)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_head_detection_damoyolo_with_image(self):
+        tinynas_object_detection = pipeline(
+            Tasks.domain_specific_object_detection,
+            model='damo/cv_tinynas_head-detection_damoyolo')
+        img = Image.open('data/test/images/image_detection.jpg')
+        result = tinynas_object_detection(img)
+        assert result and (OutputKeys.SCORES in result) and (
+            OutputKeys.LABELS in result) and (OutputKeys.BOXES in result)
+        print('results: ', result)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_traffic_sign_detection.py b/tests/pipelines/test_traffic_sign_detection.py
new file mode 100644
index 00000000..5404649d
--- /dev/null
+++ b/tests/pipelines/test_traffic_sign_detection.py
@@ -0,0 +1,48 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+
+from PIL import Image
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class TrafficSignDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.domain_specific_object_detection
+        self.model_id = 'damo/cv_tinynas_object-detection_damoyolo_traffic_sign'
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_traffic_sign_detection_damoyolo(self):
+        tinynas_object_detection = pipeline(
+            Tasks.domain_specific_object_detection,
+            model='damo/cv_tinynas_object-detection_damoyolo_traffic_sign')
+        result = tinynas_object_detection(
+            'data/test/images/image_traffic_sign.jpg')
+        assert result and (OutputKeys.SCORES in result) and (
+            OutputKeys.LABELS in result) and (OutputKeys.BOXES in result)
+        print('results: ', result)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_traffic_sign_detection_damoyolo_with_image(self):
+        tinynas_object_detection = pipeline(
+            Tasks.domain_specific_object_detection,
+            model='damo/cv_tinynas_object-detection_damoyolo_traffic_sign')
+        img = Image.open('data/test/images/image_traffic_sign.jpg')
+        result = tinynas_object_detection(img)
+        assert result and (OutputKeys.SCORES in result) and (
+            OutputKeys.LABELS in result) and (OutputKeys.BOXES in result)
+        print('results: ', result)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_translation_evaluation.py b/tests/pipelines/test_translation_evaluation.py
index 76720ac0..53524fdc 100644
--- a/tests/pipelines/test_translation_evaluation.py
+++ b/tests/pipelines/test_translation_evaluation.py
@@ -42,6 +42,16 @@ class TranslationEvaluationTest(unittest.TestCase, DemoCompatibilityCheck):
         pipeline_ins.change_eval_mode(eval_mode=EvaluationMode.REF)
         print(pipeline_ins(input=input))
 
+        pipeline_ins = pipeline(
+            self.task, model=self.model_id_large, device='cpu')
+        print(pipeline_ins(input=input))
+
+        pipeline_ins.change_eval_mode(eval_mode=EvaluationMode.SRC)
+        print(pipeline_ins(input=input))
+
+        pipeline_ins.change_eval_mode(eval_mode=EvaluationMode.REF)
+        print(pipeline_ins(input=input))
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name_for_unite_base(self):
         input = {
@@ -68,6 +78,16 @@ class TranslationEvaluationTest(unittest.TestCase, DemoCompatibilityCheck):
         pipeline_ins.change_eval_mode(eval_mode=EvaluationMode.REF)
         print(pipeline_ins(input=input))
 
+        pipeline_ins = pipeline(
+            self.task, model=self.model_id_base, device='cpu')
+        print(pipeline_ins(input=input))
+
+        pipeline_ins.change_eval_mode(eval_mode=EvaluationMode.SRC)
+        print(pipeline_ins(input=input))
+
+        pipeline_ins.change_eval_mode(eval_mode=EvaluationMode.REF)
+        print(pipeline_ins(input=input))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/pipelines/test_universal_matting.py b/tests/pipelines/test_universal_matting.py
new file mode 100644
index 00000000..5868cf36
--- /dev/null
+++ b/tests/pipelines/test_universal_matting.py
@@ -0,0 +1,44 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+import cv2
+
+from modelscope.msdatasets import MsDataset
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class UniversalMattingTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_unet_universal-matting'
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_dataset(self):
+        input_location = ['data/test/images/universal_matting.jpg']
+
+        dataset = MsDataset.load(input_location, target='image')
+        img_matting = pipeline(Tasks.universal_matting, model=self.model_id)
+        result = img_matting(dataset)
+        cv2.imwrite('result.png', next(result)[OutputKeys.OUTPUT_IMG])
+        print(f'Output written to {osp.abspath("result.png")}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        img_matting = pipeline(Tasks.universal_matting, model=self.model_id)
+
+        result = img_matting('data/test/images/universal_matting.jpg')
+        cv2.imwrite('result.png', result[OutputKeys.OUTPUT_IMG])
+        print(f'Output written to {osp.abspath("result.png")}')
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_video_colorization.py b/tests/pipelines/test_video_colorization.py
new file mode 100644
index 00000000..c35577a4
--- /dev/null
+++ b/tests/pipelines/test_video_colorization.py
@@ -0,0 +1,53 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+import unittest
+
+import cv2
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.cv import VideoColorizationPipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class VideoColorizationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.video_colorization
+        self.model_id = 'damo/cv_unet_video-colorization'
+        self.test_video = 'data/test/videos/video_frame_interpolation_test.mp4'
+
+    def pipeline_inference(self, pipeline: Pipeline, test_video: str):
+        result = pipeline(test_video)[OutputKeys.OUTPUT_VIDEO]
+        if result is not None:
+            print(f'Output video written to {result}')
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        video_colorization = VideoColorizationPipeline(cache_path)
+        self.pipeline_inference(video_colorization, self.test_video)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        video_colorization = pipeline(
+            task=Tasks.video_colorization, model=self.model_id)
+        self.pipeline_inference(video_colorization, self.test_video)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        video_colorization = pipeline(Tasks.video_colorization)
+        self.pipeline_inference(video_colorization, self.test_video)
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_video_deinterlace.py b/tests/pipelines/test_video_deinterlace.py
new file mode 100644
index 00000000..bcb36cc3
--- /dev/null
+++ b/tests/pipelines/test_video_deinterlace.py
@@ -0,0 +1,61 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import sys
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.cv import VideoDeinterlacePipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class VideoDeinterlaceTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.video_deinterlace
+        self.model_id = 'damo/cv_unet_video-deinterlace'
+        self.test_video = 'data/test/videos/video_deinterlace_test.mp4'
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        pipeline = VideoDeinterlacePipeline(cache_path)
+        pipeline.group_key = self.task
+        out_video_path = pipeline(
+            input=self.test_video)[OutputKeys.OUTPUT_VIDEO]
+        print('pipeline: the output video path is {}'.format(out_video_path))
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_from_pretrained(self):
+        cache_path = Model.from_pretrained(self.model_id)
+        pipeline = VideoDeinterlacePipeline(cache_path)
+        pipeline.group_key = self.task
+        out_video_path = pipeline(
+            input=self.test_video)[OutputKeys.OUTPUT_VIDEO]
+        print('pipeline: the output video path is {}'.format(out_video_path))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        pipeline_ins = pipeline(
+            task=Tasks.video_deinterlace, model=self.model_id)
+        out_video_path = pipeline_ins(
+            input=self.test_video)[OutputKeys.OUTPUT_VIDEO]
+        print('pipeline: the output video path is {}'.format(out_video_path))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.video_deinterlace)
+        out_video_path = pipeline_ins(
+            input=self.test_video)[OutputKeys.OUTPUT_VIDEO]
+        print('pipeline: the output video path is {}'.format(out_video_path))
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_video_frame_interpolation.py b/tests/pipelines/test_video_frame_interpolation.py
index 951da2b9..c23aa46a 100644
--- a/tests/pipelines/test_video_frame_interpolation.py
+++ b/tests/pipelines/test_video_frame_interpolation.py
@@ -17,6 +17,7 @@ class VideoFrameInterpolationTest(unittest.TestCase, DemoCompatibilityCheck):
     def setUp(self) -> None:
         self.task = Tasks.video_frame_interpolation
         self.model_id = 'damo/cv_raft_video-frame-interpolation'
+        self.model_practical_id = 'damo/cv_raft_video-frame-interpolation_practical'
         self.test_video = 'data/test/videos/video_frame_interpolation_test.mp4'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -28,6 +29,13 @@ class VideoFrameInterpolationTest(unittest.TestCase, DemoCompatibilityCheck):
             input=self.test_video)[OutputKeys.OUTPUT_VIDEO]
         print('pipeline: the output video path is {}'.format(out_video_path))
 
+        cache_path = snapshot_download(self.model_practical_id)
+        pipeline = VideoFrameInterpolationPipeline(cache_path)
+        pipeline.group_key = self.task
+        out_video_path = pipeline(
+            input={'video': self.test_video})[OutputKeys.OUTPUT_VIDEO]
+        print('pipeline: the output video path is {}'.format(out_video_path))
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         pipeline_ins = pipeline(
@@ -36,6 +44,13 @@ class VideoFrameInterpolationTest(unittest.TestCase, DemoCompatibilityCheck):
             input=self.test_video)[OutputKeys.OUTPUT_VIDEO]
         print('pipeline: the output video path is {}'.format(out_video_path))
 
+        pipeline_ins = pipeline(
+            task=Tasks.video_frame_interpolation,
+            model=self.model_practical_id)
+        out_video_path = pipeline_ins(
+            input={'video': self.test_video})[OutputKeys.OUTPUT_VIDEO]
+        print('pipeline: the output video path is {}'.format(out_video_path))
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.video_frame_interpolation)
diff --git a/tests/pipelines/test_video_multi_object_tracking.py b/tests/pipelines/test_video_multi_object_tracking.py
index eb37ffd0..97f1e705 100644
--- a/tests/pipelines/test_video_multi_object_tracking.py
+++ b/tests/pipelines/test_video_multi_object_tracking.py
@@ -20,7 +20,9 @@ class MultiObjectTracking(unittest.TestCase, DemoCompatibilityCheck):
             Tasks.video_multi_object_tracking, model=self.model_id)
         video_path = 'data/test/videos/MOT17-03-partial.mp4'
         result = video_multi_object_tracking(video_path)
-        print('result is : ', result[OutputKeys.BOXES])
+        assert result and (OutputKeys.LABELS in result) and (OutputKeys.BOXES
+                                                             in result)
+        assert len(result[OutputKeys.LABELS]) == len(result[OutputKeys.BOXES])
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
@@ -28,7 +30,9 @@ class MultiObjectTracking(unittest.TestCase, DemoCompatibilityCheck):
             Tasks.video_multi_object_tracking)
         video_path = 'data/test/videos/MOT17-03-partial.mp4'
         result = video_multi_object_tracking(video_path)
-        print('result is : ', result[OutputKeys.BOXES])
+        assert result and (OutputKeys.LABELS in result) and (OutputKeys.BOXES
+                                                             in result)
+        assert len(result[OutputKeys.LABELS]) == len(result[OutputKeys.BOXES])
 
     @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):
diff --git a/tests/pipelines/test_video_panoptic_segmentation.py b/tests/pipelines/test_video_panoptic_segmentation.py
new file mode 100644
index 00000000..ad038135
--- /dev/null
+++ b/tests/pipelines/test_video_panoptic_segmentation.py
@@ -0,0 +1,41 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class VideoPanopticSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.video_panoptic_segmentation
+        self.model_id = 'damo/cv_swinb_video-panoptic-segmentation_vipseg'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        video_path = 'data/test/videos/kitti-step_testing_image_02_0000.mp4'
+        seg_pipeline = pipeline(
+            Tasks.video_panoptic_segmentation,
+            model=self.model_id,
+            max_video_frames=20)
+        result = seg_pipeline(video_path)
+
+        print(f'video summarization output: \n{result}.')
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_modelhub_default_model(self):
+        video_path = 'data/test/videos/kitti-step_testing_image_02_0000.mp4'
+        seg_pipeline = pipeline(Tasks.video_summarization, max_video_frames=20)
+        result = seg_pipeline(video_path)
+
+        print(f'video summarization output:\n {result}.')
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_vision_efficient_tuning_adapter.py b/tests/pipelines/test_vision_efficient_tuning_adapter.py
new file mode 100644
index 00000000..4a06a40a
--- /dev/null
+++ b/tests/pipelines/test_vision_efficient_tuning_adapter.py
@@ -0,0 +1,37 @@
+# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import unittest
+
+from modelscope.models import Model
+from modelscope.models.cv.vision_efficient_tuning.vision_efficient_tuning import \
+    VisionEfficientTuningModel
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class VisionEfficientTuningAdapterTest(unittest.TestCase,
+                                       DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.vision_efficient_tuning
+        self.model_id = 'damo/cv_vitb16_classification_vision-efficient-tuning-adapter'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_pipeline(self):
+
+        petl_pipeline = pipeline(self.task, self.model_id)
+        result = petl_pipeline(
+            'data/test/images/vision_efficient_tuning_test_1.png')
+
+        print(f'Vision-efficient-tuning-adapter output: {result}.')
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_load_model_from_pretrained(self):
+        model = Model.from_pretrained(
+            'damo/cv_vitb16_classification_vision-efficient-tuning-adapter')
+        self.assertTrue(model.__class__ == VisionEfficientTuningModel)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_vision_efficient_tuning_lora.py b/tests/pipelines/test_vision_efficient_tuning_lora.py
new file mode 100644
index 00000000..6c49453a
--- /dev/null
+++ b/tests/pipelines/test_vision_efficient_tuning_lora.py
@@ -0,0 +1,36 @@
+# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import unittest
+
+from modelscope.models import Model
+from modelscope.models.cv.vision_efficient_tuning.vision_efficient_tuning import \
+    VisionEfficientTuningModel
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class VisionEfficientTuningLoRATest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.vision_efficient_tuning
+        self.model_id = 'damo/cv_vitb16_classification_vision-efficient-tuning-lora'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_pipeline(self):
+
+        petl_pipeline = pipeline(self.task, self.model_id)
+        result = petl_pipeline(
+            'data/test/images/vision_efficient_tuning_test_1.png')
+
+        print(f'Vision-efficient-tuning-lora output: {result}.')
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_load_model_from_pretrained(self):
+        model = Model.from_pretrained(
+            'damo/cv_vitb16_classification_vision-efficient-tuning-lora')
+        self.assertTrue(model.__class__ == VisionEfficientTuningModel)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_vision_efficient_tuning_prefix.py b/tests/pipelines/test_vision_efficient_tuning_prefix.py
new file mode 100644
index 00000000..0eca5819
--- /dev/null
+++ b/tests/pipelines/test_vision_efficient_tuning_prefix.py
@@ -0,0 +1,37 @@
+# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import unittest
+
+from modelscope.models import Model
+from modelscope.models.cv.vision_efficient_tuning.vision_efficient_tuning import \
+    VisionEfficientTuningModel
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class VisionEfficientTuningPrefixTest(unittest.TestCase,
+                                      DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.vision_efficient_tuning
+        self.model_id = 'damo/cv_vitb16_classification_vision-efficient-tuning-prefix'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_pipeline(self):
+
+        petl_pipeline = pipeline(self.task, self.model_id)
+        result = petl_pipeline(
+            'data/test/images/vision_efficient_tuning_test_1.png')
+
+        print(f'Vision-efficient-tuning-prefix output: {result}.')
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_load_model_from_pretrained(self):
+        model = Model.from_pretrained(
+            'damo/cv_vitb16_classification_vision-efficient-tuning-prefix')
+        self.assertTrue(model.__class__ == VisionEfficientTuningModel)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_vision_efficient_tuning_prompt.py b/tests/pipelines/test_vision_efficient_tuning_prompt.py
new file mode 100644
index 00000000..97d97811
--- /dev/null
+++ b/tests/pipelines/test_vision_efficient_tuning_prompt.py
@@ -0,0 +1,37 @@
+# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+import unittest
+
+from modelscope.models import Model
+from modelscope.models.cv.vision_efficient_tuning.vision_efficient_tuning import \
+    VisionEfficientTuningModel
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class VisionEfficientTuningPromptTest(unittest.TestCase,
+                                      DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = Tasks.vision_efficient_tuning
+        self.model_id = 'damo/cv_vitb16_classification_vision-efficient-tuning-prompt'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_pipeline(self):
+
+        petl_pipeline = pipeline(self.task, self.model_id)
+        result = petl_pipeline(
+            'data/test/images/vision_efficient_tuning_test_1.png')
+
+        print(f'Vision-efficient-tuning-prompt output: {result}.')
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_load_model_from_pretrained(self):
+        model = Model.from_pretrained(
+            'damo/cv_vitb16_classification_vision-efficient-tuning-prompt')
+        self.assertTrue(model.__class__ == VisionEfficientTuningModel)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_word_segmentation.py b/tests/pipelines/test_word_segmentation.py
index 471df01b..f8bdaef7 100644
--- a/tests/pipelines/test_word_segmentation.py
+++ b/tests/pipelines/test_word_segmentation.py
@@ -3,7 +3,7 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp import (LSTMCRFForWordSegmentation,
+from modelscope.models.nlp import (LSTMForTokenClassificationWithCRF,
                                    SbertForTokenClassification)
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import WordSegmentationPipeline
@@ -57,7 +57,7 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
     def test_run_lstmcrf_news_by_direct_model_download(self):
         cache_path = snapshot_download(self.lstmcrf_news_model_id)
         tokenizer = TokenClassificationTransformersPreprocessor(cache_path)
-        model = LSTMCRFForWordSegmentation(cache_path, tokenizer=tokenizer)
+        model = LSTMForTokenClassificationWithCRF.from_pretrained(cache_path)
         pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer)
         pipeline2 = pipeline(
             Tasks.word_segmentation, model=model, preprocessor=tokenizer)
@@ -69,7 +69,7 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
     def test_run_lstmcrf_ecom_by_direct_model_download(self):
         cache_path = snapshot_download(self.lstmcrf_ecom_model_id)
         tokenizer = TokenClassificationTransformersPreprocessor(cache_path)
-        model = LSTMCRFForWordSegmentation(cache_path, tokenizer=tokenizer)
+        model = LSTMForTokenClassificationWithCRF.from_pretrained(cache_path)
         pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer)
         pipeline2 = pipeline(
             Tasks.word_segmentation, model=model, preprocessor=tokenizer)
diff --git a/tests/preprocessors/test_nlp.py b/tests/preprocessors/test_nlp.py
index d63660c8..86e127c1 100644
--- a/tests/preprocessors/test_nlp.py
+++ b/tests/preprocessors/test_nlp.py
@@ -1,5 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path
+import shutil
+import tempfile
 import unittest
 
 from modelscope.preprocessors import Preprocessor, build_preprocessor, nlp
@@ -11,6 +13,16 @@ logger = get_logger()
 
 class NLPPreprocessorTest(unittest.TestCase):
 
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir)
+        super().tearDown()
+
     def test_tokenize(self):
         cfg = dict(type='Tokenize', tokenizer_name='bert-base-cased')
         preprocessor = build_preprocessor(cfg, Fields.nlp)
@@ -32,6 +44,14 @@ class NLPPreprocessorTest(unittest.TestCase):
             output['attention_mask'],
             [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
 
+    def test_save_pretrained(self):
+        preprocessor = Preprocessor.from_pretrained(
+            'damo/nlp_structbert_sentence-similarity_chinese-tiny')
+        save_path = os.path.join(self.tmp_dir, 'test_save_pretrained')
+        preprocessor.save_pretrained(save_path)
+        self.assertTrue(
+            os.path.isfile(os.path.join(save_path, 'configuration.json')))
+
     def test_preprocessor_download(self):
         from modelscope.preprocessors.nlp.token_classification_preprocessor import TokenClassificationPreprocessorBase
         preprocessor: TokenClassificationPreprocessorBase = \
diff --git a/tests/run.py b/tests/run.py
index 6daba6dc..ea78fdd0 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -3,6 +3,7 @@
 
 import argparse
 import datetime
+import importlib
 import math
 import multiprocessing
 import os
@@ -362,10 +363,49 @@ def run_non_parallelizable_test_suites(suites, result_dir):
     run_command_with_popen(cmd)
 
 
+# Selected cases:
+def get_selected_cases():
+    cmd = ['python', '-u', 'tests/run_analysis.py']
+    selected_cases = []
+    with subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            bufsize=1,
+            encoding='utf8') as sub_process:
+        for line in iter(sub_process.stdout.readline, ''):
+            sys.stdout.write(line)
+            if line.startswith('Selected cases:'):
+                line = line.replace('Selected cases:', '').strip()
+                selected_cases = line.split(',')
+        sub_process.wait()
+        if sub_process.returncode != 0:
+            msg = 'Run analysis exception, returncode: %s!' % sub_process.returncode
+            logger.error(msg)
+            raise Exception(msg)
+    return selected_cases
+
+
 def run_in_subprocess(args):
     # only case args.isolated_cases run in subporcess, all other run in a subprocess
-    test_suite_files = gather_test_suites_files(
-        os.path.abspath(args.test_dir), args.pattern)
+    if not args.no_diff:  # run based on git diff
+        try:
+            test_suite_files = get_selected_cases()
+            logger.info('Tests suite to run: ')
+            for f in test_suite_files:
+                logger.info(f)
+        except Exception:
+            logger.error(
+                'Get test suite based diff exception!, will run all cases.')
+            test_suite_files = gather_test_suites_files(
+                os.path.abspath(args.test_dir), args.pattern)
+        if len(test_suite_files) == 0:
+            logger.error('Get no test suite based on diff, run all the cases.')
+            test_suite_files = gather_test_suites_files(
+                os.path.abspath(args.test_dir), args.pattern)
+    else:
+        test_suite_files = gather_test_suites_files(
+            os.path.abspath(args.test_dir), args.pattern)
 
     non_parallelizable_suites = [
         'test_download_dataset.py',
@@ -579,11 +619,18 @@ if __name__ == '__main__':
         type=int,
         help='Set case parallels, default single process, set with gpu number.'
     )
+    parser.add_argument(
+        '--no-diff',
+        action='store_true',
+        help=
+        'Default running case based on git diff(with master), disable with --no-diff)'
+    )
     parser.add_argument(
         '--suites',
         nargs='*',
         help='Run specified test suites(test suite files list split by space)')
     args = parser.parse_args()
+    print(args)
     set_test_level(args.level)
     os.environ['REGRESSION_BASELINE'] = '1'
     logger.info(f'TEST LEVEL: {test_level()}')
diff --git a/tests/run_analysis.py b/tests/run_analysis.py
new file mode 100644
index 00000000..ff2f2c4a
--- /dev/null
+++ b/tests/run_analysis.py
@@ -0,0 +1,364 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+import subprocess
+import sys
+from fnmatch import fnmatch
+
+from trainers.model_trainer_map import model_trainer_map
+from utils.case_file_analyzer import get_pipelines_trainers_test_info
+from utils.source_file_analyzer import (get_all_register_modules,
+                                        get_file_register_modules,
+                                        get_import_map)
+
+from modelscope.hub.api import HubApi
+from modelscope.hub.errors import NotExistError
+from modelscope.hub.file_download import model_file_download
+from modelscope.hub.utils.utils import get_cache_dir
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def get_models_info(groups: list) -> dict:
+    models = []
+    api = HubApi()
+    for group in groups:
+        page = 1
+        while True:
+            query_result = api.list_models(group, page, 100)
+            models.extend(query_result['Models'])
+            if len(models) >= query_result['TotalCount']:
+                break
+            page += 1
+    cache_root = get_cache_dir()
+    models_info = {}  # key model id, value model info
+    for model_info in models:
+        model_id = '%s/%s' % (group, model_info['Name'])
+        configuration_file = os.path.join(cache_root, model_id,
+                                          ModelFile.CONFIGURATION)
+        if not os.path.exists(configuration_file):
+            try:
+                model_revisions = api.list_model_revisions(model_id=model_id)
+                if len(model_revisions) == 0:
+                    print('Model: %s has no revision' % model_id)
+                    continue
+                # get latest revision
+                configuration_file = model_file_download(
+                    model_id=model_id,
+                    file_path=ModelFile.CONFIGURATION,
+                    revision=model_revisions[0])
+            except Exception as e:
+                print('Download model: %s configuration file exception'
+                      % model_id)
+                print('Exception: %s' % e)
+                continue
+        try:
+            cfg = Config.from_file(configuration_file)
+        except Exception as e:
+            print('Resolve model: %s configuration file failed!' % model_id)
+            print(('Exception: %s' % e))
+
+        model_info = {}
+        model_info['framework'] = cfg.safe_get('framework')
+        model_info['task'] = cfg.safe_get('task')
+        model_info['model_type'] = cfg.safe_get('model.type')
+        model_info['pipeline_type'] = cfg.safe_get('pipeline.type')
+        model_info['preprocessor_type'] = cfg.safe_get('preprocessor.type')
+        train_hooks_type = []
+        train_hooks = cfg.safe_get('train.hooks')
+        if train_hooks is not None:
+            for train_hook in train_hooks:
+                train_hooks_type.append(train_hook.type)
+        model_info['train_hooks_type'] = train_hooks_type
+        model_info['datasets'] = cfg.safe_get('dataset')
+
+        model_info['evaluation_metics'] = cfg.safe_get('evaluation.metrics',
+                                                       [])  # metrics name list
+        """
+        print('framework: %s, task: %s, model_type: %s, pipeline_type: %s, \
+            preprocessor_type: %s, train_hooks_type: %s,  \
+            dataset: %s, evaluation_metics: %s'%(
+                framework, task, model_type, pipeline_type,
+                preprocessor_type, ','.join(train_hooks_type),
+                datasets, evaluation_metics))
+        """
+        models_info[model_id] = model_info
+    return models_info
+
+
+def gather_test_suites_files(test_dir='./tests',
+                             pattern='test_*.py',
+                             is_full_path=True):
+    case_file_list = []
+    for dirpath, dirnames, filenames in os.walk(test_dir):
+        for file in filenames:
+            if fnmatch(file, pattern):
+                if is_full_path:
+                    case_file_list.append(os.path.join(dirpath, file))
+                else:
+                    case_file_list.append(file)
+
+    return case_file_list
+
+
+def run_command_get_output(cmd):
+    response = subprocess.run(
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    try:
+        response.check_returncode()
+        output = response.stdout.decode('utf8')
+        return output
+    except subprocess.CalledProcessError as error:
+        print('stdout: %s, stderr: %s' %
+              (response.stdout.decode('utf8'), error.stderr.decode('utf8')))
+        return None
+
+
+def get_current_branch():
+    cmd = ['git', 'rev-parse', '--abbrev-ref', 'HEAD']
+    branch = run_command_get_output(cmd).strip()
+    logger.info('Testing branch: %s' % branch)
+    return branch
+
+
+def get_modified_files():
+    cmd = ['git', 'diff', '--name-only', 'origin/master...']
+    cmd_output = run_command_get_output(cmd)
+    logger.info('Modified files: ')
+    logger.info(cmd_output)
+    return cmd_output.splitlines()
+
+
+def analysis_diff():
+    """Get modified files and their imported files modified modules
+    """
+    # ignore diff for constant define files, these files import by all pipeline, trainer
+    ignore_files = [
+        'modelscope/utils/constant.py', 'modelscope/metainfo.py',
+        'modelscope/pipeline_inputs.py', 'modelscope/outputs/outputs.py'
+    ]
+
+    modified_register_modules = []
+    modified_cases = []
+    modified_files_imported_by = []
+    modified_files = get_modified_files()
+    logger.info('Modified files:\n %s' % '\n'.join(modified_files))
+
+    logger.info('Starting get import map')
+    import_map = get_import_map()
+    logger.info('Finished get import map')
+    for modified_file in modified_files:
+        if ((modified_file.startswith('./modelscope')
+             or modified_file.startswith('modelscope'))
+                and modified_file not in ignore_files):  # is source file
+            for k, v in import_map.items():
+                if modified_file in v and modified_file != k:
+                    modified_files_imported_by.append(k)
+    logger.info('There are affected files: %s'
+                % len(modified_files_imported_by))
+    for f in modified_files_imported_by:
+        logger.info(f)
+    modified_files.extend(modified_files_imported_by)  # add imported by file
+    for modified_file in modified_files:
+        if modified_file.startswith('./modelscope') or \
+           modified_file.startswith('modelscope'):
+            modified_register_modules.extend(
+                get_file_register_modules(modified_file))
+        elif ((modified_file.startswith('./tests')
+               or modified_file.startswith('tests'))
+              and os.path.basename(modified_file).startswith('test_')):
+            modified_cases.append(modified_file)
+
+    return modified_register_modules, modified_cases
+
+
+def split_test_suites():
+    test_suite_full_paths = gather_test_suites_files()
+    pipeline_test_suites = []
+    trainer_test_suites = []
+    other_test_suites = []
+    for test_suite in test_suite_full_paths:
+        if test_suite.find('tests/trainers') != -1:
+            trainer_test_suites.append(test_suite)
+        elif test_suite.find('tests/pipelines') != -1:
+            pipeline_test_suites.append(test_suite)
+        else:
+            other_test_suites.append(test_suite)
+
+    return pipeline_test_suites, trainer_test_suites, other_test_suites
+
+
+def get_test_suites_to_run():
+    branch = get_current_branch()
+    if branch == 'master':
+        # when run with master, run all the cases
+        return gather_test_suites_files(is_full_path=False)
+    affected_register_modules, modified_cases = analysis_diff()
+    # affected_register_modules list of modified file and dependent file's register_module.
+    # ("MODULES|PIPELINES|TRAINERS|...", '', '', model_class_name)
+    # modified_cases, modified case file.
+    all_register_modules = get_all_register_modules()
+    _, _, other_test_suites = split_test_suites()
+    task_pipeline_test_suite_map, trainer_test_suite_map = get_pipelines_trainers_test_info(
+        all_register_modules)
+    # task_pipeline_test_suite_map key: pipeline task, value: case file path
+    # trainer_test_suite_map key: trainer_name, value: case file path
+    models_info = get_models_info(['damo'])
+    # model_info key: model_id, value: model info such as framework task etc.
+    affected_pipeline_cases = []
+    affected_trainer_cases = []
+    for affected_register_module in affected_register_modules:
+        # affected_register_module PIPELINE structure
+        # ["PIPELINES", "acoustic_noise_suppression", "speech_frcrn_ans_cirm_16k", "ANSPipeline"]
+        # ["PIPELINES", task, pipeline_name, pipeline_class_name]
+        if affected_register_module[0] == 'PIPELINES':
+            if affected_register_module[1] in task_pipeline_test_suite_map:
+                affected_pipeline_cases.extend(
+                    task_pipeline_test_suite_map[affected_register_module[1]])
+            else:
+                logger.warn('Pipeline task: %s has no test case!'
+                            % affected_register_module[1])
+        elif affected_register_module[0] == 'MODELS':
+            # ["MODELS", "keyword_spotting", "kws_kwsbp", "GenericKeyWordSpotting"],
+            # ["MODELS", task, model_name, model_class_name]
+            if affected_register_module[1] in task_pipeline_test_suite_map:
+                affected_pipeline_cases.extend(
+                    task_pipeline_test_suite_map[affected_register_module[1]])
+            else:
+                logger.warn('Pipeline task: %s has no test case!'
+                            % affected_register_module[1])
+        elif affected_register_module[0] == 'TRAINERS':
+            # ["TRAINERS", "", "nlp_base_trainer", "NlpEpochBasedTrainer"],
+            # ["TRAINERS", "", trainer_name, trainer_class_name]
+            if affected_register_module[2] in trainer_test_suite_map:
+                affected_trainer_cases.extend(
+                    trainer_test_suite_map[affected_register_module[2]])
+            else:
+                logger.warn('Trainer %s his no case' %
+                            (affected_register_module[2]))
+        elif affected_register_module[0] == 'PREPROCESSORS':
+            # ["PREPROCESSORS", "cv", "object_detection_scrfd", "SCRFDPreprocessor"]
+            # ["PREPROCESSORS", domain, preprocessor_name, class_name]
+            for model_id, model_info in models_info.items():
+                if model_info['preprocessor_type'] is not None and model_info[
+                        'preprocessor_type'] == affected_register_module[2]:
+                    task = model_info['task']
+                    if task in task_pipeline_test_suite_map:
+                        affected_pipeline_cases.extend(
+                            task_pipeline_test_suite_map[task])
+                    if model_id in model_trainer_map:
+                        affected_trainer_cases.extend(
+                            model_trainer_map[model_id])
+        elif (affected_register_module[0] == 'HOOKS'
+              or affected_register_module[0] == 'TASK_DATASETS'):
+            # ["HOOKS", "", "CheckpointHook", "CheckpointHook"]
+            # ["HOOKS", "", hook_name, class_name]
+            # HOOKS, DATASETS modify run all trainer cases
+            for _, cases in trainer_test_suite_map.items():
+                affected_trainer_cases.extend(cases)
+        elif affected_register_module[0] == 'METRICS':
+            # ["METRICS", "default_group", "accuracy", "AccuracyMetric"]
+            # ["METRICS", group, metric_name, class_name]
+            for model_id, model_info in models_info.items():
+                if affected_register_module[2] in model_info[
+                        'evaluation_metics']:
+                    if model_id in model_trainer_map:
+                        affected_trainer_cases.extend(
+                            model_trainer_map[model_id])
+
+    # deduplication
+    affected_pipeline_cases = list(set(affected_pipeline_cases))
+    affected_trainer_cases = list(set(affected_trainer_cases))
+    test_suites_to_run = []
+    for test_suite in other_test_suites:
+        test_suites_to_run.append(os.path.basename(test_suite))
+    for test_suite in affected_pipeline_cases:
+        test_suites_to_run.append(os.path.basename(test_suite))
+    for test_suite in affected_trainer_cases:
+        test_suites_to_run.append(os.path.basename(test_suite))
+
+    for modified_case in modified_cases:
+        if modified_case not in test_suites_to_run:
+            test_suites_to_run.append(os.path.basename(modified_case))
+    return test_suites_to_run
+
+
+def get_files_related_modules(files):
+    register_modules = []
+    for single_file in files:
+        if single_file.startswith('./modelscope') or \
+           single_file.startswith('modelscope'):
+            register_modules.extend(get_file_register_modules(single_file))
+
+    return register_modules
+
+
+def get_modules_related_cases(register_modules, task_pipeline_test_suite_map,
+                              trainer_test_suite_map):
+    affected_pipeline_cases = []
+    affected_trainer_cases = []
+    for register_module in register_modules:
+        if register_module[0] == 'PIPELINES' or \
+           register_module[0] == 'MODELS':
+            if register_module[1] in task_pipeline_test_suite_map:
+                affected_pipeline_cases.extend(
+                    task_pipeline_test_suite_map[register_module[1]])
+            else:
+                logger.warn('Pipeline task: %s has no test case!'
+                            % register_module[1])
+        elif register_module[0] == 'TRAINERS':
+            if register_module[2] in trainer_test_suite_map:
+                affected_trainer_cases.extend(
+                    trainer_test_suite_map[register_module[2]])
+            else:
+                logger.warn('Trainer %s his no case' % (register_module[2]))
+    return affected_pipeline_cases, affected_trainer_cases
+
+
+def get_all_file_test_info():
+    all_files = [
+        os.path.relpath(os.path.join(dp, f), os.getcwd())
+        for dp, dn, filenames in os.walk(
+            os.path.join(os.getcwd(), 'modelscope')) for f in filenames
+        if os.path.splitext(f)[1] == '.py'
+    ]
+    import_map = get_import_map()
+    all_register_modules = get_all_register_modules()
+    task_pipeline_test_suite_map, trainer_test_suite_map = get_pipelines_trainers_test_info(
+        all_register_modules)
+    reverse_depend_map = {}
+    for f in all_files:
+        depend_by = []
+        for k, v in import_map.items():
+            if f in v and f != k:
+                depend_by.append(k)
+        reverse_depend_map[f] = depend_by
+    # get cases.
+    test_info = {}
+    for f in all_files:
+        file_test_info = {}
+        file_test_info['imports'] = import_map[f]
+        file_test_info['imported_by'] = reverse_depend_map[f]
+        register_modules = get_files_related_modules([f]
+                                                     + reverse_depend_map[f])
+        file_test_info['relate_modules'] = register_modules
+        affected_pipeline_cases, affected_trainer_cases = get_modules_related_cases(
+            register_modules, task_pipeline_test_suite_map,
+            trainer_test_suite_map)
+        file_test_info['pipeline_cases'] = affected_pipeline_cases
+        file_test_info['trainer_cases'] = affected_trainer_cases
+        file_relative_path = os.path.relpath(f, os.getcwd())
+        test_info[file_relative_path] = file_test_info
+
+    with open('./test_relate_info.json', 'w') as f:
+        import json
+        json.dump(test_info, f)
+
+
+if __name__ == '__main__':
+    test_suites_to_run = get_test_suites_to_run()
+    msg = ','.join(test_suites_to_run)
+    print('Selected cases: %s' % msg)
diff --git a/tests/run_config.yaml b/tests/run_config.yaml
index efc216de..ef4d34f5 100644
--- a/tests/run_config.yaml
+++ b/tests/run_config.yaml
@@ -12,6 +12,7 @@ isolated:  # test cases that may require excessive anmount of GPU memory or run
   - test_segmentation_pipeline.py
   - test_movie_scene_segmentation.py
   - test_image_inpainting.py
+  - test_image_paint_by_example.py
   - test_mglm_text_summarization.py
   - test_team_transfer_trainer.py
   - test_image_denoise_trainer.py
@@ -49,8 +50,15 @@ isolated:  # test cases that may require excessive anmount of GPU memory or run
   - test_kws_nearfield_trainer.py
   - test_gpt3_text_generation.py
   - test_ddcolor_image_colorization.py
+  - test_video_colorization.py
   - test_image_defrcn_fewshot_trainer.py
   - test_image_deblur_trainer.py
+  - test_image_quality_assessment_degradation.py
+  - test_image_quality_assessment_mos.py
+  - test_image_restoration.py
+  - test_video_deinterlace.py
+  - test_image_inpainting_sdv2.py
+  - test_bad_image_detecting.py
 
 envs:
   default: # default env, case not in other env will in default, pytorch.
diff --git a/tests/trainers/audio/test_kws_nearfield_trainer.py b/tests/trainers/audio/test_kws_nearfield_trainer.py
index a61f70bf..af434048 100644
--- a/tests/trainers/audio/test_kws_nearfield_trainer.py
+++ b/tests/trainers/audio/test_kws_nearfield_trainer.py
@@ -84,14 +84,16 @@ class TestKwsNearfieldTrainer(unittest.TestCase):
         kwargs = dict(
             model=self.model_id,
             work_dir=self.tmp_dir,
-            cfg_file=self.config_file,
-            train_data=self.train_scp,
-            cv_data=self.cv_scp,
-            trans_data=self.trans_file)
+            cfg_file=self.config_file)
 
         trainer = build_trainer(
             Trainers.speech_kws_fsmn_char_ctc_nearfield, default_args=kwargs)
-        trainer.train()
+
+        kwargs = dict(
+            train_data=self.train_scp,
+            cv_data=self.cv_scp,
+            trans_data=self.trans_file)
+        trainer.train(**kwargs)
 
         rank, _ = get_dist_info()
         if rank == 0:
diff --git a/tests/trainers/easycv/test_easycv_trainer_detection_dino.py b/tests/trainers/easycv/test_easycv_trainer_detection_dino.py
new file mode 100644
index 00000000..90d1f691
--- /dev/null
+++ b/tests/trainers/easycv/test_easycv_trainer_detection_dino.py
@@ -0,0 +1,69 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
+import os
+import shutil
+import tempfile
+import unittest
+
+import torch
+
+from modelscope.metainfo import Trainers
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers import build_trainer
+from modelscope.utils.constant import LogKeys
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+
+
+@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest')
+class EasyCVTrainerTestDetectionDino(unittest.TestCase):
+    model_id = 'damo/cv_swinl_image-object-detection_dino'
+
+    def setUp(self):
+        self.logger = get_logger()
+        self.logger.info(('Testing %s.%s' %
+                          (type(self).__name__, self._testMethodName)))
+
+    def _train(self, tmp_dir):
+        cfg_options = {'train.max_epochs': 1}
+
+        trainer_name = Trainers.easycv
+
+        train_dataset = MsDataset.load(
+            dataset_name='small_coco_for_test',
+            namespace='EasyCV',
+            split='train')
+        eval_dataset = MsDataset.load(
+            dataset_name='small_coco_for_test',
+            namespace='EasyCV',
+            split='validation')
+
+        kwargs = dict(
+            model=self.model_id,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            work_dir=tmp_dir,
+            cfg_options=cfg_options)
+
+        trainer = build_trainer(trainer_name, kwargs)
+        trainer.train()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer_single_gpu(self):
+        temp_file_dir = tempfile.TemporaryDirectory()
+        tmp_dir = temp_file_dir.name
+        if not os.path.exists(tmp_dir):
+            os.makedirs(tmp_dir)
+
+        self._train(tmp_dir)
+
+        results_files = os.listdir(tmp_dir)
+        json_files = glob.glob(os.path.join(tmp_dir, '*.log.json'))
+        self.assertEqual(len(json_files), 1)
+        self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
+
+        temp_file_dir.cleanup()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/easycv/test_easycv_trainer_realtime_object_detection.py b/tests/trainers/easycv/test_easycv_trainer_realtime_object_detection.py
new file mode 100644
index 00000000..1171eed4
--- /dev/null
+++ b/tests/trainers/easycv/test_easycv_trainer_realtime_object_detection.py
@@ -0,0 +1,99 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
+import os
+import shutil
+import tempfile
+import unittest
+
+import torch
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.metainfo import Trainers
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers import build_trainer
+from modelscope.utils.constant import DownloadMode, LogKeys, Tasks
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+
+
+@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest')
+class EasyCVTrainerTestRealtimeObjectDetection(unittest.TestCase):
+    model_id = 'damo/cv_cspnet_image-object-detection_yolox'
+
+    def setUp(self):
+        self.logger = get_logger()
+        self.logger.info(('Testing %s.%s' %
+                          (type(self).__name__, self._testMethodName)))
+
+    def _train(self, tmp_dir):
+        # cfg_options = {'train.max_epochs': 2}
+        self.cache_path = snapshot_download(self.model_id)
+        cfg_options = {
+            'train.max_epochs':
+            2,
+            'train.dataloader.batch_size_per_gpu':
+            4,
+            'evaluation.dataloader.batch_size_per_gpu':
+            2,
+            'train.hooks': [
+                {
+                    'type': 'CheckpointHook',
+                    'interval': 1
+                },
+                {
+                    'type': 'EvaluationHook',
+                    'interval': 1
+                },
+                {
+                    'type': 'TextLoggerHook',
+                    'ignore_rounding_keys': None,
+                    'interval': 2
+                },
+            ],
+            'load_from':
+            os.path.join(self.cache_path, 'pytorch_model.bin')
+        }
+
+        trainer_name = Trainers.easycv
+
+        train_dataset = MsDataset.load(
+            dataset_name='small_coco_for_test',
+            namespace='EasyCV',
+            split='train')
+        eval_dataset = MsDataset.load(
+            dataset_name='small_coco_for_test',
+            namespace='EasyCV',
+            split='validation')
+
+        kwargs = dict(
+            model=self.model_id,
+            # model_revision='v1.0.2',
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            work_dir=tmp_dir,
+            cfg_options=cfg_options)
+
+        trainer = build_trainer(trainer_name, kwargs)
+        trainer.train()
+
+    @unittest.skipUnless(
+        test_level() >= 0,
+        'skip since face_2d_keypoints_dataset is set to private for now')
+    def test_trainer_single_gpu(self):
+        temp_file_dir = tempfile.TemporaryDirectory()
+        tmp_dir = temp_file_dir.name
+        if not os.path.exists(tmp_dir):
+            os.makedirs(tmp_dir)
+
+        self._train(tmp_dir)
+
+        results_files = os.listdir(tmp_dir)
+        json_files = glob.glob(os.path.join(tmp_dir, '*.log.json'))
+        self.assertEqual(len(json_files), 1)
+        self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
+
+        temp_file_dir.cleanup()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/hooks/logger/test_tensorboard_hook.py b/tests/trainers/hooks/logger/test_tensorboard_hook.py
index 67b1aa63..d705f609 100644
--- a/tests/trainers/hooks/logger/test_tensorboard_hook.py
+++ b/tests/trainers/hooks/logger/test_tensorboard_hook.py
@@ -11,7 +11,7 @@ import torch
 from torch import nn
 
 from modelscope.metainfo import Trainers
-from modelscope.models.base import Model
+from modelscope.models.base import TorchModel
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, ModelFile
 from modelscope.utils.test_utils import create_dummy_test_dataset
@@ -20,7 +20,7 @@ dummy_dataset = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20)
 
 
-class DummyModel(nn.Module, Model):
+class DummyModel(TorchModel):
 
     def __init__(self):
         super().__init__()
diff --git a/tests/trainers/hooks/test_checkpoint_hook.py b/tests/trainers/hooks/test_checkpoint_hook.py
index e7f2d33c..34aea343 100644
--- a/tests/trainers/hooks/test_checkpoint_hook.py
+++ b/tests/trainers/hooks/test_checkpoint_hook.py
@@ -11,7 +11,7 @@ from torch import nn
 
 from modelscope.metainfo import Trainers
 from modelscope.metrics.builder import METRICS, MetricKeys
-from modelscope.models.base import Model
+from modelscope.models.base import TorchModel
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, ModelFile
 from modelscope.utils.registry import default_group
@@ -42,7 +42,7 @@ dummy_dataset = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20)
 
 
-class DummyModel(nn.Module, Model):
+class DummyModel(TorchModel):
 
     def __init__(self):
         super().__init__()
diff --git a/tests/trainers/hooks/test_evaluation_hook.py b/tests/trainers/hooks/test_evaluation_hook.py
index 2c71e790..320bf892 100644
--- a/tests/trainers/hooks/test_evaluation_hook.py
+++ b/tests/trainers/hooks/test_evaluation_hook.py
@@ -11,7 +11,7 @@ from torch import nn
 
 from modelscope.metainfo import Trainers
 from modelscope.metrics.builder import METRICS, MetricKeys
-from modelscope.models.base import Model
+from modelscope.models.base import TorchModel
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import ModelFile
 from modelscope.utils.registry import default_group
@@ -35,7 +35,7 @@ dummy_dataset = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20)
 
 
-class DummyModel(nn.Module, Model):
+class DummyModel(TorchModel):
 
     def __init__(self):
         super().__init__()
diff --git a/tests/trainers/hooks/test_lr_scheduler_hook.py b/tests/trainers/hooks/test_lr_scheduler_hook.py
index 7a1ff220..f4ad91c9 100644
--- a/tests/trainers/hooks/test_lr_scheduler_hook.py
+++ b/tests/trainers/hooks/test_lr_scheduler_hook.py
@@ -13,7 +13,7 @@ from torch.optim.lr_scheduler import MultiStepLR
 
 from modelscope.metainfo import Trainers
 from modelscope.metrics.builder import METRICS, MetricKeys
-from modelscope.models.base import Model
+from modelscope.models.base import TorchModel
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import LogKeys, ModelFile, TrainerStages
 from modelscope.utils.registry import default_group
@@ -41,7 +41,7 @@ def create_dummy_metric():
             return {MetricKeys.ACCURACY: self._fake_acc_by_epoch[_global_iter]}
 
 
-class DummyModel(nn.Module, Model):
+class DummyModel(TorchModel):
 
     def __init__(self):
         super().__init__()
diff --git a/tests/trainers/model_trainer_map.py b/tests/trainers/model_trainer_map.py
new file mode 100644
index 00000000..4057c331
--- /dev/null
+++ b/tests/trainers/model_trainer_map.py
@@ -0,0 +1,145 @@
+model_trainer_map = {
+    'damo/speech_frcrn_ans_cirm_16k':
+    ['tests/trainers/audio/test_ans_trainer.py'],
+    'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch':
+    ['tests/trainers/audio/test_asr_trainer.py'],
+    'damo/speech_dfsmn_kws_char_farfield_16k_nihaomiya':
+    ['tests/trainers/audio/test_kws_farfield_trainer.py'],
+    'damo/speech_charctc_kws_phone-xiaoyun':
+    ['tests/trainers/audio/test_kws_nearfield_trainer.py'],
+    'damo/speech_mossformer_separation_temporal_8k':
+    ['tests/trainers/audio/test_separation_trainer.py'],
+    'speech_tts/speech_sambert-hifigan_tts_zh-cn_multisp_pretrain_16k':
+    ['tests/trainers/audio/test_tts_trainer.py'],
+    'damo/cv_mobilenet_face-2d-keypoints_alignment':
+    ['tests/trainers/easycv/test_easycv_trainer_face_2d_keypoints.py'],
+    'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody':
+    ['tests/trainers/easycv/test_easycv_trainer_hand_2d_keypoints.py'],
+    'damo/cv_yolox-pai_hand-detection':
+    ['tests/trainers/easycv/test_easycv_trainer_hand_detection.py'],
+    'damo/cv_r50_panoptic-segmentation_cocopan':
+    ['tests/trainers/easycv/test_easycv_trainer_panoptic_mask2former.py'],
+    'damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k':
+    ['tests/trainers/easycv/test_segformer.py'],
+    'damo/cv_resnet_carddetection_scrfd34gkps':
+    ['tests/trainers/test_card_detection_scrfd_trainer.py'],
+    'damo/multi-modal_clip-vit-base-patch16_zh': [
+        'tests/trainers/test_clip_trainer.py'
+    ],
+    'damo/nlp_space_pretrained-dialog-model': [
+        'tests/trainers/test_dialog_intent_trainer.py'
+    ],
+    'damo/cv_resnet_facedetection_scrfd10gkps': [
+        'tests/trainers/test_face_detection_scrfd_trainer.py'
+    ],
+    'damo/nlp_structbert_faq-question-answering_chinese-base': [
+        'tests/trainers/test_finetune_faq_question_answering.py'
+    ],
+    'PAI/nlp_gpt3_text-generation_0.35B_MoE-64': [
+        'tests/trainers/test_finetune_gpt_moe.py'
+    ],
+    'damo/nlp_gpt3_text-generation_1.3B': [
+        'tests/trainers/test_finetune_gpt3.py'
+    ],
+    'damo/mgeo_backbone_chinese_base': [
+        'tests/trainers/test_finetune_mgeo.py'
+    ],
+    'damo/mplug_backbone_base_en': ['tests/trainers/test_finetune_mplug.py'],
+    'damo/nlp_structbert_backbone_base_std': [
+        'tests/trainers/test_finetune_sequence_classification.py',
+        'tests/trainers/test_finetune_token_classification.py'
+    ],
+    'damo/nlp_palm2.0_text-generation_english-base': [
+        'tests/trainers/test_finetune_text_generation.py'
+    ],
+    'damo/nlp_gpt3_text-generation_chinese-base': [
+        'tests/trainers/test_finetune_text_generation.py'
+    ],
+    'damo/nlp_palm2.0_text-generation_chinese-base': [
+        'tests/trainers/test_finetune_text_generation.py'
+    ],
+    'damo/nlp_corom_passage-ranking_english-base': [
+        'tests/trainers/test_finetune_text_ranking.py'
+    ],
+    'damo/nlp_rom_passage-ranking_chinese-base': [
+        'tests/trainers/test_finetune_text_ranking.py'
+    ],
+    'damo/cv_nextvit-small_image-classification_Dailylife-labels': [
+        'tests/trainers/test_general_image_classification_trainer.py'
+    ],
+    'damo/cv_convnext-base_image-classification_garbage': [
+        'tests/trainers/test_general_image_classification_trainer.py'
+    ],
+    'damo/cv_beitv2-base_image-classification_patch16_224_pt1k_ft22k_in1k': [
+        'tests/trainers/test_general_image_classification_trainer.py'
+    ],
+    'damo/cv_csrnet_image-color-enhance-models': [
+        'tests/trainers/test_image_color_enhance_trainer.py'
+    ],
+    'damo/cv_nafnet_image-deblur_gopro': [
+        'tests/trainers/test_image_deblur_trainer.py'
+    ],
+    'damo/cv_resnet101_detection_fewshot-defrcn': [
+        'tests/trainers/test_image_defrcn_fewshot_trainer.py'
+    ],
+    'damo/cv_nafnet_image-denoise_sidd': [
+        'tests/trainers/test_image_denoise_trainer.py'
+    ],
+    'damo/cv_fft_inpainting_lama': [
+        'tests/trainers/test_image_inpainting_trainer.py'
+    ],
+    'damo/cv_swin-b_image-instance-segmentation_coco': [
+        'tests/trainers/test_image_instance_segmentation_trainer.py'
+    ],
+    'damo/cv_gpen_image-portrait-enhancement': [
+        'tests/trainers/test_image_portrait_enhancement_trainer.py'
+    ],
+    'damo/cv_clip-it_video-summarization_language-guided_en': [
+        'tests/trainers/test_language_guided_video_summarization_trainer.py'
+    ],
+    'damo/cv_resnet50-bert_video-scene-segmentation_movienet': [
+        'tests/trainers/test_movie_scene_segmentation_trainer.py'
+    ],
+    'damo/ofa_mmspeech_pretrain_base_zh': [
+        'tests/trainers/test_ofa_mmspeech_trainer.py'
+    ],
+    'damo/ofa_ocr-recognition_scene_base_zh': [
+        'tests/trainers/test_ofa_trainer.py'
+    ],
+    'damo/nlp_plug_text-generation_27B': [
+        'tests/trainers/test_plug_finetune_text_generation.py'
+    ],
+    'damo/cv_swin-t_referring_video-object-segmentation': [
+        'tests/trainers/test_referring_video_object_segmentation_trainer.py'
+    ],
+    'damo/nlp_convai_text2sql_pretrain_cn': [
+        'tests/trainers/test_table_question_answering_trainer.py'
+    ],
+    'damo/multi-modal_team-vit-large-patch14_multi-modal-similarity': [
+        'tests/trainers/test_team_transfer_trainer.py'
+    ],
+    'damo/cv_tinynas_object-detection_damoyolo': [
+        'tests/trainers/test_tinynas_damoyolo_trainer.py'
+    ],
+    'damo/nlp_structbert_sentence-similarity_chinese-tiny': [
+        'tests/trainers/test_trainer_with_nlp.py'
+    ],
+    'damo/nlp_structbert_sentiment-classification_chinese-base': [
+        'tests/trainers/test_trainer_with_nlp.py'
+    ],
+    'damo/nlp_structbert_sentence-similarity_chinese-base': [
+        'tests/trainers/test_trainer_with_nlp.py'
+    ],
+    'damo/nlp_csanmt_translation_en2zh': [
+        'tests/trainers/test_translation_trainer.py'
+    ],
+    'damo/nlp_csanmt_translation_en2fr': [
+        'tests/trainers/test_translation_trainer.py'
+    ],
+    'damo/nlp_csanmt_translation_en2es': [
+        'tests/trainers/test_translation_trainer.py'
+    ],
+    'damo/cv_googlenet_pgl-video-summarization': [
+        'tests/trainers/test_video_summarization_trainer.py'
+    ],
+}
diff --git a/tests/trainers/test_document_grounded_dialog_generate_trainer.py b/tests/trainers/test_document_grounded_dialog_generate_trainer.py
new file mode 100644
index 00000000..a2add9cd
--- /dev/null
+++ b/tests/trainers/test_document_grounded_dialog_generate_trainer.py
@@ -0,0 +1,50 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import unittest
+
+import json
+
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers.nlp.document_grounded_dialog_generate_trainer import \
+    DocumentGroundedDialogGenerateTrainer
+from modelscope.utils.constant import DownloadMode, ModelFile
+from modelscope.utils.test_utils import test_level
+
+
+class DocumentGroundedDialogGenerateTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'DAMO_ConvAI/nlp_convai_generation_pretrain'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer_with_model_name(self):
+        # load data
+        train_dataset = MsDataset.load(
+            'DAMO_ConvAI/FrDoc2BotGeneration',
+            download_mode=DownloadMode.FORCE_REDOWNLOAD)
+        test_len = 1
+        sub_train_dataset = [x for x in train_dataset][:1]
+        sub_train_dataset = [{
+            'query':
+            x['query'][:test_len],
+            'rerank':
+            json.dumps([p[:test_len] for p in json.loads(x['rerank'])]),
+            'response':
+            x['response'][:test_len]
+        } for x in sub_train_dataset]
+
+        trainer = DocumentGroundedDialogGenerateTrainer(
+            model=self.model_id,
+            train_dataset=sub_train_dataset,
+            eval_dataset=sub_train_dataset,
+        )
+        trainer.model.model.config['num_beams'] = 1
+        trainer.model.model.config['target_sequence_length'] = test_len
+        trainer.train(batch_size=1, total_epoches=1, learning_rate=2e-4)
+        trainer.evaluate(
+            checkpoint_path=os.path.join(trainer.model.model_dir,
+                                         'finetuned_model.bin'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/test_document_grounded_dialog_rerank_trainer.py b/tests/trainers/test_document_grounded_dialog_rerank_trainer.py
new file mode 100644
index 00000000..fad0b55e
--- /dev/null
+++ b/tests/trainers/test_document_grounded_dialog_rerank_trainer.py
@@ -0,0 +1,77 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import unittest
+
+import json
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.metainfo import Trainers
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers.nlp.document_grounded_dialog_rerank_trainer import \
+    DocumentGroundedDialogRerankTrainer
+from modelscope.utils.config import Config
+from modelscope.utils.constant import DownloadMode, ModelFile, Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class TestDialogIntentTrainer(unittest.TestCase):
+
+    def setUp(self):
+        self.model_id = 'DAMO_ConvAI/nlp_convai_ranking_pretrain'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer_with_model_and_args(self):
+        args = {
+            'device': 'gpu',
+            'tokenizer_name': '',
+            'cache_dir': '',
+            'instances_size': 1,
+            'output_dir': './model',
+            'max_num_seq_pairs_per_device': 32,
+            'full_train_batch_size': 32,
+            'gradient_accumulation_steps': 32,
+            'per_gpu_train_batch_size': 1,
+            'num_train_epochs': 1,
+            'train_instances': -1,
+            'learning_rate': 3e-5,
+            'max_seq_length': 128,
+            'num_labels': 2,
+            'fold': '',  # IofN
+            'doc_match_weight': 0.0,
+            'query_length': 64,
+            'resume_from': '',  # to resume training from a checkpoint
+            'config_name': '',
+            'do_lower_case': True,
+            'weight_decay': 0.0,  # previous default was 0.01
+            'adam_epsilon': 1e-8,
+            'max_grad_norm': 1.0,
+            'warmup_instances': 0,  # previous default was 0.1 of total
+            'warmup_fraction': 0.0,  # only applies if warmup_instances <= 0
+            'no_cuda': False,
+            'n_gpu': 1,
+            'seed': 42,
+            'fp16': False,
+            'fp16_opt_level': 'O1',  # previous default was O2
+            'per_gpu_eval_batch_size': 8,
+            'log_on_all_nodes': False,
+            'world_size': 1,
+            'global_rank': 0,
+            'local_rank': -1,
+            'tokenizer_resize': True,
+            'model_resize': True
+        }
+        args[
+            'gradient_accumulation_steps'] = args['full_train_batch_size'] // (
+                args['per_gpu_train_batch_size'] * args['world_size'])
+        data = MsDataset.load(
+            'DAMO_ConvAI/FrDoc2BotRerank',
+            download_mode=DownloadMode.FORCE_REDOWNLOAD,
+            split='train')
+        sub_train_dataset = [x for x in data][:10]
+        trainer = DocumentGroundedDialogRerankTrainer(
+            model=self.model_id, dataset=sub_train_dataset, args=args)
+        trainer.train()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/test_document_grounded_dialog_retrieval_trainer.py b/tests/trainers/test_document_grounded_dialog_retrieval_trainer.py
new file mode 100644
index 00000000..604bc300
--- /dev/null
+++ b/tests/trainers/test_document_grounded_dialog_retrieval_trainer.py
@@ -0,0 +1,43 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import unittest
+
+import json
+
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers.nlp.document_grounded_dialog_retrieval_trainer import \
+    DocumentGroundedDialogRetrievalTrainer
+from modelscope.utils.constant import DownloadMode, ModelFile
+from modelscope.utils.test_utils import test_level
+
+
+class DocumentGroundedDialogRetrievalTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'DAMO_ConvAI/nlp_convai_retrieval_pretrain'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer_with_model_name(self):
+        # load data
+        train_dataset = MsDataset.load(
+            'DAMO_ConvAI/FrDoc2BotRetrieval',
+            download_mode=DownloadMode.FORCE_REDOWNLOAD)
+        sub_train_dataset = [x for x in train_dataset][:10]
+        all_passages = ['阑尾炎', '肠胃炎', '肚脐开始', '肚脐为止']
+
+        trainer = DocumentGroundedDialogRetrievalTrainer(
+            model=self.model_id,
+            train_dataset=sub_train_dataset,
+            eval_dataset=sub_train_dataset,
+            all_passages=all_passages)
+        trainer.train(
+            batch_size=64,
+            total_epoches=2,
+        )
+        trainer.evaluate(
+            checkpoint_path=os.path.join(trainer.model.model_dir,
+                                         'finetuned_model.bin'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/test_finetune_faq_question_answering.py b/tests/trainers/test_finetune_faq_question_answering.py
index 01c34b63..54b0a840 100644
--- a/tests/trainers/test_finetune_faq_question_answering.py
+++ b/tests/trainers/test_finetune_faq_question_answering.py
@@ -32,6 +32,7 @@ class TestFinetuneFaqQuestionAnswering(unittest.TestCase):
         }]
     }
     model_id = 'damo/nlp_structbert_faq-question-answering_chinese-base'
+    mgimn_model_id = 'damo/nlp_mgimn_faq-question-answering_chinese-base'
 
     def setUp(self):
         print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
@@ -43,7 +44,7 @@ class TestFinetuneFaqQuestionAnswering(unittest.TestCase):
         shutil.rmtree(self.tmp_dir)
         super().tearDown()
 
-    def build_trainer(self):
+    def build_trainer(self, model_id, revision):
         train_dataset = MsDataset.load(
             'jd', namespace='DAMO_NLP',
             split='train').remap_columns({'sentence': 'text'})
@@ -51,7 +52,7 @@ class TestFinetuneFaqQuestionAnswering(unittest.TestCase):
             'jd', namespace='DAMO_NLP',
             split='validation').remap_columns({'sentence': 'text'})
 
-        cfg: Config = read_config(self.model_id, revision='v1.0.1')
+        cfg: Config = read_config(model_id, revision)
         cfg.train.train_iters_per_epoch = 50
         cfg.evaluation.val_iters_per_epoch = 2
         cfg.train.seed = 1234
@@ -75,7 +76,7 @@ class TestFinetuneFaqQuestionAnswering(unittest.TestCase):
         trainer = build_trainer(
             Trainers.faq_question_answering_trainer,
             default_args=dict(
-                model=self.model_id,
+                model=model_id,
                 work_dir=self.tmp_dir,
                 train_dataset=train_dataset,
                 eval_dataset=eval_dataset,
@@ -84,7 +85,7 @@ class TestFinetuneFaqQuestionAnswering(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_faq_model_finetune(self):
-        trainer = self.build_trainer()
+        trainer = self.build_trainer(self.model_id, 'v1.0.1')
         trainer.train()
         evaluate_result = trainer.evaluate()
         self.assertAlmostEqual(evaluate_result['accuracy'], 0.95, delta=0.1)
@@ -106,6 +107,32 @@ class TestFinetuneFaqQuestionAnswering(unittest.TestCase):
         self.assertAlmostEqual(
             result_after['output'][0][0]['score'], 0.8, delta=0.2)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_faq_mgimn_model_finetune(self):
+        trainer = self.build_trainer(self.mgimn_model_id, 'v1.0.0')
+        trainer.train()
+        evaluate_result = trainer.evaluate()
+        self.assertAlmostEqual(evaluate_result['accuracy'], 0.75, delta=0.1)
+
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(ModelFile.TRAIN_OUTPUT_DIR, results_files)
+
+        output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR)
+        pipeline_ins = pipeline(
+            task=Tasks.faq_question_answering,
+            model=self.mgimn_model_id,
+            model_revision='v1.0.0')
+        result_before = pipeline_ins(self.param)
+        self.assertEqual(result_before['output'][0][0]['label'], '1')
+        self.assertAlmostEqual(
+            result_before['output'][0][0]['score'], 0.9, delta=0.2)
+        pipeline_ins = pipeline(
+            task=Tasks.faq_question_answering, model=output_dir)
+        result_after = pipeline_ins(self.param)
+        self.assertEqual(result_after['output'][0][0]['label'], '1')
+        self.assertAlmostEqual(
+            result_after['output'][0][0]['score'], 0.9, delta=0.2)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/trainers/test_finetune_mgeo.py b/tests/trainers/test_finetune_mgeo.py
index b492497b..bb32e412 100644
--- a/tests/trainers/test_finetune_mgeo.py
+++ b/tests/trainers/test_finetune_mgeo.py
@@ -50,7 +50,7 @@ class TestFinetuneMGeo(unittest.TestCase):
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
 
-    @unittest.skipUnless(test_level() >= 4, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_finetune_geotes_rerank(self):
 
         def cfg_modify_fn(cfg):
@@ -82,8 +82,9 @@ class TestFinetuneMGeo(unittest.TestCase):
             cfg.train.dataloader.batch_size_per_gpu = 3
             cfg.train.dataloader.workers_per_gpu = 16
             cfg.evaluation.dataloader.workers_per_gpu = 16
-
-            cfg['evaluation']['metrics'] = 'mrr@1'
+            cfg.train.train_iters_per_epoch = 10
+            cfg.evaluation.val_iters_per_epoch = 10
+            cfg['evaluation']['metrics'] = 'text-ranking-metric'
             cfg.train.max_epochs = 1
             cfg.model['neg_sample'] = neg_sample
             cfg.model['gis_num'] = 2
@@ -139,14 +140,8 @@ class TestFinetuneMGeo(unittest.TestCase):
             split='validation',
             namespace='damo')
 
-        dataset = MsDataset.load(
-            'json',
-            data_files={
-                'train': [train_dataset['train'] + '/train.json'],
-                'test': [dev_dataset['validation'] + '/dev.json']
-            })
-        train_ds = dataset['train'].to_hf_dataset()
-        dev_ds = dataset['test'].to_hf_dataset()
+        train_ds = train_dataset['train']
+        dev_ds = dev_dataset['validation']
 
         model_id = 'damo/mgeo_backbone_chinese_base'
         self.finetune(
@@ -159,7 +154,7 @@ class TestFinetuneMGeo(unittest.TestCase):
         output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR)
         print(f'model is saved to {output_dir}')
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_finetune_geoeag(self):
 
         def cfg_modify_fn(cfg):
@@ -170,6 +165,8 @@ class TestFinetuneMGeo(unittest.TestCase):
             cfg.evaluation.dataloader.batch_size_per_gpu = 64
             cfg.train.optimizer.lr = 2e-5
             cfg.train.max_epochs = 1
+            cfg.train.train_iters_per_epoch = 10
+            cfg.evaluation.val_iters_per_epoch = 10
 
             cfg['dataset'] = {
                 'train': {
@@ -217,7 +214,7 @@ class TestFinetuneMGeo(unittest.TestCase):
         output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR)
         print(f'model is saved to {output_dir}')
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_finetune_geoeta(self):
 
         def cfg_modify_fn(cfg):
@@ -236,6 +233,8 @@ class TestFinetuneMGeo(unittest.TestCase):
             }
             cfg.train.max_epochs = 1
             cfg.train.dataloader.batch_size_per_gpu = 32
+            cfg.train.train_iters_per_epoch = 10
+            cfg.evaluation.val_iters_per_epoch = 10
             cfg.train.optimizer.lr = 3e-5
             cfg.train.hooks = [{
                 'type': 'CheckpointHook',
diff --git a/tests/trainers/test_finetune_plug_mental.py b/tests/trainers/test_finetune_plug_mental.py
new file mode 100644
index 00000000..1e0af2b4
--- /dev/null
+++ b/tests/trainers/test_finetune_plug_mental.py
@@ -0,0 +1,108 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
+
+import torch
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+
+from modelscope.metainfo import Preprocessors, Trainers
+from modelscope.models import Model
+from modelscope.msdatasets import MsDataset
+from modelscope.pipelines import pipeline
+from modelscope.trainers import build_trainer
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class TestFinetunePlugMental(unittest.TestCase):
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir)
+        super().tearDown()
+
+    def finetune(self,
+                 model_id,
+                 train_dataset,
+                 eval_dataset,
+                 name=Trainers.nlp_base_trainer,
+                 cfg_modify_fn=None,
+                 **kwargs):
+        kwargs = dict(
+            model=model_id,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=cfg_modify_fn,
+            **kwargs)
+
+        os.environ['LOCAL_RANK'] = '0'
+        trainer = build_trainer(name=name, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(self.epoch_num):
+            self.assertIn(f'epoch_{i + 1}.pth', results_files)
+
+        output_files = os.listdir(
+            os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR))
+        self.assertIn(ModelFile.CONFIGURATION, output_files)
+        self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, output_files)
+        copy_src_files = os.listdir(trainer.model_dir)
+
+        print(f'copy_src_files are {copy_src_files}')
+        print(f'output_files are {output_files}')
+        for item in copy_src_files:
+            if not item.startswith('.'):
+                self.assertIn(item, output_files)
+
+    def pipeline_sentence_similarity(self, model_dir):
+        sentence1 = '今天气温比昨天高么？'
+        sentence2 = '今天湿度比昨天高么？'
+        model = Model.from_pretrained(model_dir)
+        pipeline_ins = pipeline(task=Tasks.sentence_similarity, model=model)
+        print(pipeline_ins(input=(sentence1, sentence2)))
+
+    @unittest.skip
+    def test_finetune_afqmc(self):
+        """This unittest is used to reproduce the clue:afqmc dataset + plug meantal model training results.
+
+        User can train a custom dataset by modifying this piece of code and comment the @unittest.skip.
+        """
+
+        def cfg_modify_fn(cfg):
+            cfg.task = Tasks.sentence_similarity
+            cfg['preprocessor'] = {'type': Preprocessors.sen_sim_tokenizer}
+            cfg.train.optimizer.lr = 2e-5
+            cfg['dataset'] = {
+                'train': {
+                    'labels': ['0', '1'],
+                    'first_sequence': 'sentence1',
+                    'second_sequence': 'sentence2',
+                    'label': 'label',
+                }
+            }
+            cfg.train.lr_scheduler.total_iters = int(
+                len(dataset['train']) / 32) * cfg.train.max_epochs
+            return cfg
+
+        dataset = MsDataset.load('clue', subset_name='afqmc')
+        self.finetune(
+            model_id='damo/nlp_plug-mental_backbone_base',
+            train_dataset=dataset['train'],
+            eval_dataset=dataset['validation'],
+            cfg_modify_fn=cfg_modify_fn)
+        output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR)
+        self.pipeline_sentence_similarity(output_dir)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/test_finetune_sentence_embedding.py b/tests/trainers/test_finetune_sentence_embedding.py
new file mode 100644
index 00000000..f0ca80cd
--- /dev/null
+++ b/tests/trainers/test_finetune_sentence_embedding.py
@@ -0,0 +1,187 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
+
+import torch
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+
+from modelscope.metainfo import Trainers
+from modelscope.models import Model
+from modelscope.msdatasets import MsDataset
+from modelscope.pipelines import pipeline
+from modelscope.trainers import build_trainer
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class TestFinetuneSentenceEmbedding(unittest.TestCase):
+    inputs = {
+        'source_sentence': ["how long it take to get a master's degree"],
+        'sentences_to_compare': [
+            "On average, students take about 18 to 24 months to complete a master's degree.",
+            'On the other hand, some students prefer to go at a slower pace and choose to take '
+            'several years to complete their studies.',
+            'It can take anywhere from two semesters'
+        ]
+    }
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir)
+        super().tearDown()
+
+    def finetune(self,
+                 model_id,
+                 train_dataset,
+                 eval_dataset,
+                 name=Trainers.nlp_sentence_embedding_trainer,
+                 cfg_modify_fn=None,
+                 **kwargs):
+        kwargs = dict(
+            model=model_id,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            work_dir=self.tmp_dir,
+            cfg_modify_fn=cfg_modify_fn,
+            **kwargs)
+
+        os.environ['LOCAL_RANK'] = '0'
+        trainer = build_trainer(name=name, default_args=kwargs)
+        trainer.train()
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_finetune_msmarco(self):
+
+        def cfg_modify_fn(cfg):
+            neg_sample = 2
+            cfg.task = 'sentence-embedding'
+            cfg['preprocessor'] = {'type': 'sentence-embedding'}
+            cfg.train.optimizer.lr = 2e-5
+            cfg['dataset'] = {
+                'train': {
+                    'type': 'bert',
+                    'query_sequence': 'query',
+                    'pos_sequence': 'positive_passages',
+                    'neg_sequence': 'negative_passages',
+                    'text_fileds': ['title', 'text'],
+                    'qid_field': 'query_id',
+                    'neg_sample': neg_sample
+                },
+                'val': {
+                    'type': 'bert',
+                    'query_sequence': 'query',
+                    'pos_sequence': 'positive_passages',
+                    'neg_sequence': 'negative_passages',
+                    'text_fileds': ['title', 'text'],
+                    'qid_field': 'query_id'
+                },
+            }
+            cfg['evaluation']['dataloader']['batch_size_per_gpu'] = 30
+            cfg.train.max_epochs = 1
+            cfg.train.train_batch_size = 2
+            cfg.train.lr_scheduler = {
+                'type': 'LinearLR',
+                'start_factor': 1.0,
+                'end_factor': 0.0,
+                'options': {
+                    'by_epoch': False
+                }
+            }
+            cfg.model['neg_sample'] = 4
+            cfg.train.hooks = [{
+                'type': 'CheckpointHook',
+                'interval': 1
+            }, {
+                'type': 'TextLoggerHook',
+                'interval': 1
+            }, {
+                'type': 'IterTimerHook'
+            }]
+            return cfg
+
+        # load dataset
+        ds = MsDataset.load('passage-ranking-demo', 'zyznull')
+        train_ds = ds['train'].to_hf_dataset()
+        dev_ds = ds['dev'].to_hf_dataset()
+
+        model_id = 'damo/nlp_corom_sentence-embedding_english-base'
+        self.finetune(
+            model_id=model_id,
+            train_dataset=train_ds,
+            eval_dataset=dev_ds,
+            cfg_modify_fn=cfg_modify_fn)
+
+        output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR)
+        self.pipeline_sentence_embedding(output_dir)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_finetune_dureader(self):
+
+        def cfg_modify_fn(cfg):
+            cfg.task = 'sentence-embedding'
+            cfg['preprocessor'] = {
+                'type': 'sentence-embedding',
+                'max_length': 384
+            }
+            cfg.train.optimizer.lr = 3e-5
+            cfg['dataset'] = {
+                'train': {
+                    'type': 'bert',
+                    'query_sequence': 'query',
+                    'pos_sequence': 'positive_passages',
+                    'neg_sequence': 'negative_passages',
+                    'text_fileds': ['text'],
+                    'qid_field': 'query_id',
+                    'neg_sample': 4
+                },
+                'val': {
+                    'type': 'bert',
+                    'query_sequence': 'query',
+                    'pos_sequence': 'positive_passages',
+                    'neg_sequence': 'negative_passages',
+                    'text_fileds': ['text'],
+                    'qid_field': 'query_id'
+                },
+            }
+            cfg['evaluation']['dataloader']['batch_size_per_gpu'] = 3
+            cfg.train.max_epochs = 2
+            cfg.train.train_batch_size = 4
+            cfg.train.hooks = [{
+                'type': 'CheckpointHook',
+                'interval': 1
+            }, {
+                'type': 'TextLoggerHook',
+                'interval': 1
+            }, {
+                'type': 'IterTimerHook'
+            }]
+            return cfg
+
+        # load dataset
+        ds = MsDataset.load('dureader-retrieval-ranking', 'zyznull')
+        train_ds = ds['train'].to_hf_dataset().shard(1000, index=0)
+        dev_ds = ds['dev'].to_hf_dataset()
+        model_id = 'damo/nlp_corom_sentence-embedding_chinese-base'
+        self.finetune(
+            model_id=model_id,
+            train_dataset=train_ds,
+            eval_dataset=dev_ds,
+            cfg_modify_fn=cfg_modify_fn)
+
+    def pipeline_sentence_embedding(self, model_dir):
+        model = Model.from_pretrained(model_dir)
+        pipeline_ins = pipeline(task=Tasks.sentence_embedding, model=model)
+        print('inputs', self.inputs)
+        print(pipeline_ins(input=self.inputs))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py
index f5632b63..797351aa 100644
--- a/tests/trainers/test_finetune_sequence_classification.py
+++ b/tests/trainers/test_finetune_sequence_classification.py
@@ -8,12 +8,13 @@ from modelscope.metainfo import Preprocessors, Trainers
 from modelscope.models import Model
 from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
-from modelscope.trainers import NlpTrainerArguments, build_trainer
+from modelscope.trainers import build_trainer
 from modelscope.trainers.hooks import Hook
 from modelscope.trainers.nlp_trainer import (EpochBasedTrainer,
                                              NlpEpochBasedTrainer)
 from modelscope.trainers.optimizer.child_tuning_adamw_optimizer import \
     calculate_fisher
+from modelscope.trainers.training_args import TrainingArgs
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.data_utils import to_device
 from modelscope.utils.regress_test_utils import (MsRegressTool,
@@ -43,7 +44,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
         dataset = MsDataset.load('clue', subset_name='tnews')
         train_dataset = dataset['train']
         validation_dataset = dataset['validation']
-        cfg_modify_fn = NlpTrainerArguments(
+        cfg_modify_fn = TrainingArgs(
             task=Tasks.text_classification,
             preprocessor_type=Preprocessors.sen_cls_tokenizer,
             train_first_sequence='sentence',
diff --git a/tests/trainers/test_image_defrcn_fewshot_trainer.py b/tests/trainers/test_image_defrcn_fewshot_trainer.py
index d007e23c..c981e42c 100644
--- a/tests/trainers/test_image_defrcn_fewshot_trainer.py
+++ b/tests/trainers/test_image_defrcn_fewshot_trainer.py
@@ -46,17 +46,35 @@ class TestImageDefrcnFewShotTrainer(unittest.TestCase):
     def test_trainer(self):
 
         split = 1
-        kwargs = dict(
-            model=self.model_id,
-            data_dir=self.data_dir,
-            work_dir=self.tmp_dir,
-            model_weights=os.path.join(get_cache_dir(), self.model_id,
-                                       'ImageNetPretrained/MSRA/R-101.pkl'),
-            data_type='pascal_voc',
-            config_path='defrcn_det_r101_base{}.yaml'.format(split),
-            datasets_train=('voc_2007_trainval_base{}'.format(split),
-                            'voc_2012_trainval_base{}'.format(split)),
-            datasets_test=('voc_2007_test_base{}'.format(split), ))
+
+        def base_cfg_modify_fn(cfg):
+            cfg.train.work_dir = self.tmp_dir
+
+            cfg.model.roi_heads.backward_scale = 0.75
+            cfg.model.roi_heads.num_classes = 15
+            cfg.model.roi_heads.freeze_feat = False
+            cfg.model.roi_heads.cls_dropout = False
+            cfg.model.weights = os.path.join(
+                get_cache_dir(), self.model_id,
+                'ImageNetPretrained/MSRA/R-101.pkl')
+
+            cfg.datasets.root = self.data_dir
+            cfg.datasets.type = 'pascal_voc'
+            cfg.datasets.train = [
+                'voc_2007_trainval_base{}'.format(split),
+                'voc_2012_trainval_base{}'.format(split)
+            ]
+            cfg.datasets.test = ['voc_2007_test_base{}'.format(split)]
+            cfg.input.min_size_test = 50
+            cfg.train.dataloader.ims_per_batch = 4
+            cfg.train.max_iter = 300
+            cfg.train.optimizer.lr = 0.001
+            cfg.train.lr_scheduler.warmup_iters = 100
+
+            cfg.test.pcb_enable = False
+            return cfg
+
+        kwargs = dict(model=self.model_id, cfg_modify_fn=base_cfg_modify_fn)
         trainer = build_trainer(
             name=Trainers.image_fewshot_detection, default_args=kwargs)
         trainer.train()
diff --git a/tests/trainers/test_tinynas_damoyolo_trainer.py b/tests/trainers/test_tinynas_damoyolo_trainer.py
index cf7760d2..4b110c5d 100644
--- a/tests/trainers/test_tinynas_damoyolo_trainer.py
+++ b/tests/trainers/test_tinynas_damoyolo_trainer.py
@@ -9,7 +9,6 @@ import torch
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Trainers
-from modelscope.msdatasets import MsDataset
 from modelscope.trainers import build_trainer
 from modelscope.utils.config import Config
 from modelscope.utils.constant import ModelFile
@@ -51,8 +50,8 @@ class TestTinynasDamoyoloTrainerSingleGPU(unittest.TestCase):
             name=Trainers.tinynas_damoyolo, default_args=kwargs)
         trainer.train()
         trainer.evaluate(
-            checkpoint_path=os.path.join(self.cache_path,
-                                         'damoyolo_tinynasL25_S.pt'))
+            checkpoint_path=os.path.join('./workdirs/damoyolo_s',
+                                         'epoch_3_ckpt.pth'))
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_trainer_from_scratch_singleGPU_model_id(self):
diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py
index c692196a..1fb915c6 100644
--- a/tests/trainers/test_trainer.py
+++ b/tests/trainers/test_trainer.py
@@ -16,7 +16,7 @@ from torch.utils.data import IterableDataset
 
 from modelscope.metainfo import Metrics, Trainers
 from modelscope.metrics.builder import MetricKeys
-from modelscope.models.base import Model
+from modelscope.models.base import TorchModel
 from modelscope.trainers import build_trainer
 from modelscope.trainers.base import DummyTrainer
 from modelscope.trainers.builder import TRAINERS
@@ -41,7 +41,7 @@ dummy_dataset_big = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 40)
 
 
-class DummyModel(nn.Module, Model):
+class DummyModel(TorchModel):
 
     def __init__(self):
         super().__init__()
@@ -268,6 +268,9 @@ class TrainerTest(unittest.TestCase):
                 }, {
                     'type': 'EvaluationHook',
                     'interval': 1
+                }, {
+                    'type': 'TensorboardHook',
+                    'interval': 1
                 }]
             },
             'evaluation': {
diff --git a/tests/trainers/test_trainer_gpu.py b/tests/trainers/test_trainer_gpu.py
index ca0a15f0..1d3df533 100644
--- a/tests/trainers/test_trainer_gpu.py
+++ b/tests/trainers/test_trainer_gpu.py
@@ -15,7 +15,7 @@ from torch.utils.data import IterableDataset
 
 from modelscope.metainfo import Metrics, Trainers
 from modelscope.metrics.builder import MetricKeys
-from modelscope.models.base import Model
+from modelscope.models.base import Model, TorchModel
 from modelscope.trainers import EpochBasedTrainer, build_trainer
 from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile, Tasks
 from modelscope.utils.test_utils import (DistributedTestCase,
@@ -38,7 +38,7 @@ dummy_dataset_big = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 40)
 
 
-class DummyModel(nn.Module, Model):
+class DummyModel(TorchModel):
 
     def __init__(self):
         super().__init__()
diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py
index 5e9850a7..24672cf4 100644
--- a/tests/trainers/test_trainer_with_nlp.py
+++ b/tests/trainers/test_trainer_with_nlp.py
@@ -4,6 +4,8 @@ import shutil
 import tempfile
 import unittest
 
+import numpy as np
+
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Metrics
 from modelscope.models.base import Model
@@ -49,7 +51,7 @@ class TestTrainerWithNlp(unittest.TestCase):
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
         for i in range(10):
-            self.assertIn(f'epoch_{i+1}.pth', results_files)
+            self.assertIn(f'epoch_{i + 1}.pth', results_files)
 
         output_files = os.listdir(
             os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR))
@@ -86,7 +88,7 @@ class TestTrainerWithNlp(unittest.TestCase):
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
         for i in range(10):
-            self.assertIn(f'epoch_{i+1}.pth', results_files)
+            self.assertIn(f'epoch_{i + 1}.pth', results_files)
 
         eval_results = trainer.evaluate(
             checkpoint_path=os.path.join(self.tmp_dir, 'epoch_10.pth'))
@@ -113,7 +115,7 @@ class TestTrainerWithNlp(unittest.TestCase):
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
         for i in range(20):
-            self.assertIn(f'epoch_{i+1}.pth', results_files)
+            self.assertIn(f'epoch_{i + 1}.pth', results_files)
 
         eval_results = trainer.evaluate(
             checkpoint_path=os.path.join(self.tmp_dir, 'epoch_10.pth'))
@@ -143,6 +145,7 @@ class TestTrainerWithNlp(unittest.TestCase):
             'by_epoch': False,
             'metric_key': 'accuracy',
             'max_checkpoint_num': 4,
+            'restore_best': True,
         }, {
             'type': 'TextLoggerHook',
             'interval': 1
@@ -190,6 +193,7 @@ class TestTrainerWithNlp(unittest.TestCase):
             trainer.train()
 
         results_files = os.listdir(self.tmp_dir)
+        print(results_files)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
         for i in [22, 24, 26, 28]:
             self.assertTrue(
@@ -197,6 +201,13 @@ class TestTrainerWithNlp(unittest.TestCase):
                     f'accuracy{i}.pth' in filename
                     for filename in results_files
                 ]))
+        self.assertTrue(
+            os.path.isfile(
+                os.path.join(self.tmp_dir, 'output', 'pytorch_model.bin')))
+        self.assertTrue(
+            os.path.isfile(
+                os.path.join(self.tmp_dir, 'output_best',
+                             'pytorch_model.bin')))
 
     @unittest.skip('skip for now before test is re-configured')
     def test_trainer_with_configured_datasets(self):
@@ -227,7 +238,7 @@ class TestTrainerWithNlp(unittest.TestCase):
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
         for i in range(cfg.train.max_epochs):
-            self.assertIn(f'epoch_{i+1}.pth', results_files)
+            self.assertIn(f'epoch_{i + 1}.pth', results_files)
 
         eval_results = trainer.evaluate(
             checkpoint_path=os.path.join(self.tmp_dir, 'epoch_10.pth'))
@@ -278,6 +289,8 @@ class TestTrainerWithNlp(unittest.TestCase):
             class EarlyStopHook(Hook):
                 PRIORITY = Priority.VERY_LOW
 
+                _should_save = False
+
                 def after_iter(self, trainer):
                     if trainer.iter == 3:
                         raise MsRegressTool.EarlyStopError('Test finished.')
@@ -302,6 +315,38 @@ class TestTrainerWithNlp(unittest.TestCase):
                 trainer, 'trainer_continue_train', level='strict'):
             trainer.train(os.path.join(self.tmp_dir, 'iter_3.pth'))
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer_with_new_style_configuration(self):
+        tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(tmp_dir):
+            os.makedirs(tmp_dir)
+
+        def cfg_modify_fn(cfg):
+            cfg.train['checkpoint'] = {
+                # 保存最优metric对应的checkpoint
+                'best': {
+                    # 是否按照epoch进行保存，false为按照iter
+                    'by_epoch': True,
+                    # 保存的间隔
+                    'interval': 2,
+                    # 保存checkpoint数量的最大值
+                    'max_checkpoint_num': 2,
+                    # 根据指定的指标判断当前checkpoint是否为历史最优
+                    'metric_key': 'f1',
+                }
+            }
+            return cfg
+
+        kwargs = dict(
+            model='damo/nlp_structbert_sentence-similarity_chinese-tiny',
+            train_dataset=self.dataset,
+            eval_dataset=self.dataset,
+            cfg_modify_fn=cfg_modify_fn,
+            work_dir=self.tmp_dir)
+
+        trainer = build_trainer(default_args=kwargs)
+        trainer.train()
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_trainer_with_evaluation(self):
         tmp_dir = tempfile.TemporaryDirectory().name
@@ -311,6 +356,92 @@ class TestTrainerWithNlp(unittest.TestCase):
         model_id = 'damo/nlp_structbert_sentence-similarity_chinese-tiny'
         cache_path = snapshot_download(model_id)
         model = SbertForSequenceClassification.from_pretrained(cache_path)
+
+        def cfg_modify_fn(cfg):
+            cfg.preprocessor.val.keep_original_columns = [
+                'sentence1', 'sentence2'
+            ]
+            return cfg
+
+        kwargs = dict(
+            cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
+            model=model,
+            eval_dataset=self.dataset,
+            cfg_modify_fn=cfg_modify_fn,
+            work_dir=self.tmp_dir,
+            remove_unused_data=True)
+
+        trainer = build_trainer(default_args=kwargs)
+
+        def saving_fn(inputs, outputs):
+            with open(f'{tmp_dir}/predicts.txt', 'a') as f:
+                sentence1 = inputs.sentence1
+                sentence2 = inputs.sentence2
+                labels = inputs['labels']
+                predictions = np.argmax(
+                    outputs['logits'].cpu().numpy(), axis=1)
+                labels = labels.cpu().numpy()
+                for sent1, sent2, pred, label in zip(sentence1, sentence2,
+                                                     predictions, labels):
+                    f.writelines(f'{sent1}, {sent2}, {pred}, {label}\n')
+
+        print(
+            trainer.evaluate(
+                cache_path + '/pytorch_model.bin', saving_fn=saving_fn))
+        self.assertTrue(os.path.isfile(f'{tmp_dir}/predicts.txt'))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer_with_prediction(self):
+        tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(tmp_dir):
+            os.makedirs(tmp_dir)
+
+        model_id = 'damo/nlp_structbert_sentence-similarity_chinese-tiny'
+        cache_path = snapshot_download(model_id)
+        model = SbertForSequenceClassification.from_pretrained(cache_path)
+
+        def cfg_modify_fn(cfg):
+            cfg.preprocessor.val.keep_original_columns = [
+                'sentence1', 'sentence2'
+            ]
+            return cfg
+
+        kwargs = dict(
+            cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
+            model=model,
+            eval_dataset=self.dataset,
+            cfg_modify_fn=cfg_modify_fn,
+            work_dir=self.tmp_dir,
+            remove_unused_data=True)
+
+        trainer = build_trainer(default_args=kwargs)
+
+        def saving_fn(inputs, outputs):
+            with open(f'{tmp_dir}/predicts.txt', 'a') as f:
+                sentence1 = inputs.sentence1
+                sentence2 = inputs.sentence2
+                predictions = np.argmax(
+                    outputs['logits'].cpu().numpy(), axis=1)
+                for sent1, sent2, pred in zip(sentence1, sentence2,
+                                              predictions):
+                    f.writelines(f'{sent1}, {sent2}, {pred}\n')
+
+        trainer.predict(
+            predict_datasets=self.dataset,
+            saving_fn=saving_fn,
+            checkpoint_path=cache_path + '/pytorch_model.bin')
+        self.assertTrue(os.path.isfile(f'{tmp_dir}/predicts.txt'))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer_with_prediction_msdataset(self):
+        tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(tmp_dir):
+            os.makedirs(tmp_dir)
+
+        model_id = 'damo/nlp_structbert_sentence-similarity_chinese-tiny'
+        cache_path = snapshot_download(model_id)
+        model = SbertForSequenceClassification.from_pretrained(cache_path)
+
         kwargs = dict(
             cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
             model=model,
@@ -318,7 +449,21 @@ class TestTrainerWithNlp(unittest.TestCase):
             work_dir=self.tmp_dir)
 
         trainer = build_trainer(default_args=kwargs)
-        print(trainer.evaluate(cache_path + '/pytorch_model.bin'))
+
+        def saving_fn(inputs, outputs):
+            with open(f'{tmp_dir}/predicts.txt', 'a') as f:
+                predictions = np.argmax(
+                    outputs['logits'].cpu().numpy(), axis=1)
+                for pred in predictions:
+                    f.writelines(f'{pred}\n')
+
+        dataset = MsDataset.load('afqmc_small', split='train')
+
+        trainer.predict(
+            predict_datasets=dataset,
+            saving_fn=saving_fn,
+            checkpoint_path=cache_path + '/pytorch_model.bin')
+        self.assertTrue(os.path.isfile(f'{tmp_dir}/predicts.txt'))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_trainer_with_model_and_args(self):
@@ -342,7 +487,7 @@ class TestTrainerWithNlp(unittest.TestCase):
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
         for i in range(2):
-            self.assertIn(f'epoch_{i+1}.pth', results_files)
+            self.assertIn(f'epoch_{i + 1}.pth', results_files)
 
 
 if __name__ == '__main__':
diff --git a/tests/trainers/test_training_args.py b/tests/trainers/test_training_args.py
index 0aad9ddc..6e4d306e 100644
--- a/tests/trainers/test_training_args.py
+++ b/tests/trainers/test_training_args.py
@@ -1,17 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import glob
-import os
-import shutil
-import tempfile
 import unittest
 
-import cv2
-import json
-import numpy as np
-import torch
-
-from modelscope.trainers.training_args import (ArgAttr, CliArgumentParser,
-                                               training_args)
+from modelscope.trainers.default_config import DEFAULT_CONFIG
+from modelscope.trainers.training_args import CliArgumentParser, TrainingArgs
 from modelscope.utils.test_utils import test_level
 
 
@@ -25,54 +16,32 @@ class TrainingArgsTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_define_args(self):
-        myparser = CliArgumentParser(training_args)
+        myparser = CliArgumentParser(TrainingArgs())
         input_args = [
-            '--max_epochs', '100', '--work_dir', 'ddddd', '--train_batch_size',
-            '8', '--unkown', 'unkown'
+            '--max_epochs', '100', '--work_dir', 'ddddd',
+            '--per_device_train_batch_size', '8', '--unkown', 'unkown'
         ]
         args, remainning = myparser.parse_known_args(input_args)
         myparser.print_help()
         self.assertTrue(args.max_epochs == 100)
         self.assertTrue(args.work_dir == 'ddddd')
-        self.assertTrue(args.train_batch_size == 8)
+        self.assertTrue(args.per_device_train_batch_size == 8)
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_new_args(self):
-        training_args.num_classes = ArgAttr(
-            'model.mm_model.head.num_classes',
-            type=int,
-            help='number of classes')
-        training_args.mean = ArgAttr(
-            'train.data.mean', help='3-dim mean vector')
-        training_args.flip = ArgAttr('train.data.flip', help='flip or not')
-        training_args.img_size = ArgAttr(
-            'train.data.img_size', help='image size')
-        myparser = CliArgumentParser(training_args)
+    def test_flatten_args(self):
+        cfg = DEFAULT_CONFIG
         input_args = [
-            '--max_epochs', '100', '--work_dir', 'ddddd', '--train_batch_size',
-            '8', '--num_classes', '10', '--mean', '[125.0,125.0,125.0]',
-            '--flip', 'false', '--img_size', '(640,640)'
+            '--optimizer_params',
+            'weight_decay=0.8,eps=1e-6,correct_bias=False',
+            '--lr_scheduler_params', 'initial_lr=3e-5,niter_decay=1'
         ]
-        args, remainning = myparser.parse_known_args(input_args)
-        myparser.print_help()
-        self.assertTrue(args.max_epochs == 100)
-        self.assertTrue(args.work_dir == 'ddddd')
-        self.assertTrue(args.train_batch_size == 8)
-        self.assertTrue(args.num_classes == 10)
-        self.assertTrue(len(args.mean) == 3)
-        self.assertTrue(not args.flip)
-        self.assertAlmostEqual(args.mean[0], 125.0)
-        self.assertAlmostEqual(args.img_size, (640, 640))
-
-        cfg_dict = myparser.get_cfg_dict(args=input_args)
-        self.assertTrue(cfg_dict['model.mm_model.head.num_classes'] == 10)
-        self.assertAlmostEqual(cfg_dict['train.data.mean'],
-                               [125.0, 125.0, 125.0])
-        self.assertTrue(not cfg_dict['train.data.flip'])
-        self.assertEqual(cfg_dict['train.dataloader.batch_size_per_gpu'], 8)
-        self.assertEqual(cfg_dict['train.work_dir'], 'ddddd')
-        self.assertEqual(cfg_dict['train.max_epochs'], 100)
-        self.assertEqual(cfg_dict['train.data.img_size'], (640, 640))
+        training_args = TrainingArgs.from_cli(input_args)
+        cfg = training_args(cfg)
+        self.assertAlmostEqual(cfg.train.optimizer.weight_decay, 0.8)
+        self.assertAlmostEqual(cfg.train.optimizer.eps, 1e-6)
+        self.assertFalse(cfg.train.optimizer.correct_bias)
+        self.assertAlmostEqual(cfg.train.lr_scheduler.initial_lr, 3e-5)
+        self.assertEqual(cfg.train.lr_scheduler.niter_decay, 1)
 
 
 if __name__ == '__main__':
diff --git a/tests/utils/case_file_analyzer.py b/tests/utils/case_file_analyzer.py
new file mode 100644
index 00000000..64707340
--- /dev/null
+++ b/tests/utils/case_file_analyzer.py
@@ -0,0 +1,415 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from __future__ import print_function
+import ast
+import os
+from typing import Any
+
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+SYSTEM_TRAINER_BUILDER_FINCTION_NAME = 'build_trainer'
+SYSTEM_TRAINER_BUILDER_PARAMETER_NAME = 'name'
+SYSTEM_PIPELINE_BUILDER_FUNCTION_NAME = 'pipeline'
+SYSTEM_PIPELINE_BUILDER_PARAMETER_NAME = 'task'
+
+
+class AnalysisTestFile(ast.NodeVisitor):
+    """Analysis test suite files.
+       Get global function and test class
+
+    Args:
+        ast (NodeVisitor): The ast node.
+    Examples:
+        >>> with open(test_suite_file, "rb") as f:
+        >>>     src = f.read()
+        >>> analyzer = AnalysisTestFile(test_suite_file)
+        >>> analyzer.visit(ast.parse(src, filename=test_suite_file))
+    """
+
+    def __init__(self, test_suite_file, builder_function_name) -> None:
+        super().__init__()
+        self.test_classes = []
+        self.builder_function_name = builder_function_name
+        self.global_functions = []
+        self.custom_global_builders = [
+        ]  # global trainer builder method(call build_trainer)
+        self.custom_global_builder_calls = []  # the builder call statement
+
+    def visit_ClassDef(self, node) -> bool:
+        """Check if the class is a unittest suite.
+
+        Args:
+            node (ast.Node): the ast node
+
+        Returns: True if is a test class.
+        """
+        for base in node.bases:
+            if isinstance(base, ast.Attribute) and base.attr == 'TestCase':
+                self.test_classes.append(node)
+            elif isinstance(base, ast.Name) and 'TestCase' in base.id:
+                self.test_classes.append(node)
+
+    def visit_FunctionDef(self, node: ast.FunctionDef):
+        self.global_functions.append(node)
+        for statement in ast.walk(node):
+            if isinstance(statement, ast.Call) and \
+               isinstance(statement.func, ast.Name):
+                if statement.func.id == self.builder_function_name:
+                    self.custom_global_builders.append(node)
+                    self.custom_global_builder_calls.append(statement)
+
+
+class AnalysisTestClass(ast.NodeVisitor):
+
+    def __init__(self, test_class_node, builder_function_name) -> None:
+        super().__init__()
+        self.test_class_node = test_class_node
+        self.builder_function_name = builder_function_name
+        self.setup_variables = {}
+        self.test_methods = []
+        self.custom_class_method_builders = [
+        ]  # class method trainer builder(call build_trainer)
+        self.custom_class_method_builder_calls = [
+        ]  # the builder call statement
+
+    def visit_FunctionDef(self, node: ast.FunctionDef) -> Any:
+        if node.name.startswith('setUp'):
+            for statement in node.body:
+                if isinstance(statement, ast.Assign):
+                    if len(statement.targets) == 1 and \
+                       isinstance(statement.targets[0], ast.Attribute) and \
+                       isinstance(statement.value, ast.Attribute):
+                        self.setup_variables[str(
+                            statement.targets[0].attr)] = str(
+                                statement.value.attr)
+        elif node.name.startswith('test_'):
+            self.test_methods.append(node)
+        else:
+            for statement in ast.walk(node):
+                if isinstance(statement, ast.Call) and \
+                   isinstance(statement.func, ast.Name):
+                    if statement.func.id == self.builder_function_name:
+                        self.custom_class_method_builders.append(node)
+                        self.custom_class_method_builder_calls.append(
+                            statement)
+
+
+def get_local_arg_value(target_method, args_name):
+    for statement in target_method.body:
+        if isinstance(statement, ast.Assign):
+            for target in statement.targets:
+                if isinstance(target, ast.Name) and target.id == args_name:
+                    if isinstance(statement.value, ast.Attribute):
+                        return statement.value.attr
+                    elif isinstance(statement.value, ast.Str):
+                        return statement.value.s
+    return None
+
+
+def get_custom_builder_parameter_name(args, keywords, builder, builder_call,
+                                      builder_arg_name):
+    # get build_trainer call name argument name.
+    arg_name = None
+    if len(builder_call.args) > 0:
+        if isinstance(builder_call.args[0], ast.Name):
+            # build_trainer name is a variable
+            arg_name = builder_call.args[0].id
+        elif isinstance(builder_call.args[0], ast.Attribute):
+            # Attribute access, such as Trainers.image_classification_team
+            return builder_call.args[0].attr
+        else:
+            raise Exception('Invalid argument name')
+    else:
+        use_default_name = True
+        for kw in builder_call.keywords:
+            if kw.arg == builder_arg_name:
+                use_default_name = False
+                if isinstance(kw.value, ast.Attribute):
+                    return kw.value.attr
+                elif isinstance(kw.value,
+                                ast.Name) and kw.arg == builder_arg_name:
+                    arg_name = kw.value.id
+                else:
+                    raise Exception('Invalid keyword argument')
+        if use_default_name:
+            return 'default'
+
+    if arg_name is None:
+        raise Exception('Invalid build_trainer call')
+
+    arg_value = get_local_arg_value(builder, arg_name)
+    if arg_value is not None:  # trainer_name is a local variable
+        return arg_value
+    # get build_trainer name parameter, if it's passed
+    default_name = None
+    arg_idx = 100000
+    for idx, arg in enumerate(builder.args.args):
+        if arg.arg == arg_name:
+            arg_idx = idx
+            if idx >= len(builder.args.args) - len(builder.args.defaults):
+                default_name = builder.args.defaults[idx - (
+                    len(builder.args.args) - len(builder.args.defaults))].attr
+                break
+    if len(builder.args.args
+           ) > 0 and builder.args.args[0].arg == 'self':  # class method
+        if len(args) > arg_idx - 1:  # - self
+            if isinstance(args[arg_idx - 1], ast.Attribute):
+                return args[arg_idx - 1].attr
+
+    for keyword in keywords:
+        if keyword.arg == arg_name:
+            if isinstance(keyword.value, ast.Attribute):
+                return keyword.value.attr
+
+    return default_name
+
+
+def get_system_builder_parameter_value(builder_call, test_method,
+                                       setup_attributes,
+                                       builder_parameter_name):
+    if len(builder_call.args) > 0:
+        if isinstance(builder_call.args[0], ast.Name):
+            return get_local_arg_value(test_method, builder_call.args[0].id)
+        elif isinstance(builder_call.args[0], ast.Attribute):
+            if builder_call.args[0].attr in setup_attributes:
+                return setup_attributes[builder_call.args[0].attr]
+            return builder_call.args[0].attr
+        elif isinstance(builder_call.args[0], ast.Str):  # TODO check py38
+            return builder_call.args[0].s
+
+    for kw in builder_call.keywords:
+        if kw.arg == builder_parameter_name:
+            if isinstance(kw.value, ast.Attribute):
+                if kw.value.attr in setup_attributes:
+                    return setup_attributes[kw.value.attr]
+                else:
+                    return kw.value.attr
+            elif isinstance(kw.value,
+                            ast.Name) and kw.arg == builder_parameter_name:
+                return kw.value.id
+
+    return 'default'  # use build_trainer default argument.
+
+
+def get_builder_parameter_value(test_method, setup_variables, builder,
+                                builder_call, system_builder_func_name,
+                                builder_parameter_name):
+    """
+    get target builder parameter name, for tariner we get trainer name, for pipeline we get pipeline task
+    """
+    for node in ast.walk(test_method):
+        if builder is None:  # direct call build_trainer
+            for node in ast.walk(test_method):
+                if (isinstance(node, ast.Call)
+                        and isinstance(node.func, ast.Name)
+                        and node.func.id == system_builder_func_name):
+                    return get_system_builder_parameter_value(
+                        node, test_method, setup_variables,
+                        builder_parameter_name)
+        elif (isinstance(node, ast.Call)
+              and isinstance(node.func, ast.Attribute)
+              and node.func.attr == builder.name):
+            return get_custom_builder_parameter_name(node.args, node.keywords,
+                                                     builder, builder_call,
+                                                     builder_parameter_name)
+        elif (isinstance(node, ast.Expr) and isinstance(node.value, ast.Call)
+              and isinstance(node.value.func, ast.Name)
+              and node.value.func.id == builder.name):
+            return get_custom_builder_parameter_name(node.value.args,
+                                                     node.value.keywords,
+                                                     builder, builder_call,
+                                                     builder_parameter_name)
+        elif (isinstance(node, ast.Expr) and isinstance(node.value, ast.Call)
+              and isinstance(node.value.func, ast.Attribute)
+              and node.value.func.attr == builder.name):
+            # self.class_method_builder
+            return get_custom_builder_parameter_name(node.value.args,
+                                                     node.value.keywords,
+                                                     builder, builder_call,
+                                                     builder_parameter_name)
+        elif isinstance(node, ast.Expr) and isinstance(node.value, ast.Call):
+            for arg in node.value.args:
+                if isinstance(arg, ast.Name) and arg.id == builder.name:
+                    # self.start(train_func, num_gpus=2, **kwargs)
+                    return get_custom_builder_parameter_name(
+                        None, None, builder, builder_call,
+                        builder_parameter_name)
+
+    return None
+
+
+def get_class_constructor(test_method, modified_register_modules, module_name):
+    # module_name 'TRAINERS' | 'PIPELINES'
+    for node in ast.walk(test_method):
+        if isinstance(node, ast.Assign) and isinstance(node.value, ast.Call):
+            # trainer = CsanmtTranslationTrainer(model=model_id)
+            for modified_register_module in modified_register_modules:
+                if isinstance(node.value.func, ast.Name) and \
+                   node.value.func.id == modified_register_module[3] and \
+                   modified_register_module[0] == module_name:
+                    if module_name == 'TRAINERS':
+                        return modified_register_module[2]
+                    elif module_name == 'PIPELINES':
+                        return modified_register_module[1]  # pipeline
+
+    return None
+
+
+def analysis_trainer_test_suite(test_file, modified_register_modules):
+    tested_trainers = []
+    with open(test_file, 'rb') as tsf:
+        src = tsf.read()
+    # get test file global function and test class
+    test_suite_root = ast.parse(src, test_file)
+    test_suite_analyzer = AnalysisTestFile(
+        test_file, SYSTEM_TRAINER_BUILDER_FINCTION_NAME)
+    test_suite_analyzer.visit(test_suite_root)
+
+    for test_class in test_suite_analyzer.test_classes:
+        test_class_analyzer = AnalysisTestClass(
+            test_class, SYSTEM_TRAINER_BUILDER_FINCTION_NAME)
+        test_class_analyzer.visit(test_class)
+        for test_method in test_class_analyzer.test_methods:
+            for idx, custom_global_builder in enumerate(
+                    test_suite_analyzer.custom_global_builders
+            ):  # custom test method is global method
+                trainer_name = get_builder_parameter_value(
+                    test_method, test_class_analyzer.setup_variables,
+                    custom_global_builder,
+                    test_suite_analyzer.custom_global_builder_calls[idx],
+                    SYSTEM_TRAINER_BUILDER_FINCTION_NAME,
+                    SYSTEM_TRAINER_BUILDER_PARAMETER_NAME)
+                if trainer_name is not None:
+                    tested_trainers.append(trainer_name)
+            for idx, custom_class_method_builder in enumerate(
+                    test_class_analyzer.custom_class_method_builders
+            ):  # custom class method builder.
+                trainer_name = get_builder_parameter_value(
+                    test_method, test_class_analyzer.setup_variables,
+                    custom_class_method_builder,
+                    test_class_analyzer.custom_class_method_builder_calls[idx],
+                    SYSTEM_TRAINER_BUILDER_FINCTION_NAME,
+                    SYSTEM_TRAINER_BUILDER_PARAMETER_NAME)
+                if trainer_name is not None:
+                    tested_trainers.append(trainer_name)
+
+            trainer_name = get_builder_parameter_value(
+                test_method, test_class_analyzer.setup_variables, None, None,
+                SYSTEM_TRAINER_BUILDER_FINCTION_NAME,
+                SYSTEM_TRAINER_BUILDER_PARAMETER_NAME
+            )  # direct call the build_trainer
+            if trainer_name is not None:
+                tested_trainers.append(trainer_name)
+
+            if len(tested_trainers
+                   ) == 0:  # suppose no builder call is direct construct.
+                trainer_name = get_class_constructor(
+                    test_method, modified_register_modules, 'TRAINERS')
+                if trainer_name is not None:
+                    tested_trainers.append(trainer_name)
+
+    return tested_trainers
+
+
+def analysis_pipeline_test_suite(test_file, modified_register_modules):
+    tested_tasks = []
+    with open(test_file, 'rb') as tsf:
+        src = tsf.read()
+    # get test file global function and test class
+    test_suite_root = ast.parse(src, test_file)
+    test_suite_analyzer = AnalysisTestFile(
+        test_file, SYSTEM_PIPELINE_BUILDER_FUNCTION_NAME)
+    test_suite_analyzer.visit(test_suite_root)
+
+    for test_class in test_suite_analyzer.test_classes:
+        test_class_analyzer = AnalysisTestClass(
+            test_class, SYSTEM_PIPELINE_BUILDER_FUNCTION_NAME)
+        test_class_analyzer.visit(test_class)
+        for test_method in test_class_analyzer.test_methods:
+            for idx, custom_global_builder in enumerate(
+                    test_suite_analyzer.custom_global_builders
+            ):  # custom test method is global method
+                task_name = get_builder_parameter_value(
+                    test_method, test_class_analyzer.setup_variables,
+                    custom_global_builder,
+                    test_suite_analyzer.custom_global_builder_calls[idx],
+                    SYSTEM_PIPELINE_BUILDER_FUNCTION_NAME,
+                    SYSTEM_PIPELINE_BUILDER_PARAMETER_NAME)
+                if task_name is not None:
+                    tested_tasks.append(task_name)
+            for idx, custom_class_method_builder in enumerate(
+                    test_class_analyzer.custom_class_method_builders
+            ):  # custom class method builder.
+                task_name = get_builder_parameter_value(
+                    test_method, test_class_analyzer.setup_variables,
+                    custom_class_method_builder,
+                    test_class_analyzer.custom_class_method_builder_calls[idx],
+                    SYSTEM_PIPELINE_BUILDER_FUNCTION_NAME,
+                    SYSTEM_PIPELINE_BUILDER_PARAMETER_NAME)
+                if task_name is not None:
+                    tested_tasks.append(task_name)
+
+            task_name = get_builder_parameter_value(
+                test_method, test_class_analyzer.setup_variables, None, None,
+                SYSTEM_PIPELINE_BUILDER_FUNCTION_NAME,
+                SYSTEM_PIPELINE_BUILDER_PARAMETER_NAME
+            )  # direct call the build_trainer
+            if task_name is not None:
+                tested_tasks.append(task_name)
+
+            if len(tested_tasks
+                   ) == 0:  # suppose no builder call is direct construct.
+                task_name = get_class_constructor(test_method,
+                                                  modified_register_modules,
+                                                  'PIPELINES')
+                if task_name is not None:
+                    tested_tasks.append(task_name)
+
+    return tested_tasks
+
+
+def get_pipelines_trainers_test_info(register_modules):
+    all_trainer_cases = [
+        os.path.join(dp, f) for dp, dn, filenames in os.walk(
+            os.path.join(os.getcwd(), 'tests', 'trainers')) for f in filenames
+        if os.path.splitext(f)[1] == '.py'
+    ]
+    trainer_test_info = {}
+    for test_file in all_trainer_cases:
+        tested_trainers = analysis_trainer_test_suite(test_file,
+                                                      register_modules)
+        if len(tested_trainers) == 0:
+            logger.warn('test_suite: %s has no trainer name' % test_file)
+        else:
+            tested_trainers = list(set(tested_trainers))
+            for trainer_name in tested_trainers:
+                if trainer_name not in trainer_test_info:
+                    trainer_test_info[trainer_name] = []
+                trainer_test_info[trainer_name].append(test_file)
+
+    pipeline_test_info = {}
+    all_pipeline_cases = [
+        os.path.join(dp, f) for dp, dn, filenames in os.walk(
+            os.path.join(os.getcwd(), 'tests', 'pipelines')) for f in filenames
+        if os.path.splitext(f)[1] == '.py'
+    ]
+    for test_file in all_pipeline_cases:
+        tested_pipelines = analysis_pipeline_test_suite(
+            test_file, register_modules)
+        if len(tested_pipelines) == 0:
+            logger.warn('test_suite: %s has no pipeline task' % test_file)
+        else:
+            tested_pipelines = list(set(tested_pipelines))
+            for pipeline_task in tested_pipelines:
+                if pipeline_task not in pipeline_test_info:
+                    pipeline_test_info[pipeline_task] = []
+                pipeline_test_info[pipeline_task].append(test_file)
+    return pipeline_test_info, trainer_test_info
+
+
+if __name__ == '__main__':
+    test_file = 'tests/pipelines/test_action_detection.py'
+    tasks = analysis_pipeline_test_suite(test_file, None)
+
+    print(tasks)
diff --git a/tests/utils/source_file_analyzer.py b/tests/utils/source_file_analyzer.py
new file mode 100644
index 00000000..ef31c8aa
--- /dev/null
+++ b/tests/utils/source_file_analyzer.py
@@ -0,0 +1,293 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from __future__ import print_function
+import ast
+import importlib.util
+import os
+import pkgutil
+import site
+import sys
+
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def is_relative_import(path):
+    # from .x import y or from ..x import y
+    return path.startswith('.')
+
+
+def resolve_import(module_name):
+    try:
+        spec = importlib.util.find_spec(module_name)
+        return spec and spec.origin
+    except Exception:
+        return None
+
+
+def convert_to_path(name):
+    if name.startswith('.'):
+        remainder = name.lstrip('.')
+        dot_count = (len(name) - len(remainder))
+        prefix = '../' * (dot_count - 1)
+    else:
+        remainder = name
+        dot_count = 0
+        prefix = ''
+    filename = prefix + os.path.join(*remainder.split('.'))
+    return filename
+
+
+def resolve_relative_import(source_file_path, module_name):
+    current_package = os.path.dirname(source_file_path).replace('/', '.')
+    absolute_name = importlib.util.resolve_name(module_name,
+                                                current_package)  # get
+    return resolve_absolute_import(absolute_name)
+
+
+def onerror(name):
+    logger.error('Importing module %s error!' % name)
+
+
+def resolve_absolute_import(module_name):
+    module_file_path = resolve_import(module_name)
+    if module_file_path is None:
+        # find from base module.
+        parent_module, sub_module = module_name.rsplit('.', 1)
+        if parent_module in sys.modules:
+            if hasattr(sys.modules[parent_module], '_import_structure'):
+                import_structure = sys.modules[parent_module]._import_structure
+                for k, v in import_structure.items():
+                    if sub_module in v:
+                        parent_module = parent_module + '.' + k
+                        break
+            module_file_path = resolve_absolute_import(parent_module)
+            # the parent_module is a package, we need find the module_name's file
+            if os.path.basename(module_file_path) == '__init__.py' and \
+                (os.path.relpath(module_file_path, site.getsitepackages()[0]) != 'modelscope/__init__.py'
+                 or os.path.relpath(module_file_path, os.getcwd()) != 'modelscope/__init__.py'):
+                for _, sub_module_name, _ in pkgutil.walk_packages(
+                    [os.path.dirname(module_file_path)],
+                        parent_module + '.',
+                        onerror=onerror):
+                    try:
+                        module_ = importlib.import_module(sub_module_name)
+                        for k, v in module_.__dict__.items():
+                            if k == sub_module and v.__module__ == module_.__name__:
+                                module_file_path = module_.__file__
+                                break
+                    except ModuleNotFoundError as e:
+                        logger.warn(
+                            'Import error in %s, ModuleNotFoundError: %s' %
+                            (sub_module_name, e))
+                        continue
+                    except Exception as e:
+                        logger.warn('Import error in %s, Exception: %s' %
+                                    (sub_module_name, e))
+                        continue
+            else:
+                return module_file_path
+        else:
+            module_file_path = resolve_absolute_import(parent_module)
+    return module_file_path
+
+
+class AnalysisSourceFileImports(ast.NodeVisitor):
+    """Analysis source file imports
+        List imports of the modelscope.
+    """
+
+    def __init__(self, source_file_path) -> None:
+        super().__init__()
+        self.imports = []
+        self.source_file_path = source_file_path
+
+    def visit_Import(self, node):
+        """Processing import x,y,z or import os.path as osp"""
+        for alias in node.names:
+            if alias.name.startswith('modelscope'):
+                file_path = resolve_absolute_import(alias.name)
+                if file_path.startswith(site.getsitepackages()[0]):
+                    self.imports.append(
+                        os.path.relpath(file_path,
+                                        site.getsitepackages()[0]))
+                else:
+                    self.imports.append(
+                        os.path.relpath(file_path, os.getcwd()))
+
+    def visit_ImportFrom(self, node):
+        # level 0 absolute import such as from os.path import join
+        # level 1 from .x import y
+        # level 2 from ..x import y
+        module_name = '.' * node.level + (node.module or '')
+        for alias in node.names:
+            if alias.name == '*':  # from x import *
+                if is_relative_import(module_name):
+                    # resolve model path.
+                    file_path = resolve_relative_import(
+                        self.source_file_path, module_name)
+                elif module_name.startswith('modelscope'):
+                    file_path = resolve_absolute_import(module_name)
+                else:
+                    file_path = None  # ignore other package.
+            else:
+                if not module_name.endswith('.'):
+                    module_name = module_name + '.'
+                name = module_name + alias.name
+                if is_relative_import(name):
+                    # resolve model path.
+                    file_path = resolve_relative_import(
+                        self.source_file_path, name)
+                elif name.startswith('modelscope'):
+                    file_path = resolve_absolute_import(name)
+                else:
+                    file_path = None  # ignore other package.
+
+            if file_path is not None:
+                if file_path.startswith(site.getsitepackages()[0]):
+                    self.imports.append(
+                        os.path.relpath(file_path,
+                                        site.getsitepackages()[0]))
+                else:
+                    self.imports.append(
+                        os.path.relpath(file_path, os.getcwd()))
+
+
+class AnalysisSourceFileRegisterModules(ast.NodeVisitor):
+    """Get register_module call of the python source file.
+
+
+    Args:
+        ast (NodeVisitor): The ast node.
+
+    Examples:
+        >>> with open(source_file_path, "rb") as f:
+        >>>     src = f.read()
+        >>> analyzer = AnalysisSourceFileRegisterModules(source_file_path)
+        >>> analyzer.visit(ast.parse(src, filename=source_file_path))
+    """
+
+    def __init__(self, source_file_path) -> None:
+        super().__init__()
+        self.source_file_path = source_file_path
+        self.register_modules = []
+
+    def visit_ClassDef(self, node: ast.ClassDef):
+        if len(node.decorator_list) > 0:
+            for dec in node.decorator_list:
+                if isinstance(dec, ast.Call):
+                    target_name = ''
+                    module_name_param = ''
+                    task_param = ''
+                    if isinstance(dec.func, ast.Attribute
+                                  ) and dec.func.attr == 'register_module':
+                        target_name = dec.func.value.id  # MODELS
+                        if len(dec.args) > 0:
+                            if isinstance(dec.args[0], ast.Attribute):
+                                task_param = dec.args[0].attr
+                            elif isinstance(dec.args[0], ast.Constant):
+                                task_param = dec.args[0].value
+                        if len(dec.keywords) > 0:
+                            for kw in dec.keywords:
+                                if kw.arg == 'module_name':
+                                    if isinstance(kw.value, ast.Str):
+                                        module_name_param = kw.value.s
+                                    else:
+                                        module_name_param = kw.value.attr
+                                elif kw.arg == 'group_key':
+                                    if isinstance(kw.value, ast.Str):
+                                        task_param = kw.value.s
+                                    elif isinstance(kw.value, ast.Name):
+                                        task_param = kw.value.id
+                                    else:
+                                        task_param = kw.value.attr
+                        if task_param == '' and module_name_param == '':
+                            logger.warn(
+                                'File %s %s.register_module has no parameters'
+                                % (self.source_file_path, target_name))
+                            continue
+                        if target_name == 'PIPELINES' and task_param == '':
+                            logger.warn(
+                                'File %s %s.register_module has no task_param'
+                                % (self.source_file_path, target_name))
+                        self.register_modules.append(
+                            (target_name, task_param, module_name_param,
+                             node.name))  # PIPELINES, task, module, class_name
+
+
+def get_imported_files(file_path):
+    """Get file dependencies.
+    """
+    if os.path.isabs(file_path):
+        file_path = os.path.relpath(file_path, os.getcwd())
+    with open(file_path, 'rb') as f:
+        src = f.read()
+    analyzer = AnalysisSourceFileImports(file_path)
+    analyzer.visit(ast.parse(src, filename=file_path))
+    return list(set(analyzer.imports))
+
+
+def path_to_module_name(file_path):
+    if os.path.isabs(file_path):
+        file_path = os.path.relpath(file_path, os.getcwd())
+    module_name = os.path.dirname(file_path).replace('/', '.')
+    return module_name
+
+
+def get_file_register_modules(file_path):
+    logger.info('Get file: %s register_module' % file_path)
+    with open(file_path, 'rb') as f:
+        src = f.read()
+    analyzer = AnalysisSourceFileRegisterModules(file_path)
+    analyzer.visit(ast.parse(src, filename=file_path))
+    return analyzer.register_modules
+
+
+def get_import_map():
+    all_files = [
+        os.path.join(dp, f) for dp, dn, filenames in os.walk(
+            os.path.join(os.getcwd(), 'modelscope')) for f in filenames
+        if os.path.splitext(f)[1] == '.py'
+    ]
+    import_map = {}
+    for f in all_files:
+        files = get_imported_files(f)
+        import_map[os.path.relpath(f, os.getcwd())] = files
+
+    return import_map
+
+
+def get_reverse_import_map():
+    all_files = [
+        os.path.join(dp, f) for dp, dn, filenames in os.walk(
+            os.path.join(os.getcwd(), 'modelscope')) for f in filenames
+        if os.path.splitext(f)[1] == '.py'
+    ]
+    import_map = get_import_map()
+
+    reverse_depend_map = {}
+    for f in all_files:
+        depend_by = []
+        for k, v in import_map.items():
+            if f in v and f != k:
+                depend_by.append(k)
+        reverse_depend_map[f] = depend_by
+
+    return reverse_depend_map, import_map
+
+
+def get_all_register_modules():
+    all_files = [
+        os.path.join(dp, f) for dp, dn, filenames in os.walk(
+            os.path.join(os.getcwd(), 'modelscope')) for f in filenames
+        if os.path.splitext(f)[1] == '.py'
+    ]
+    all_register_modules = []
+    for f in all_files:
+        all_register_modules.extend(get_file_register_modules(f))
+    return all_register_modules
+
+
+if __name__ == '__main__':
+    pass
diff --git a/tests/utils/test_ast.py b/tests/utils/test_ast.py
index 5aafdfc7..5d92737d 100644
--- a/tests/utils/test_ast.py
+++ b/tests/utils/test_ast.py
@@ -9,8 +9,9 @@ from pathlib import Path
 
 from modelscope.utils.ast_utils import (FILES_MTIME_KEY, INDEX_KEY, MD5_KEY,
                                         MODELSCOPE_PATH_KEY, REQUIREMENT_KEY,
-                                        VERSION_KEY, AstScaning,
-                                        FilesAstScaning, generate_ast_template,
+                                        VERSION_KEY, AstScanning,
+                                        FilesAstScanning,
+                                        generate_ast_template,
                                         load_from_prebuilt, load_index)
 
 p = Path(__file__)
@@ -32,7 +33,7 @@ class AstScaningTest(unittest.TestCase):
         shutil.rmtree(self.tmp_dir)
 
     def test_ast_scaning_class(self):
-        astScaner = AstScaning()
+        astScaner = AstScanning()
         pipeline_file = os.path.join(MODELSCOPE_PATH, 'pipelines', 'nlp',
                                      'text_generation_pipeline.py')
         output = astScaner.generate_ast(pipeline_file)
@@ -58,7 +59,7 @@ class AstScaningTest(unittest.TestCase):
              ('PIPELINES', 'text2text-generation', 'text2text-generation')])
 
     def test_files_scaning_method(self):
-        fileScaner = FilesAstScaning()
+        fileScaner = FilesAstScanning()
         # case of pass in files directly
         pipeline_file = os.path.join(MODELSCOPE_PATH, 'pipelines', 'nlp',
                                      'text_generation_pipeline.py')
@@ -80,7 +81,7 @@ class AstScaningTest(unittest.TestCase):
         self.assertIsInstance(requirements[index_0], list)
 
     def test_file_mtime_md5_method(self):
-        fileScaner = FilesAstScaning()
+        fileScaner = FilesAstScanning()
         # create first file
         with open(self.test_file, 'w', encoding='utf-8') as f:
             f.write('This is the new test!')
@@ -143,6 +144,8 @@ class AstScaningTest(unittest.TestCase):
         index_from_prebuilt = load_from_prebuilt(file_path)
         self.assertEqual(index, index_from_prebuilt)
 
+    @unittest.skip(
+        'skipped the method for not cpu time on this case not stable')
     def test_update_load_index_method(self):
         file_number = 20
         file_list = []